16-R-DataCamp-Exploratory-Data-Analysis-in-R

16-R-DataCamp-Exploratory-Data-Analysis-in-R

1. Exploring Categorical Data

1.1 Exploring categorical data (video)
1.2 Bar chart expectations
1.3 Contingency table review

Instruction:

# Print the first rows of the data
comics

# Check levels of align
levels(comics$align)

# Check the levels of gender
levels(comics$gender)

# Create a 2-way contingency table
table(comics$align, comics$gender)
1.4 Dropping levels

Instruction:

# Load dplyr
library(dplyr)

# Print tab
tab

# Remove align level
comics_filtered <- comics %>%
  filter(align != "Reformed Criminals") %>%
  droplevels()

# See the result
comics_filtered
1.5 Side-by-side barcharts

Instruction:

# Load ggplot2
library(ggplot2)

# Create side-by-side barchart of gender by alignment
ggplot(comics, aes(x = align, fill = gender)) + 
  geom_bar(position = "dodge")

# Create side-by-side barchart of alignment by gender
ggplot(comics, aes(x = gender, fill = align)) + 
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90))
1.6 Bar chart interpretation
1.7 Counts vs proportions (video)
1.8 Conditional proportions
1.9 Counts vs proportions (2)

Instruction:

# Plot of gender by align
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar()
  
# Plot proportion of gender, conditional on align
ggplot(comics, aes(x = align, fill = gender)) + 
  geom_bar(position = "fill") +
  ylab("proportion")
1.10 Distribution of one variable (video)
1.11 Marginal barchart

Instruction:

# Change the order of the levels in align
comics$align <- factor(comics$align, 
                       levels = c("Bad", "Neutral", "Good"))

# Create plot of align
ggplot(comics, aes(x = align)) + 
  geom_bar()
1.12 Conditional barchart

Instruction :

# Plot of alignment broken down by gender
ggplot(comics, aes(x = align)) + 
  geom_bar() +
  facet_wrap(~ gender)
1.3 Improve piechart

Instruction:

# Put levels of flavor in descending order
lev <- c("apple", "key lime", "boston creme", "blueberry", "cherry", "pumpkin", "strawberry")
pies$flavor <- factor(pies$flavor, levels = lev)

# Create barchart of flavor
ggplot(pies, aes(x = flavor)) + 
  geom_bar(fill = "chartreuse") + 
  theme(axis.text.x = element_text(angle = 90))

2. Exploring Numerical Data

2.1 Exploring numerical data (video)
2.2 Faceted histogram

Instruction:

# Load package
library(ggplot2)

# Learn data structure
str(cars)

# Create faceted histogram
ggplot(cars, aes(x = city_mpg)) +
  geom_histogram() +
  facet_wrap(~ suv)
2.3 Boxplots and density plots

Instruction:

# Filter cars with 4, 6, 8 cylinders
common_cyl <- cars %>% filter(ncyl %in% c(4,6,8))

# Create box plots of city mpg by ncyl
ggplot(common_cyl, aes(x = as.factor(ncyl), y = city_mpg)) +
  geom_boxplot()

# Create overlaid density plots for same data
ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
  geom_density(alpha = .3)
2.4 Compare distribution via plots
2.5 Distribution of one variable (video)
2.6 Marginal and conditional histograms

Instruction:

# Create hist of horsepwr
cars %>%
  ggplot(aes(x = horsepwr)) +
  geom_histogram() +
  ggtitle("hist of horsepwr")

# Create hist of horsepwr for affordable cars
cars %>% 
  filter(msrp < 25000) %>%
  ggplot(aes(x = horsepwr)) +
  geom_histogram() +
  xlim(c(90, 550)) +
  ggtitle("hist of horsepwr for affordable cars")
2.7 Marginal and conditional histograms interpretation
2.8 Three binwidths

Instruction:

# Create hist of horsepwr with binwidth of 3
cars %>%
  ggplot(aes(x = horsepwr)) +
  geom_histogram(binwidth = 3) +
  ggtitle("hist of horsepwr with binwidth of 3")

# Create hist of horsepwr with binwidth of 30
cars %>%
  ggplot(aes(x = horsepwr)) +
  geom_histogram(binwidth = 30) +
  ggtitle("hist of horsepwr with binwidth of 30")

# Create hist of horsepwr with binwidth of 60
cars %>%
  ggplot(aes(x = horsepwr)) +
  geom_histogram(binwidth = 60) +
  ggtitle("hist of horsepwr with binwidth of 60")
2.9 Three binwidths interpretation
2.10 Box plots (video)
2.11 Box plots for outliers

Instruction 1:

# Construct box plot of msrp
cars %>%
  ggplot(aes(x = 1, y = msrp)) +
  geom_boxplot()

# Exclude outliers from data
cars_no_out <- cars %>%
  filter(msrp < 100000)

# Construct box plot of msrp using the reduced dataset
cars_no_out %>%
  ggplot(aes(x = 1, y = msrp)) +
  geom_boxplot()
2.12 Plot selection

Instruction:

# Create plot of city_mpg
cars %>%
  ggplot(aes(x = 1 ,y = city_mpg)) +
  geom_boxplot()

# Create plot of width
cars %>% 
  ggplot(aes(x = width)) +
  geom_density()
2.13 Visualization in higher dimensions (video)
2.14 3 variable plot

Instruction:

# Facet hists using hwy mileage and ncyl
common_cyl %>%
  ggplot(aes(x = hwy_mpg)) +
  geom_histogram() +
  facet_grid(ncyl ~ suv) +
  ggtitle("Facet hists using hwy mileage and ncyl")
2.15 Interpret 3 var plot

3. Numerical Summaries

3.1 Measures of center (video)
3.2 Choice of center measure
3.3 Calculate center measures

Instruction 1:

# Create dataset of 2007 data
gap2007 <- filter(gapminder, year >= 2007)

# Compute groupwise mean and median lifeExp
gap2007 %>%
  group_by(continent) %>%
  summarize(mean(lifeExp),
            median(lifeExp))

# Generate box plots of lifeExp for each continent
gap2007 %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot()
3.4 Measures of variability (video)
3.5 Choice of spread measure
3.6 Calculate spread measures

Instruction:

# Compute groupwise measures of spread
gap2007 %>%
  group_by(continent) %>%
  summarize(sd(lifeExp),
            IQR(lifeExp),
            n())

# Generate overlaid density plots
gap2007 %>%
  ggplot(aes(x = lifeExp, fill = continent)) +
  geom_density(alpha = 0.3)
3.7 Choose measures for center and spread

Instruction:

# Compute stats for lifeExp in Americas
gap2007 %>%
  filter(continent == "Americas") %>%
  summarize(mean(lifeExp),
            sd(lifeExp))

# Compute stats for population
gap2007 %>%
  summarize(median(pop),
            IQR(pop))
3.8 Shape and transformations (video)
3.9 Describe the shape
3.10 Transformations

Instruction:

# Create density plot of old variable
gap2007 %>%
  ggplot(aes(x = pop)) +
  geom_density()

# Transform the skewed pop variable
gap2007 <- gap2007 %>%
  mutate(log_pop = log(pop))

# Create density plot of new variable
gap2007 %>%
  ggplot(aes(x = log_pop)) +
  geom_density()
3.11 Outliers (video)
3.12 Identify outliers

Instruction:

# Filter for Asia, add column indicating outliers
gap_asia <- gap2007 %>%
  filter(continent == "Asia") %>%
  mutate(is_outlier = lifeExp < 50)

# Remove outliers, create box plot of lifeExp
gap_asia %>%
  filter(!is_outlier) %>%
  ggplot(aes(x = 1, y = lifeExp)) +
  geom_boxplot()

4. Case Study

4.1 Introducing the data (video)
4.2 Spam and num char

Instruction:

# Load packages
library(ggplot2)
library(dplyr)
library(openintro)

# Compute summary statistics
email %>%
  group_by(spam) %>%
  summarize(median(num_char),
            IQR(num_char))
  
# Create plot
email %>%
  mutate(log_num_char = log(num_char)) %>%
  ggplot(aes(x = spam, y = log_num_char)) +
  geom_boxplot()
4.3 Spam and num char interpretation
4.4 Spam and!!!

Instruction:

# Compute center and spread for exclaim_mess by spam
email %>%
  group_by(spam) %>%
  summarize(median(exclaim_mess),
            IQR(exclaim_mess))
  
# Create plot for spam and exclaim_mess
email %>%
  mutate(log_exclaim_mess = log(exclaim_mess + 0.01)) %>%
  ggplot(aes(x = log_exclaim_mess)) +
  geom_histogram() +
  facet_wrap(~spam)
4.5 Spam and interpretation
4.6 Check-in 1 (video)
4.7 Collapsing levels

Instruction:

# Create plot of proportion of spam by image
email %>%
  mutate(has_image = image > 0) %>%
  ggplot(aes(x = has_image, fill = spam)) +
  geom_bar(position = "fill")
4.8 Image and spam interpretation
4.9 Data Integrity

Instruction:

# Test if images count as attachments
sum(sum(email$image) == sum(email$attach))
4.10 Answering questions with chains

Instruction:

# Question 1
email %>%
  filter(dollar > 0) %>%
  group_by(spam) %>%
  summarize(median(dollar))

# Question 2
email %>%
  filter(dollar > 10) %>%
  ggplot(aes(x = spam)) +
  geom_bar()
4.11 Check-in 2 (video)
4.12 What’s in a number

Instruction:

# Reorder levels
email$number_reordered <- factor(email$number, levels = c("none", "small", "big"))

# Construct plot of number_reordered
ggplot(email, aes(x = number_reordered)) +
  geom_bar() +
  facet_wrap(~spam)
4.13 What’s in a number interpretation
4.14 Conclusion

你可能感兴趣的:(16-R-DataCamp-Exploratory-Data-Analysis-in-R)