For more details, see the accompanying HTML file.
Once you have loaded your data into R, you will usually want to manipulate it into a tidy data set: variables in the columns, observations in the rows, and only the observations that you want to analyze.
set.seed(13435)
X <- data.frame("var1" = sample(1:5), "var2" = sample(6:10), "var3" = sample(11:15))
X <- X[sample(1:5), ]
X$var2[c(1,3)] <- NA
X
# select the first column
X[ ,1]
X[ ,"var1"]
X$var1
# select both columns and rows
X[1:2, 2]
X[1:2, "var2"]
X[1:2, ]$var2
X[(X$var1 <= 3 & X$var3 > 11), ]
X[(X$var1 <= 3 | X$var3 > 15), ]
X[which(X$var2 > 8), ]
X[X$var2 > 8, ]
Comparing the last two commands, we can see that if a column contains NAs, subsetting with a bare logical condition also returns rows of NAs, so we wrap the condition in which(), which keeps only the rows where the condition is TRUE.
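To see why (using the same X as above), compare the bare comparison with which(): the logical vector contains NAs, while which() returns only the positions where the condition is TRUE.
X$var2 > 8          # logical vector with NA where var2 is NA
which(X$var2 > 8)   # integer positions of the TRUE values only; NAs are dropped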
sort(X$var1)
sort(X$var1, decreasing = TRUE)
# Put NA last
sort(X$var2, na.last = TRUE)
# Put NA first
sort(X$var2, na.last = FALSE)
# NAs are removed by default
sort(X$var2)
# order data frame by variable "var1"
X[order(X$var1), ]
# order data frame by multiple variables
X[order(X$var1, X$var3), ]
Ordering with the plyr package
library(plyr)
arrange(X, var1)         # ascending order
arrange(X, desc(var1))   # descending order
# one way to add a column
X$var4 <- rnorm(5)
X
# another way to add a column
Y <- cbind(X, rnorm(5))
Y
if(!file.exists("./data")) dir.create("./data")
fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/restaurants.csv")
restData <- read.csv("./data/restaurants.csv")
# top 3 rows
head(restData, 3)
# last 3 rows
tail(restData, 3)
summary(restData)
str(restData)
quantile(restData$councilDistrict, na.rm = TRUE)
quantile(restData$councilDistrict, probs = c(0.5, 0.75, 0.9))
# useNA = "ifany" adds the number of NAs to the table; by default NAs are dropped
table(restData$zipCode, useNA = "ifany")
# two-dimensional table
table(restData$councilDistrict, restData$zipCode)
sum(is.na(restData$councilDistrict))
any(is.na(restData$councilDistrict))
all(restData$zipCode > 0)
colSums(is.na(restData))
all(colSums(is.na(restData)) == 0)
table(restData$zipCode %in% c("21212"))
table(restData$zipCode %in% c("21212", "21213"))
head(restData[restData$zipCode %in% c("21212", "21213"), ], 3)
data("UCBAdmissions")
DF <- as.data.frame(UCBAdmissions)
summary(DF)
# each entry is the sum of Freq for that Gender/Admit combination
xt <- xtabs(Freq ~ Gender + Admit, data = DF)
xt
# this is a different table: each entry is the number of rows with that combination
with(DF, table(Gender, Admit))
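As a quick check of the difference, xtabs() with no left-hand side simply counts rows, reproducing the table() result above.
xtabs(~ Gender + Admit, data = DF)   # row counts, same as with(DF, table(Gender, Admit))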
warpbreaks$replicate <- rep(1:9, len = 54)
xt <- xtabs(breaks ~ ., data = warpbreaks)
xt
ftable(xt)
fakeData <- rnorm(1e5)
object.size(fakeData)
print(object.size(fakeData), units = "MB")
Why create new variables? Often the raw data won't contain exactly the values you need, so you add new variables to the data frame.
Common variables to create: indicator variables, categorical versions of quantitative variables (binning), and transformed variables.
Step 1: Getting the data from the website
if(!file.exists("./data")) dir.create("./data")
fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/restaurants.csv")
restData <- read.csv("./data/restaurants.csv")
# creating a sequence with a given step size (by)
s1 <- seq(1, 10, by = 2)
s1
# creating a sequence of a given length
s2 <- seq(1, 10, length = 3)
s2
# creating a sequence along a vector
x <- c(1, 3, 8, 25, 100)
seq(along = x)
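Equivalently, base R provides seq_along() for this common case.
seq_along(x)   # same result as seq(along = x)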
restData$nearMe <- restData$neighborhood %in% c("Roland Park", "Homeland")
table(restData$nearMe)
restData$zipWrong <- ifelse(restData$zipCode < 0, TRUE, FALSE)
table(restData$zipWrong, restData$zipCode<0)
restData$zipGroups <- cut(restData$zipCode, breaks = quantile(restData$zipCode))
table(restData$zipGroups)
table(restData$zipGroups, restData$zipCode)
library(Hmisc)
restData$zipGroups <- cut2(restData$zipCode, g = 4)
table(restData$zipGroups)
restData$zcf <- factor(restData$zipCode)
#restData$zcf
class(restData$zcf)
yesno <- sample(c("yes", "no"), size = 10, replace = TRUE)
yesnofac <- factor(yesno, levels = c("yes", "no"))
yesnofac
relevel(yesnofac, ref = "no")
as.numeric(yesnofac)   # the reference (first) level is coded 1, the other level 2
as.numeric(relevel(yesnofac, ref = "no"))
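One caution when working with a factor like zcf above: to get the numeric values back, convert via character first, since as.numeric() on a factor returns the underlying level codes rather than the original values.
head(as.numeric(as.character(restData$zcf)))   # original zip codes
head(as.numeric(restData$zcf))                 # level indices, not zip codes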
class(restData$zipGroups)
library(Hmisc) # cut2
library(plyr) # mutate
restData2 <- mutate(restData, zipGroups = cut2(zipCode, g = 4))
table(restData2$zipGroups)
Common transforms:
- abs(x): absolute value
- sqrt(x): square root
- ceiling(x): ceiling(3.14) = 4
- floor(x): floor(3.14) = 3
- round(x, digits = n): round(3.1415926, digits = 2) = 3.14
- signif(x, digits = n): signif(3.1415926, digits = 2) = 3.1
- cos(x), sin(x), etc.
- log(x): natural logarithm
- log2(x), log10(x): other common logs
- exp(x): exponentiating x

The goal is tidy data:
- Each variable forms a column
- Each observation forms a row
- Each table/file stores data about one kind of observation.
library(reshape2)
head(mtcars)
mtcars$carname <- rownames(mtcars)
# melt: reshape multiple measure columns into a single value column
carMelt <- melt(mtcars, id = c("carname", "gear", "cyl"), measure.vars = c("mpg", "hp") )
# carMelt: a tall and skinny data set
head(carMelt, 3)
tail(carMelt, 3)
# cast the melted data back into a summary data frame
cylData <- dcast(carMelt, cyl ~ variable)        # default aggregation counts observations per cell
cylData <- dcast(carMelt, cyl ~ variable, mean)  # aggregate with the mean instead
head(InsectSprays)
tapply(InsectSprays$count, InsectSprays$spray, sum)
spIns <- split(InsectSprays$count, InsectSprays$spray)
sprCount <- lapply(spIns, sum)   # lapply returns a list
sprCount
unlist(sprCount)
sprCount <- sapply(spIns, sum)   # sapply simplifies the result to a named vector
sprCount
Summarizing with the plyr package
library(plyr)
## ddply: Split data frame, apply function, and return results in a data frame.
# If Hmisc is loaded (as above), its summarize() masks plyr's and the bare call fails:
#   ddply(InsectSprays, .(spray), summarize, sum = sum(count))
#   Error in .fun(piece, ...) : argument "by" is missing, with no default
# so we call plyr::summarize explicitly:
ddply(InsectSprays, .(spray), plyr::summarize, sum = sum(count))
spraySums <- ddply(InsectSprays, .(spray), plyr::summarise, sum = ave(count, FUN = sum))   # ave() returns a value for every row, so each row carries its group sum
dplyr package – Introduction
The data frame is a key data structure in statistics and in R. The dplyr package is an optimized and distilled version of the plyr package, designed specifically for working with data frames.
dplyr verbs:
- select: return a subset of the columns of a data frame.
- filter: extract a subset of the rows of a data frame based on a logical condition.
- arrange: reorder the rows of a data frame.
- rename: rename variables in a data frame.
- mutate: add new variables/columns or transform existing variables.
- summarize / summarise: generate summary statistics of different variables in the data frame, possibly within strata.
There is also a handy print method that prevents you from printing a lot of data to the console.

dplyr package – Basic tools
library(dplyr)
options(width = 105)
# download data and load data into R
fileUrl <- "https://raw.github.com/DataScienceSpecialization/courses/master/03_GettingData/dplyr/chicago.rds"
if(!file.exists("./data")) dir.create("./data")
download.file(fileUrl, destfile = "./data/chicago.rds", mode = "wb")
chicago <- readRDS("./data/chicago.rds")
dim(chicago)
str(chicago)
names(chicago)
The select command.
head(select(chicago, city:dptp))
head(select(chicago, -(city:dptp)))
i <- match("city", names(chicago))
j <- match("dptp", names(chicago))
head(chicago[, -(i:j)])
The filter command.
chic.f <- filter(chicago, pm25tmean2 > 30)
head(chic.f, 3)
chic.f <- filter(chicago, pm25tmean2 > 30 & tmpd > 80)
head(chic.f, 3)
chicago <- arrange(chicago, date)
head(chicago, 2)
tail(chicago, 2)
The arrange command can also sort in descending order.
chicago <- arrange(chicago, desc(date))
head(chicago, 2)
tail(chicago, 2)
The rename command.
chicago <- rename(chicago, pm25 = pm25tmean2, dewpoint = dptp)
head(chicago, 3)
The mutate command.
chicago <- mutate(chicago, pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
head(select(chicago, pm25, pm25detrend), 3)
chicago <- mutate(chicago, tempcat = factor(1 * (tmpd > 80), labels = c("cold", "hot")))   # two-level factor: cold (tmpd <= 80) vs hot (tmpd > 80)
The summarize command.
hotcold <- group_by(chicago, tempcat)
head(hotcold)
summarize(hotcold, pm25 = mean(pm25), o3 = max(o3tmean2), no2 = median(no2tmean2))   # pm25 is NA because of missing values
summarize(hotcold, pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), no2 = median(no2tmean2))
chicago <- mutate(chicago, year = as.POSIXlt(date)$year+1900)
years <- group_by(chicago, year)
summarize(years, pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), no2 = median(no2tmean2))
The %>% (pipe) operator chains these operations together.
chicago %>% mutate(month = as.POSIXlt(date)$mon + 1) %>% group_by(month) %>% summarize(pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), no2 = median(no2tmean2))
# equivalent to the nested form:
summarize(group_by(mutate(chicago, month = as.POSIXlt(date)$mon + 1), month), pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), no2 = median(no2tmean2))
if(!file.exists("./data")) dir.create("./data")
# download data set
fileUrl1 <- "https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv"
fileUrl2 <- "https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv"
download.file(fileUrl1, destfile = "./data/reviews.csv")
download.file(fileUrl2, destfile = "./data/solution.csv")
# load data set
reviews <- read.csv("./data/reviews.csv")
solutions <- read.csv("./data/solution.csv")
# view data set
head(reviews, 2)
head(solutions, 2)
names(reviews)
names(solutions)
mergeData <- merge(reviews, solutions, by.x = "solution_id", by.y = "id", all = TRUE)
head(mergeData)
intersect(names(solutions), names(reviews))
mergeData2 <- merge(reviews, solutions, all = TRUE)
head(mergeData2, 2)
Joining data frames with the plyr package.
df1 <- data.frame(id = sample(1:10), x = rnorm(10))
df2 <- data.frame(id = sample(1:10), y = rnorm(10))
arrange(join(df1, df2), id)
df3 <- data.frame(id = sample(1:10), z = rnorm(10))
dfList <- list(df1, df2, df3)
join_all(dfList)
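Note that join() and join_all() default to a left join on all common columns; as a sketch, the key and join type can also be given explicitly via plyr's by and type arguments.
join_all(dfList, by = "id", type = "full")   # explicit key and join type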