# 数据预处理: dplyr package sample

library(dplyr)

# Widen the console so wide data-frame previews wrap less.
options(width = 105)

# Load the Chicago data set from an RDS file in the working directory.
# The extension separator must be a dot ("chicago.rds"), not a comma.
chicago <- readRDS("chicago.rds")

# Quick inspection: dimensions, structure, and column names.
dim(chicago)

str(chicago)

names(chicago)


# SELECT
# Preview only the columns from city through dptp (inclusive).
# The column range is an argument of select(), so it goes inside its
# parentheses.
head(select(chicago, city:dptp))

# Preview every column except those from city through dptp.
head(select(chicago, -(city:dptp)))

# Base R equivalent of the exclusion above, without dplyr:
# i <- match("city", names(chicago))
# j <- match("dptp", names(chicago))
# head(chicago[, -(i:j)])


# FILTER: keep only the rows that satisfy the given conditions.

# Single condition: pm25tmean2 above 30.
chic.f <- chicago %>%
  filter(pm25tmean2 > 30)

# Two conditions; comma-separated conditions in filter() are combined
# with a logical AND. This result replaces the previous chic.f.
chic.f <- chicago %>%
  filter(pm25tmean2 > 30, tmpd > 80)

head(chic.f)


# ARRANGE: reorder rows by the date column.

# Ascending order (earliest date first).
chicago <- chicago %>%
  arrange(date)

# Descending order (latest date first); this ordering is the one kept.
chicago <- chicago %>%
  arrange(desc(date))

# Look at both ends of the reordered data.
head(chicago)
tail(chicago)


# RENAME: give two columns shorter names (new_name = old_name).
# The source column is pm25tmean2, the same name used in the filter
# step; a misspelled old name makes rename() fail.
chicago <- rename(chicago, pm25 = pm25tmean2, dewpoint = dptp)


# MUTATE: create a new variable from existing ones.
# pm25detrend is pm25 minus its overall mean; missing values are
# ignored when computing that mean.
chicago <- chicago %>%
  mutate(pm25detrend = pm25 - mean(pm25, na.rm = TRUE))


# GROUP_BY
# Goal: check whether the mean / maximum / median pollutant values
# differ between hot and cold days.

# Label each row "hot" when tmpd > 80 and "cold" otherwise. The column
# must be named tempcat because that is the name group_by() uses below.
# Explicit levels keep the labelling valid even if only one of the two
# classes occurs in the data.
chicago <- mutate(
  chicago,
  tempcat = factor(tmpd > 80, levels = c(FALSE, TRUE), labels = c("cold", "hot"))
)

hotcold <- group_by(chicago, tempcat)

# Without na.rm, the pm25 mean is NA for any group containing missing values.
summarize(hotcold, pm25 = mean(pm25), o3 = max(o3tmean2), no2 = median(no2tmean2))

# Same summary, ignoring missing values in pm25.
summarize(
  hotcold,
  pm25 = mean(pm25, na.rm = TRUE),
  o3 = max(o3tmean2),
  no2 = median(no2tmean2)
)


# SUMMARIZE
# Goal: check whether the pollutant summaries differ from year to year.

# Derive the calendar year: the year field of a POSIXlt value counts
# from 1900, so 1900 is added back.
chicago <- chicago %>%
  mutate(year = as.POSIXlt(date)$year + 1900)

years <- chicago %>%
  group_by(year)

years %>%
  summarize(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2),
    no2 = median(no2tmean2)
  )


# PIPELINE: chain the steps with %>% instead of storing intermediates.
# The mon field of a POSIXlt value is zero-based, hence the + 1.
chicago %>%
  mutate(month = as.POSIXlt(date)$mon + 1) %>%
  group_by(month) %>%
  summarize(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2),
    no2 = median(no2tmean2)
  )

# 你可能感兴趣的:(数据预处理:dplyr package sample)