教程:
1、R for Data Science: http://r4ds.had.co.nz/introduction.html
《R数据科学》采用倒金字塔结构:先讲意图,再讲呈现,最后拆解步骤及基础知识
常用包:
(1)tidyverse
library(tidyverse)
tidyverse包涵盖:
可视化ggplot2
数据处理dplyr、tidyr
数据导入readr
函数式编程purrr
其他tibble、stringr、forcats
①dplyr
everything适用于:将部分字段前移
select(flights, time_hour, air_time, everything())
select中常用:
starts_with("abc")
ends_with("xyz")
contains("ijk")
matches("(.)\\1")
rename:重命名
rename(flights, tail_num = tailnum)
library(tidyverse)
library(nycflights13)
## 1. filter(): keep only the rows that satisfy every condition.
# Rows with month 1 and day 1 (January 1st).
jan1 <- flights %>%
  filter(month == 1, day == 1)
# Rows with month 12 and day 25 (December 25th).
dec25 <- flights %>%
  filter(month == 12, day == 25)
# Flights bound for Houston (destination code IAH or HOU).
flights %>%
  filter(dest %in% c("IAH", "HOU"))
## 2. arrange(): sort rows by the given columns; NA values end up last.
flights %>%
  arrange(year, month, day)
# desc() sorts a column in descending order.
flights %>%
  arrange(desc(dep_delay))
# A small tibble with one missing value shows where NA is placed,
# both for ascending and for descending order.
df <- tibble(x = c(5, 2, NA))
df %>%
  arrange(x)
df %>%
  arrange(desc(x))
## 3. select(): pick columns; a leading minus sign excludes columns.
flights %>%
  select(year, month, day)
flights %>%
  select(-(year:day))
# Selection helpers match columns by the start or end of their names.
iris %>%
  select(starts_with("Petal"))
iris %>%
  select(ends_with("Width"))
# rename(): written as new_name = old_name.
flights %>%
  rename(tail_num = tailnum)
# Move some columns to the front; everything() then adds the remaining ones.
flights %>%
  select(time_hour, air_time, everything())
## 4. mutate(): add new variables, appended after the existing columns.
# Work on a narrower table so the new columns are easy to see.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)
flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
# transmute(): keep only the newly created variables. A new variable can be
# used by later expressions in the same call (gain and hours feed
# gain_per_hour).
flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )
# 4.1 Cumulative aggregates: cumsum(), cumprod(), cummin(), cummax(),
# cummean().
x <- seq_len(10)
# Running total of x.
x %>% cumsum()
# Running mean of x.
x %>% cummean()
# 5. Ranking helpers. By default NA values are not ranked in any of these
# functions (their result stays NA).
y <- c(1, 2, 2, NA, 3, 4)
# Ties share one rank and the following ranks are skipped: 1 2 2 NA 4 5
y %>% min_rank()
# Same rule applied in descending order: 5 3 3 NA 2 1
y %>% desc() %>% min_rank()
# Ties receive distinct consecutive ranks: 1 2 3 NA 4 5
y %>% row_number()
# Ties share one rank and no rank is skipped afterwards: 1 2 2 NA 3 4
y %>% dense_rank()
# Ranks rescaled to the range 0..1, ties share a value:
# 0.00 0.25 0.25 NA 0.75 1.00
y %>% percent_rank()
# Share of the ranked values that are less than or equal to each value:
# 0.2 0.6 0.6 NA 0.8 1.0
y %>% cume_dist()
# 6. summarise(): aggregate a table down to summary values.
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
# Group by the given column first, then aggregate within each group.
# Step-by-step version: every stage is stored in a named intermediate.
by_dest <- group_by(flights, dest)
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
# Keep destinations with more than 20 flights, excluding HNL.
delay <- filter(delay, count > 20 & dest != "HNL")
# Mean arrival delay against mean distance; point size shows flight count.
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 0.3) +
  geom_smooth(se = FALSE)
# Pipe version of the same analysis: the steps read left to right and top to
# bottom, in the order one naturally thinks about them.
# na.rm = TRUE drops NA values before aggregating; without it the aggregate
# functions return NA whenever the input contains an NA.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")
delays %>%
  ggplot(aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 0.3) +
  geom_smooth(se = FALSE)
# Mean arrival delay and number of flights per aircraft (tailnum), computed
# only from rows where both dep_delay and arr_delay are present.
delay1 <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE), n = n())
# Frequency polygon of the per-aircraft mean delay.
ggplot(data = delay1, mapping = aes(x = delay)) +
  geom_freqpoly(binwidth = 10)
# Scatter plot restricted to aircraft with more than 25 flights.
# The filtered rows are piped in as ggplot()'s `data` argument; naming
# data = delay1 here as well would keep the filter from reaching the plot.
delay1 %>%
  filter(n > 25) %>%
  ggplot(mapping = aes(x = n, y = delay)) +
  geom_point(alpha = 0.3)
# Grouped filter and mutate: within each destination, keep the rows only if
# the destination has more than 365 flights, keep positive arrival delays,
# and express each delay as a share of that destination's total.
# The result is still grouped by dest; ungroup() would remove the grouping.
popular_dests <- flights %>%
  group_by(dest) %>%
  filter(n() > 365, arr_delay > 0) %>%
  mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
  select(year:day, dest, arr_delay, prop_delay)
②tibble:打印时只显示前十行,并像str()一样显示各列的数据类型;长度为1的输入会被循环补齐
library(tidyverse)
## A tibble prints only its first ten rows and, much like str(), shows each
## column's type; length-one inputs are recycled (see the examples below).
iris %>% as_tibble()
# (1) y = 1 is recycled to the length of x, and z is computed from the x and
# y defined in the same call.
tibble(x = 1:5, y = 1, z = x^2 + y)
# (2) Column names that are not syntactic R names are allowed when they are
# wrapped in backticks.
tb <- tibble(`:)` = "smile", ` ` = "space", `2000` = "number")
tb
# tribble(): row-by-row layout; column headers are written as ~name.
tribble(
  ~x,  ~y, ~z,
  "a", 2,  3.6,
  "b", 1,  8.5
)
#③
# A 1000-row tibble with date-time, date, integer, double and character
# columns; printing it shows only the first rows together with column types.
tibble(
  a = lubridate::now() + runif(1000) * 86400,
  b = lubridate::today() + runif(1000) * 30,
  c = seq_len(1000),
  d = runif(1000),
  e = sample(letters, 1000, replace = TRUE)
)
# Exercises
# 1. How can you tell if an object is a tibble? (Hint: try printing mtcars,
#    which is a regular data frame.)
str(mpg) # the reported classes include 'tbl_df' and 'tbl' besides 'data.frame'
# 2. Compare and contrast the following operations on a data.frame and the
#    equivalent tibble. What is different? Why might the default data frame
#    behaviours cause you frustration?
df <- data.frame(abc = 1, xyz = "a")
df$x # partial matching returns column xyz; a tibble is stricter (use df1$xyz)
df[, "xyz"]
df[, c("abc", "xyz")]
# as_tibble() replaces the deprecated as.tibble().
df1 <- as_tibble(df)
# 3. If you have the name of a variable stored in an object,
#    e.g. var <- "mpg", how can you extract the reference variable from a
#    tibble?
# `$` only accepts a literal column name, so use [[ ]] with the stored name.
var <- "manufacturer"
mpg[[var]]
③readr
read_csv()逗号分隔
read_csv2()分号分隔
read_tsv() tab分隔
read_delim() 分隔符分隔
read_fwf() 固定宽度的文件
# read_csv(): skip the first N lines. Each metadata line must sit on its own
# line so that skip = 2 leaves "x,y,z" as the header row.
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
# No header row in the data: do not use the first row as column names.
read_csv("1,2,3\n4,5,6", col_names = FALSE)
# Supply the column names explicitly.
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
# Declare which string stands for a missing value.
read_csv("a,b,c\n1,2,.", na = ".")