Select

基本格式：

counties %>%

select(字段)

可以用冒号来选择一个范围内的字段：

counties %>%

select(state, county, population, professional:production)

还可以用 starts_with、ends_with、contains 等函数模糊匹配字段：

counties %>%

select(state, county, population, ends_with("work"))

Filter

基本格式：

counties %>%

filter(条件)

%in% 的使用：

selected_names <- babynames %>%

filter(name %in% c("Steven", "Thomas", "Matthew"))

Arrange

按哪些字段排序，基本格式：

counties %>%

arrange(字段) ---默认为升序

arrange(desc(字段)) ---降序

Mutate

添加新字段，基本格式：

counties %>%

mutate(字段 = xxxxx)

Transmute

选择旧字段且添加新字段，基本格式：

counties %>%

transmute(旧字段, 新字段 = xxxxx)

Rename

字段重命名，基本格式：

counties %>%

rename(新字段名 = 旧字段名)

也可以在select里，选取的时候直接重命名：

counties %>%

select(字段......, 新字段名 = 旧字段名)

混合使用示例

counties %>%

# Select the five columns

select(state, county, population, men, women) %>%

# Add the proportion_men variable

mutate(proportion_men = men/population) %>%

# Filter for population of at least 10,000

filter(population >= 10000) %>%

# Arrange proportion of men in descending order

arrange(desc(proportion_men))

下面开始聚合函数喽！

Count

按字段分组，数每个分组下的个数，基本格式：

counties_selected %>%

count(region, sort = TRUE)

可加入权重wt：按字段1分组，对每个分组下的字段2求和，基本格式：

counties_selected %>%

count(state, wt = citizens, sort = TRUE)

大致相当于（count 的结果列名为 n，且 sort = TRUE 还会按 n 降序排列）：

counties_selected %>%

group_by(state) %>%

summarise(sum(citizens))

Group_by

按字段分组，基本格式：

counties_selected %>%

group_by(字段)

Ungroup

取消分组（一般是为了另外再进行其他的计算），基本格式：

counties_selected %>%

group_by(字段) %>%

计算1 %>%

ungroup() %>%

计算2

例子：

# Count the states with more people in Metro or Nonmetro areas

counties_selected %>%

group_by(state, metro) %>%

summarize(total_pop = sum(population)) %>%

top_n(1, total_pop) %>%

ungroup() %>%

count(metro)

# Find the year each name is most common

babynames %>%

group_by(year) %>%

mutate(year_total = sum(number)) %>%

ungroup() %>%

mutate(fraction = number / year_total) %>%

group_by(name) %>%

top_n(1, fraction)

Summarise

计算字段的聚合函数值，基本格式：

counties_selected %>%

summarize(新字段名1 = min(字段1), 新字段名2 = max(字段2), …… )

例子：

# Add a density column, then sort in descending order

counties_selected %>%

group_by(state) %>%

summarize(total_area = sum(land_area),

total_population = sum(population)) %>%

mutate(density = total_population / total_area) %>%

arrange(desc(density))

# Calculate the average_pop and median_pop columns

counties_selected %>%

group_by(region, state) %>%

summarize(total_pop = sum(population)) %>%

summarize(average_pop = mean(total_pop),

median_pop = median(total_pop))

注意：summarize 每次会去掉最后一层分组（这里是 state），所以第二个 summarize 是按 region 计算的；上一步的计算结果可以通过 %>% 马上给下一步计算哦！

Top_n

按字段2取值最大的n行（存在并列时会全部保留），常配合分组使用。基本格式：

counties_selected %>%

group_by(字段1) %>%

top_n(个数, 字段2)

例子：

counties_selected %>%

group_by(region, state) %>%

# Calculate average income

summarize(average_income = mean(income)) %>%

# Find the highest income state in each region

top_n(1, average_income)

Dplyr笔记