R -- dplyr学习

brief

dplyr包主要的五个函数以及工作目标

  • Pick observations by their values (filter()).
  • Reorder the rows (arrange()).
  • Pick variables by their names (select()).
  • Create new variables with functions of existing variables (mutate()).
  • Collapse many values down to a single summary (summarise()).

而且这五个函数的调用形式很相似:第一个参数是数据框(df),随后的参数是用列名构成的表达式,返回值是一个新的数据框,不会修改输入:
[图片:R -- dplyr学习_第1张图片]

filter()函数

# Setup ----
# Deliberately no `rm(list = ls())` here: wiping the caller's workspace is a
# surprising side effect and it does not reset attached packages or options.
# Restart the R session when a clean slate is needed.
library(tidyverse)
# install.packages("nycflights13")
library(nycflights13)

# Load the `flights` data set shipped with nycflights13 and preview its
# first rows.
data(flights)
head(flights)

# filter(): pick observations (rows) by their values

# The data frame is piped in as the first argument; every further argument
# is an expression on its variables.
flights %>% filter(month == 1, day == 1)
# A new data frame is returned; the input itself is not modified.

# At top level R either prints a result or saves it to a variable.
# Wrapping the assignment in parentheses does both at once.
(dec25 <- flights %>% filter(month == 12, day == 25))

# Build the filtering expression with boolean operators:
# & is "and", | is "or", and ! is "not"
flights %>% filter(month == 11 | month == 12)

# Build the expression with the membership operator %in%
flights %>% filter(month %in% c(11, 12))
# is.na() can be combined with other conditions in the same expression
flights %>% filter(is.na(month) | month > 10)

# Build the expression with between():
# between(x, left, right) is a shortcut for x >= left & x <= right
flights %>% filter(between(month, 10, 12))

arrange()函数

# arrange(): reorder the rows

# Sort by year; each additional column breaks ties in the values of the
# columns that precede it.
flights %>% arrange(year, month, day)
# Wrap a column in desc() to sort it in descending order
flights %>% arrange(desc(dep_delay))

select()函数

# select(): pick variables (columns) by their names
# The helper examples below use name fragments that occur in flights columns
# (e.g. dep_delay, arr_delay, air_time) so that each call selects something.

# Select columns by name
select(flights, year, month, day)
# Select all columns between year and day (inclusive)
select(flights, year:day)
# Select all columns except those from year to day (inclusive)
select(flights, -(year:day))
# ! takes the complement of a set of variables
select(flights, !year)
# & and | select the intersection or the union of two sets of variables
select(flights, starts_with("dep") | ends_with("delay"))
select(flights, starts_with("dep") & ends_with("delay"))
# ! can be combined with | and & inside the same selection
select(flights, starts_with("dep") | !ends_with("delay"))
# last_col(): select the last variable, possibly with an offset
select(flights, last_col())
# starts_with(): names that start with a prefix
select(flights, starts_with("dep"))
# ends_with(): names that end with a suffix
select(flights, ends_with("delay"))
# contains(): names that contain a literal string (not a regular expression)
select(flights, contains("time"))
# matches(): names that match a regular expression
select(flights, matches("^(dep|arr)_"))
# num_range("x", 1:3): matches x1, x2 and x3
# NOTE(review): flights does not appear to have columns named like this, so
# this call is expected to select no columns; it only illustrates the syntax.
select(flights, num_range("x", 1:3))

# all_of(): matches variable names held in a character vector.
# All names must be present, otherwise an out-of-bounds error is thrown.
x <- colnames(flights)[2:8]
select(flights, all_of(x))

# any_of(): same as all_of(), except that no error is thrown for names that
# don't exist; the extra, non-existent name below is simply ignored.
x <- colnames(flights)[2:8]
select(flights, any_of(c(x, "no_such_column")))

mutate()函数

# mutate(): create new variables with functions of existing variables
# The functions are not limited to arithmetic: boolean operations, log
# transforms, modular arithmetic, cumulative functions, differences, etc.
# also work.
# mutate() always adds new columns at the end of your dataset
flights %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
# Columns created earlier in the same call can be referenced right away
flights %>%
  mutate(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )

# transmute() keeps only the newly created variables
flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )

summarise()函数

# summarise(): collapse many values down to a single summary

# Ungrouped input: one row holding the mean departure delay over all rows
# (na.rm = TRUE ignores missing delays).
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

# Grouped input: one summary row per (year, month, day) group
by_day <- group_by(flights, year, month, day)
head(by_day)
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and month and summarise() prints a regrouping message.
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE), .groups = "drop")

你可能感兴趣的:(R,统计学,r语言)