R 中用于操作 Excel 的包比较多,常见的有 readxl、openxlsx、xlsx 等。对于较大的数据集,xlsx 包无论读取还是写入都比较慢;我最常用的是 openxlsx,如果只需要读取,也可以考虑 readxl。
setwd('C:\\Users\\lstid\\Desktop\\test')
library(data.table)
# Read the text file with base R. `encoding = "UTF-8"` marks the strings as
# UTF-8, matching the fread() call below.
df <- read.csv("测试文档.txt", header = FALSE, encoding = "UTF-8")
# Read the same file with data.table::fread(); this overwrites `df`.
df <- fread("测试文档.txt", header = FALSE, encoding = "UTF-8")
setwd('C:\\Users\\lstid\\Desktop\\test')
library(plyr)
# Read the file one line at a time and collect the lines into `result`,
# a data frame with a single column named `line`.
con <- file("测试文档.txt", "r")
collected <- list()
tryCatch(
  {
    repeat {
      # Pass the same encoding on every call so all lines are marked alike.
      line <- readLines(con, n = 1, encoding = "UTF-8")
      if (length(line) == 0) {
        break # end of file
      }
      collected[[length(collected) + 1]] <- line
    }
  },
  # Close the connection even if reading fails part-way through.
  finally = close(con)
)
# Build the data frame once instead of re-binding it on every iteration.
result <- data.frame(line = unlist(collected), stringsAsFactors = FALSE)
library(readxl)
# Load the worksheet named "Sheet1" from demo.xlsx with readxl.
df <- read_xlsx(
  path = "demo.xlsx",
  sheet = "Sheet1"
)
library(openxlsx)
# Pay attention to `colNames`: TRUE and FALSE give different results, because
# it decides whether the first row read becomes the column names. See the help
# page for the other arguments.
df <- read.xlsx(
  "demo.xlsx",
  sheet = "Sheet1",
  startRow = 5,
  colNames = FALSE
)
library(RMySQL)
# Open a connection to the MySQL server.
# NOTE(review): the credentials are hard-coded; consider loading them from
# environment variables or a config file instead.
con <- dbConnect(
  RMySQL::MySQL(),       # driver
  host = "localhost",    # server address
  port = 3306,           # port
  username = "root",     # account name
  password = "yan1224",  # account password
  dbname = "test"        # database name
)

# Chinese text often comes back garbled from MySQL, so the key step is to set
# the character encoding. It must run while the connection is still open.
# gbk works in most cases, but sometimes utf8 is needed; adjust as required.
dbClearResult(dbSendQuery(con, "SET NAMES gbk"))

# Query (read)
select_sql <- "select * from table_name limit 1"
select_res <- dbGetQuery(con, select_sql)

# Write
df <- iris
# The main choice is between `append` and `overwrite`.
dbWriteTable(con, "iris", df, row.names = FALSE, append = TRUE)

# Close the connection once all statements have run.
dbDisconnect(con)
df <- read_xlsx("demo.xlsx", sheet = "Sheet1")
# Show the first rows; `n` sets how many (default is 6).
head(df, n = 3)
# A tibble: 3 x 3
a b c
1 1 s 10
2 2 qe 11
3 3 r 12
df <- read_xlsx("demo.xlsx", sheet = "Sheet1")
# Show the last rows; `n` sets how many (default is 6).
tail(df, n = 5)
# A tibble: 5 x 3
a b c
1 15 r 24
2 16 hy9 25
3 17 s 26
4 18 qe 27
5 19 r 28
# Load the sheet and print its structure (class, dimensions, column types).
df <- read_xlsx(path = "demo.xlsx", sheet = "Sheet1")
str(object = df)
# 结果如下:
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 19 obs. of 3 variables:
$ a: num 1 2 3 4 5 6 7 8 9 10 ...
$ b: chr "s" "qe" "r" "hy6" ...
$ c: num 10 11 12 13 14 15 16 17 18 19 ...
library(readxl)
library(mice)
# Load the sheet and tabulate its missing-value pattern with mice.
df <- read_xlsx(path = "demo.xlsx", sheet = "Sheet1")
md.pattern(df)
# 结果
a b c
19 1 1 1 0
3 1 0 0 2
0 3 3 6
library(dplyr)
# Keep only the rows with four cylinders.
filter(mtcars, cyl == 4)
mpg cyl disp hp drat wt qsec vs am gear carb
1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
4 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
5 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
6 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
7 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
8 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
9 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
10 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
11 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
library(dplyr)
# Keep rows that satisfy both conditions.
mtcars %>%
  filter(cyl == 4 & vs == 0)
mpg cyl disp hp drat wt qsec vs am gear carb
1 26 4 120.3 91 4.43 2.14 16.7 0 1 5 2
library(dplyr)
# Keep rows that satisfy either condition.
mtcars %>%
  filter(cyl == 4 | mpg %in% c(21, 22.8))
mpg cyl disp hp drat wt qsec vs am gear carb
1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
4 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
5 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
6 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
7 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
8 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
9 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
10 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
11 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
12 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
13 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
library(dplyr)
# Select the first row by position.
slice(mtcars, 1)
mpg cyl disp hp drat wt qsec vs am gear carb
1 21 6 160 110 3.9 2.62 16.46 0 1 4 4
# Select the last row; n() is the number of rows.
slice(mtcars, n())
mpg cyl disp hp drat wt qsec vs am gear carb
1 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
library(dplyr)
# Select columns by name.
head(select(mtcars, mpg, cyl))
# Select columns whose names start with "c".
head(select(mtcars, starts_with("c")))
# Select columns whose names end with "t".
head(select(mtcars, ends_with("t")))
drat wt
3.90 2.620
3.90 2.875
3.85 2.320
3.08 3.215
3.15 3.440
2.76 3.460
# Select columns whose names contain "s".
head(select(mtcars, contains("s")))
disp qsec vs
160 16.46 0
160 17.02 0
108 18.61 1
258 19.44 1
360 17.02 0
225 20.22 1
library(dplyr)
# Extract the `vs` column as a plain vector.
pull(mtcars, vs)
[1] 0 0 1 1 0 1 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 1
library(dplyr)
# Rename Sepal.Length; all other columns are kept.
head(rename(iris, 花萼长度 = Sepal.Length))
花萼长度 Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
library(dplyr)
# Rename inside select(); only the selected column is kept.
head(select(iris, 花萼长度 = Sepal.Length))
花萼长度
1 5.1
2 4.9
3 4.7
4 4.6
5 5.0
6 5.4
library(dplyr)
# Sort ascending by Sepal.Length, then by Petal.Length.
iris %>%
  arrange(Sepal.Length, Petal.Length) %>%
  head()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 4.3 3.0 1.1 0.1 setosa
2 4.4 3.0 1.3 0.2 setosa
3 4.4 3.2 1.3 0.2 setosa
4 4.4 2.9 1.4 0.2 setosa
5 4.5 2.3 1.3 0.3 setosa
6 4.6 3.6 1.0 0.2 setosa
library(dplyr)
# Sort by mpg in descending order, breaking ties by disp.
# desc() accepts a single vector, so any column that should be sorted in
# descending order needs its own desc() call.
mtcars %>%
  dplyr::arrange(desc(mpg), disp) %>%
  head()
mpg cyl disp hp drat wt qsec vs am gear carb
1 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
2 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
3 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
4 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
5 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
6 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
library(dplyr)
# Add a constant column.
head(mutate(mtcars, 新列 = 100))
# Add a column computed from an existing one.
head(mutate(mtcars, new_cyl = cyl * 100))
mpg cyl disp hp drat wt qsec vs am gear carb new_cyl
1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 600
2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 600
3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 400
4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 600
5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 800
6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 600
# Compute the new column and keep only that column.
head(transmute(mtcars, new_cyl = cyl * 100))
new_cyl
1 600
2 600
3 400
4 600
5 800
6 600
library(dplyr)
# List the distinct values of cyl.
distinct(mtcars, cyl)
cyl
1 6
2 4
3 8
library(dplyr)
# Keep the first row for each distinct cyl value, with all columns.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
mtcars %>%
  distinct(cyl, .keep_all = TRUE)
mpg cyl disp hp drat wt qsec vs am gear carb
1 21.0 6 160 110 3.90 2.62 16.46 0 1 4 4
2 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
3 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
library(tidyr)
# Small wide-format example table used by the unite()/separate() examples.
data1 <- data.frame(
  id = c("A", "B", "C"),
  time = 1:3,
  lr = c(100, 200, 300),
  ht = c(50, 60, 70)
)
id time lr ht
1 A 1 100 50
2 B 2 200 60
3 C 3 300 70
# Merge id and time into one column, id_time; the separator can be any string.
df4 <- unite(
  data1,
  id_time,
  id, time,
  sep = "-"
)
id_time lr ht
1 A-1 100 50
2 B-2 200 60
3 C-3 300 70
# Split id_time back into id and time at the "-" separator.
df5 <- separate(
  df4,
  col = id_time,
  into = c("id", "time"),
  sep = "-"
)
id time lr ht
1 A 1 100 50
2 B 2 200 60
3 C 3 300 70
gather 函数将宽表中若干列的列名收集为新的一列(用作分类变量),并把这些列原来的值逐一对应地放入另一列。参数 key 指定新分类变量的列名,value 指定新数值变量的列名;在原表的变量名前加 - 表示该变量不参与变形,保持为单独的一列。
library(tidyr)
# Wide-format example table: lr and ht are the columns reshaped below.
data1 <- data.frame(
  id   = c("A", "B", "C"),
  time = 1:3,
  lr   = c(100, 200, 300),
  ht   = c(50, 60, 70)
)
id time lr ht
1 A 1 100 50
2 B 2 200 60
3 C 3 300 70
# Wide to long: every column except id and time is gathered into the
# key column xx (former column name) and the value column yy.
df2 <- gather(
  data1,
  key = xx,
  value = yy,
  -id, -time
)
id time xx yy
1 A 1 lr 100
2 B 2 lr 200
3 C 3 lr 300
4 A 1 ht 50
5 B 2 ht 60
6 C 3 ht 70
spread 函数是 gather 的逆操作,只是变形之后列的顺序可能发生变化(注意与 data1 对比)。
# Long to wide: each distinct value of xx becomes a column filled from yy.
df3 <- spread(
  df2,
  key = xx,
  value = yy
)
id time ht lr
1 A 1 50 100
2 B 2 60 200
3 C 3 70 300
# Add a ranking column with row_number().
# NOTE(review): `diamonds` is presumably the ggplot2 dataset; confirm that
# ggplot2 is attached elsewhere.
diamonds %>%
  select(price) %>%
  mutate(price_rn = row_number(price)) %>%
  head(n = 6)
price price_rn
1 326 1
2 326 2
3 327 3
4 334 4
5 335 5
6 336 6
# Add a ranking column with min_rank().
# NOTE(review): `diamonds` is presumably the ggplot2 dataset; confirm that
# ggplot2 is attached elsewhere.
diamonds %>%
  select(price) %>%
  mutate(price_mrank = min_rank(price)) %>%
  head(n = 6)
price price_mrank
1 326 1
2 326 1
3 327 3
4 334 4
5 335 5
6 336 6
# Add a ranking column with dense_rank().
# NOTE(review): `diamonds` is presumably the ggplot2 dataset; confirm that
# ggplot2 is attached elsewhere.
diamonds %>%
  select(price) %>%
  mutate(price_drank = dense_rank(price)) %>%
  head(n = 6)
price price_drank
1 326 1
2 326 1
3 327 2
4 334 3
5 335 4
6 336 5
# Sort prices ascending, then add lead/lag columns shifted by one and two rows.
diamonds %>%
  select(price) %>%
  arrange(price) %>%
  mutate(
    price_lead1 = lead(price, 1),
    price_lead2 = lead(price, 2),
    price_lag1 = lag(price, 1),
    price_lag2 = lag(price, 2)
  ) %>%
  head(n = 5)
price price_lead1 price_lead2 price_lag1 price_lag2
1 326 326 327 NA NA
2 326 327 334 326 NA
3 327 334 335 326 326
4 334 335 336 327 326
5 335 336 336 334 327
# Cumulative sum, mean, max, min and product of price over the first six rows.
diamonds %>%
  head(n = 6) %>%
  select(price) %>%
  mutate(
    price_cumsum = cumsum(price),
    price_cummean = cummean(price),
    price_cummax = cummax(price),
    price_cummin = cummin(price),
    price_cumprod = cumprod(price)
  )
price price_cumsum price_cummean price_cummax price_cummin
1 326 326 326 326 326
2 326 652 326 326 326
3 327 979 326. 327 326
4 334 1313 328. 334 326
5 335 1648 330. 335 326
6 336 1984 331. 336 326
# One-row summary of the first six prices: first, last and third value,
# row count, and number of distinct values.
diamonds %>%
  select(price) %>%
  head(n = 6) %>%
  summarise(
    price_first = first(price),
    price_last = last(price),
    price_nthprice = nth(price, 3),
    price_n = n(),
    price_disn = n_distinct(price)
  )
price_first price_last price_nthprice price_n price_disn
1 326 336 327 6 5
# Draw 10 rows at random without replacement.
sample_n(mtcars, size = 10)
# Draw 50 rows with replacement (more than the table holds).
sample_n(mtcars, size = 50, replace = TRUE)
# Draw 10 rows, using mpg as the sampling weight.
sample_n(mtcars, size = 10, weight = mpg)
在 R 中,group_by 函数通常与 summarise 函数一起使用,完成分组聚合操作。
library(dplyr)
# Count the number of rows in each cyl group.
summarise(group_by(mtcars, cyl), n())
cyl `n()`
1 4 11
2 6 7
3 8 14
# Per cyl group: the mean of disp and the median of hp.
by_cyl <- group_by(mtcars, cyl)
summarise(by_cyl, mean(disp), median(hp))
cyl `mean(disp)` `median(hp)`
1 4 105. 91
2 6 183. 110
3 8 353. 192.