基础篇----R环境安装及基本语句:
参考:windows 下载 R 和 R studio,以及R语言基本使用_up_xin的博客-CSDN博客 https://blog.csdn.net/weixin_41427508/article/details/128148008?spm=1001.2014.3001.5502
目录
一. 数据整理 -- tidyverse包
1. 数据导入导出及读取
2. 类似数据库操作:mutate 新建变量、filter/select 过滤行/列、summarise+group_by 分组汇总、4种 join 数据合并 (管道操作符 %>%:前一步的结果作为后一步函数的第一个参数)
3. separate/unite 列的分裂/合并,pivot_longer/wider 长宽数据转换
二. 可视化作图
三. 统计应用
# 数据操作
library(tidyverse)
# https://www.tidyverse.org/
# Import CSV data: the file is picked interactively through a file dialog.
# TRUE/FALSE are spelled out because T/F are ordinary variables that can be
# reassigned.
rawdata <- read.table(file.choose(), header = TRUE, sep = ",")

# Quick inspection: first 4 rows, last 10 rows, rows 95-105, and structure.
head(rawdata, n = 4)
tail(rawdata, n = 10)
rawdata[95:105, ]
str(rawdata)

# Alternative readers:
# read.csv(file.choose())
# data.table::fread(file.choose())

# Export CSV data to "test.csv" in the working directory, without row names.
write.table(
  rawdata,
  "test.csv",
  sep = ",",
  row.names = FALSE
)

# Alternative writers:
# write.csv()
# data.table::fwrite()
# Read a single Excel workbook chosen interactively.
library(readxl)
# excel_sheets(file.choose())
datal <- read_excel(file.choose())

# Batch-read every file found in the data folder.
files <- list.files(".\\房地产PB\\")
files
paths <- paste0(".\\房地产PB\\", files)
paths

# Preallocate one slot per file and iterate over `files` with seq_along(),
# so an empty folder results in zero iterations (1:length(x) would yield
# c(1, 0), and paste0() on an empty vector still returns one bogus path).
df <- vector("list", length(files))
for (i in seq_along(files)) {
  datai <- read_excel(paths[i])
  # Tag each table with its file name minus the last five characters
  # (presumably the ".xlsx" extension -- confirm for other file types).
  datai$object <- str_sub(files[i], start = 1, end = -6)
  df[[i]] <- datai
  print(i) # progress indicator
}

# Stack all tables into one data frame.
df_all <- bind_rows(df)
# dplyr: single-table verbs demonstrated on the built-in ToothGrowth data.
library(dplyr)
head(ToothGrowth)
str(ToothGrowth)

# Create new variables and overwrite an existing one.
# seq_len() is used instead of 1:nrow() so a zero-row input gives an empty
# index rather than c(1, 0).
toothgrowth2 <- mutate(
  ToothGrowth,
  len = len^2,                             # overwrite len with its square
  nv = seq_len(nrow(ToothGrowth)),         # running row index
  nv2 = ifelse(nv > median(nv), "H", "L")  # upper / lower half by index
)
head(toothgrowth2)

# Filter rows (observations): both conditions must hold.
toothgrowth3 <- filter(toothgrowth2, nv %in% 1:50, nv2 == "H")
toothgrowth3

# Select columns (variables) by position; selection helpers such as
# starts_with() / ends_with() can be used as well.
toothgrowth4 <- select(toothgrowth3, c(2, 4))
toothgrowth4

# Summaries: overall, grouped by supp, and grouped by dose and supp.
summarise(ToothGrowth, len_max = max(len), len_min = min(len))
summarise(group_by(ToothGrowth, supp), len_max = max(len))
summarise(group_by(ToothGrowth, dose, supp), len_max = max(len))
# Pipe operator: the result of each step becomes the first argument of the
# next one.
library(magrittr)
ToothGrowth %>%
  # row_number() indexes the rows of the data flowing through the pipe,
  # instead of reaching back to the global ToothGrowth object.
  mutate(nv = row_number()) %>%
  filter(nv %in% 1:50) %>%
  select(1:2) %>%
  group_by(supp) %>%
  summarise(len_max = max(len)) %>%
  as.data.frame()
# Joining / merging data frames
library(dplyr)

# Left table: a numeric column plus letters B..E.
df1 <- data.frame(
  c1 = 2:5,
  c2 = LETTERS[2:5]
)
df1

# Right table: letters B, C, T..W plus six random integers from 1..100.
df2 <- data.frame(
  c3 = LETTERS[c(2:3, 20:23)],
  c4 = sample(1:100, size = 6)
)
df2

# Key mapping shared by all joins: df1$c2 is matched against df2$c3.
key_map <- c(c2 = "c3")

# Left join, written both as a plain call and with the pipe.
left_join(df1, df2, by = key_map)
df1 %>%
  left_join(df2, by = key_map)

# Right join
right_join(df1, df2, by = key_map)

# Full join
full_join(df1, df2, by = key_map)

# Inner join
inner_join(df1, df2, by = key_map)
1) 列的分裂/合并
# Splitting and merging columns
library(tidyr)

# Example data: c5 holds "letter-number", c6 holds "letter.number".
df3 <- data.frame(
  c5 = paste(letters[1:3], 1:3, sep = "-"),
  c6 = paste(letters[1:3], 1:3, sep = "."),
  c4 = c("B", "B", "B"),
  c3 = c("H", "M", "L")
)
df3

# Split: c5 is kept next to its parts (remove = FALSE), c6 is dropped
# (remove = TRUE). The separator is a regular expression, so the literal
# dot has to be escaped.
df4 <- df3 %>%
  separate(col = c5, sep = "-", into = c("c7", "c8"), remove = FALSE) %>%
  separate(col = c6, sep = "\\.", into = c("c9", "c10"), remove = TRUE)
df4

# Merge: paste columns back together; `remove` controls whether the input
# columns are kept.
df4 %>%
  unite(col = "c11", c("c7", "c8"), sep = "-", remove = FALSE) %>%
  unite(col = "c12", c("c9", "c10"), sep = ".", remove = TRUE) %>%
  unite(col = "c13", c("c4", "c3"), sep = "", remove = FALSE)
2) 长宽数据转换----画图通常要先转换为长数据
# Converting between wide and long data
library(tidyr)

# Fixed seed so the random example values are reproducible.
set.seed(42)
df5 <- data.frame(
  time = rep(2011:2013, each = 3),
  area = rep(letters[1:3], times = 3),
  pop = sample(100:1000, 9),
  den = round(rnorm(9, mean = 3, sd = 0.1), 2),
  mj = sample(8:12, 9, replace = TRUE)
)
df5

# Wide to long: stack the three measurement columns (positions 3 to 5 of
# df5) into name/value pairs. Naming them keeps the call correct even if
# the column order changes.
df6 <- df5 %>%
  pivot_longer(
    cols = c(pop, den, mj),
    names_to = "varb",
    values_to = "value"
  )
df6

# Long to wide: one column per area/variable combination, one row per time.
df7 <- df6 %>%
  pivot_wider(
    names_from = c(area, varb),
    values_from = value
  )
df7