# R语言数据预处理

# 筛选、排序、分段、拆分、分组、合并、拼接、转置、索引、重塑
# 筛选
# 1 用which来筛选符合条件的值
# Row filtering with which(): which() yields the positions where the
# comparison is TRUE, and those positions are used as the row index.
# Rows of wage2_1 whose `female` value equals 1.
wage2_3 <- wage2_1[which(wage2_1[["female"]] == 1), ]
wage2_3

# Rows of wage2_1 whose `female` value equals 0.
wage2_4 <- wage2_1[which(wage2_1[["female"]] == 0), ]
wage2_4

# 2. Row filtering with subset(): the second argument is the row condition and
# `select = wage:married` keeps the columns from `wage` through `married`.
# The comparison values are written with plain numerics; the typographic
# quotes used previously are not valid R string delimiters.
wage2_5_male <- subset(wage2_1, female == 0, select = wage:married)
wage2_5_male

wage2_6_female <- subset(wage2_1, female == 1, select = wage:married)
wage2_6_female

# 3 用dplyr中的管道操作符 %>% 来筛选符合条件的值

library(dplyr)

# Keep the rows of wage2_1 where female == 0 using a dplyr pipeline.
# The result remains grouped by `female` because of the group_by() step.
wage2_7_male <- wage2_1 %>%
  group_by(female) %>%
  filter(female == 0)

library(haven)

# Read the Stata file with haven::read_dta(). The path uses forward slashes
# because a backslash starts an escape sequence inside an R string (and
# "\U" followed by non-hex characters is a parse error).
# NOTE(review): the directory name "鏌ヨ" looks like mis-decoded text —
# confirm the real folder name on disk.
WAGE1_DTA <- read_dta("C:/Users/13886/Desktop/鏌ヨ/R/WAGE1.DTA.dta")

# Open the imported table in the interactive data viewer.
View(WAGE1_DTA)

# Build a plain data frame from the six columns used in the rest of the
# script. The fourth column is named `tenure`, matching its source column.
wage2 <- data.frame(
  wage = WAGE1_DTA$wage,
  educ = WAGE1_DTA$educ,
  exper = WAGE1_DTA$exper,
  tenure = WAGE1_DTA$tenure,
  female = WAGE1_DTA$female,
  married = WAGE1_DTA$married
)
wage2

library(dplyr)

# Split wage2 into a list of data frames, one per distinct value of `female`.
# separate() (a tidyr function, not loaded here) splits a single character
# column into several columns, which is not a row-wise split; base split()
# does the row-wise partition.
split(wage2, wage2$female)

# Rows of wage2 with female == 0; the result stays grouped by `female`.
wage2_male <- wage2 %>%
  group_by(female) %>%
  filter(female == 0)

# Rows of wage2 with female == 1; the result stays grouped by `female`.
wage2_female <- wage2 %>%
  group_by(female) %>%
  filter(female == 1)

wage2_male
wage2_female

# Natural log applied to every column of wage2 (any zero entry becomes -Inf).
wage3 <- log(wage2)
wage3

# rbind(): stack the rows of wage3 underneath the rows of wage2.
wage4 <- rbind(wage2, wage3)
wage4

# cbind(): place the columns of wage3 to the right of the columns of wage2;
# both inputs carry the same column names, so the names repeat.
wage5 <- cbind(wage2, wage3)
wage5

# From wage2_male, keep rows with 20 < exper < 40. Comma-separated conditions
# in filter() are combined with AND. The result is grouped by `exper`.
wage2_male_1 <- wage2_male %>%
  group_by(exper) %>%
  filter(exper > 20, exper < 40)
wage2_male_1

# From the full wage2 table, keep rows with exper > 20.
a <- wage2 %>%
  group_by(exper) %>%
  filter(exper > 20)
a

# Same selection as wage2_male_1, with the two bounds written in the
# opposite order.
b <- wage2_male %>%
  group_by(exper) %>%
  filter(exper < 40, exper > 20)
b

library(dplyr)

# Build the 12-row demo score table used by the lookup/modify examples.
options(digits = 0)
set.seed(1) # fixed seed so the random scores are reproducible

# The three score columns are drawn in the order Chinese, Math, English,
# each as 12 uniform values on [0, 100].
df <- data.frame(
  ID = 1:12,
  Class = rep(c(1, 2, 3), times = 4),
  Chinese = runif(12, 0, 100),
  Math = runif(12, 0, 100),
  English = runif(12, 0, 100)
)

# Truncate every column to integer, one column at a time.
for (i in seq_along(df)) {
  df[[i]] <- as.integer(df[[i]])
}

# 查找与修改

# 查找

# Lookup examples. Column names are quoted with plain ASCII quotes; the
# typographic quotes used previously are not valid R string delimiters.
df

df[2, ]          # second row
df[, 4]          # fourth column, returned as a vector
df$Chinese       # column by name, returned as a vector
df[3]            # third column, returned as a one-column data frame

df[which(df$ID == 4), ]                  # row(s) whose ID is 4

df[3, 4]                                 # single cell by row/column position
df[3, "Math"]                            # single cell by row position and column name
df[which(df$Chinese == 57), "Math"]      # Math score where Chinese is 57
df[which(df$Class == 2), "English"]      # English scores of class 2

# 修改行或者列

# 修改某一行或列

# Overwrite the whole first row (one value per column, in column order).
df[1, ] <- c(1, 2, 65, 59, 73)

# Overwrite the whole English column (one value per row). The column name is
# quoted with plain ASCII quotes so the line parses.
df[, "English"] <- c(23, 45, 67, 87, 34, 46, 87, 95, 43, 76, 23, 94)

df

# 修改某一个值: 直接将修改后的值赋给上述查询某个值的操作即可
# Raise every Chinese score below 20 up to 20. The column name is quoted with
# plain ASCII quotes so the line parses.
df[which(df$Chinese < 20), "Chinese"] <- 20

df

# 修改行列名

# 可用 rownames() 及 colnames() 得到数据框的行列名,
# rownames(data.frame)[行号] 或 colnames(data.frame)[列号]
# 可得到指定位置的行名或者列名

# 若要修改,直接给该表达式赋值即可

# Read all column names, then just the fourth one.
colnames(df)
colnames(df)[4]

# Rename the fourth column only, then reset the full set of names.
# String literals use plain ASCII quotes; typographic quotes do not parse.
colnames(df)[4] <- "math"
colnames(df) <- c("ID", "Class", "Chinese", "Math", "English")

colnames(df)

# 删除

# 删除行或列,仅需要选出该数据框的部分行或列,然后将其赋给该变量即可,
# 其中在列号或行号前添加 - 表示不选该行或该列
# 在这里,为了方便接下来的操作,我们将选出后的数据框赋给其他变量,
# 要真正实现删除操作,应当将选出后的数据框赋给自己
# Keep only columns 1, 3 and 5 (i.e. drop columns 2 and 4); the result goes
# into a scratch variable so df itself is left untouched.
df.tmp <- df[, c(1, 3, 5)]
df.tmp

# A negative index drops that position: every row except the third.
df.tmp <- df[-3, ]
df.tmp

# 添加

# 添加行: data.frame[新行号, ] <- 行值
# Append a 13th row by assigning to a row index one past the current end.
df[13, ] <- c(13, 2, 62, 19, 38)
df

# An index vector may repeat positions, which repeats rows in the result.
# These two results are only displayed: assigning them back to df would drop
# row 13 and leave 24 rows, and the 13-value column assignments that follow
# in this script need df to have exactly 13 rows.
df[c(1, 1:12), ]

df[rep(1:12, each = 2), ]

# 添加列: data.frame$新列名 <- 列值
# Add a named column; one value is supplied per row of df.
df[["Physics"]] <- c(23, 34, 67, 23, 56, 67, 78, 23, 54, 56, 67, 34, 50)
df

# Assign to column position 7 with the values 1..13.
df[, 7] <- 1:13
df

# mutate() returns a copy with the extra column; df itself is not modified
# because the result is not assigned.
df %>% mutate(Chemistry = Chinese + Math + English + Physics)

# dplyr 包常用函数
library(dplyr)

# Rebuild the 12-row demo score table from scratch for the dplyr examples.
options(digits = 0)
set.seed(1) # fixed seed so the random scores are reproducible

# The three score columns are drawn in the order Chinese, Math, English,
# each as 12 uniform values on [0, 100].
df <- data.frame(
  ID = 1:12,
  Class = rep(c(1, 2, 3), times = 4),
  Chinese = runif(12, 0, 100),
  Math = runif(12, 0, 100),
  English = runif(12, 0, 100)
)

# Truncate every column to integer, one column at a time.
for (i in seq_along(df)) {
  df[[i]] <- as.integer(df[[i]])
}

df

# arrange() 排序

# Sort ascending by Chinese.
df %>% arrange(Chinese)

# Sort by Chinese, breaking ties with Math.
df %>% arrange(Chinese, Math)

# Sort descending by Chinese.
df %>% arrange(desc(Chinese))

# distinct() 函数 去重

# Build a table in which every row of df appears twice in a row.
# seq_len() is used for the row positions so the index is also well-formed
# when df has no rows (1:nrow(df) would give c(1, 0) in that case).
df1 <- df[rep(seq_len(nrow(df)), each = 2), ]
df1

# distinct() keeps one copy of each duplicated row.
df1 <- distinct(df1)
df1

# group_by() 函数分组,summarise() 函数概括
# group_by() 与 summarise() 函数常连用,
# 用于对不同的分组进行操作。在这里再介绍一个管道操作符 "%>%",
# 其作用是把左侧的值发送给右侧的表达式,
# 并作为右侧表达式函数的第一个参数
# One output row per Class, holding that class's highest Chinese score
# in a column named `max`.
df %>%
  group_by(Class) %>%
  summarise(max = max(Chinese))

# filter() 函数 筛选
# 用法: filter(.data, ..., .preserve = FALSE)
# 选出符合条件的行(返回数据框格式)
# Within each Class, keep the row(s) whose Chinese score equals that class's
# maximum; max() is evaluated per group because of group_by().
df %>%
  group_by(Class) %>%
  filter(Chinese == max(Chinese))

# select() 函数 选择

# Keep only the ID column and the three score columns.
df %>% select(ID, Chinese, Math, English)

# rbind() 函数与 cbind() 函数 合并

# A single-row table with the same five columns as df.
df1 <- data.frame(
  ID = 13,
  Class = 2,
  Chinese = 65,
  Math = 26,
  English = 84
)
df1

# Append that row below df; the result is displayed, not stored.
rbind(df, df1)

library(dplyr)

# Rebuild the 12-row demo score table from scratch.
options(digits = 0)
set.seed(1) # fixed seed so the random scores are reproducible

# The three score columns are drawn in the order Chinese, Math, English,
# each as 12 uniform values on [0, 100].
df <- data.frame(
  ID = 1:12,
  Class = rep(c(1, 2, 3), times = 4),
  Chinese = runif(12, 0, 100),
  Math = runif(12, 0, 100),
  English = runif(12, 0, 100)
)

# Truncate every column to integer, one column at a time.
for (i in seq_along(df)) {
  df[[i]] <- as.integer(df[[i]])
}

# dplyr 包常用函数
library(dplyr)

# Rebuild the 12-row demo score table from scratch.
options(digits = 0)
set.seed(1) # fixed seed so the random scores are reproducible

# The three score columns are drawn in the order Chinese, Math, English,
# each as 12 uniform values on [0, 100].
df <- data.frame(
  ID = 1:12,
  Class = rep(c(1, 2, 3), 4),
  Chinese = runif(12, min = 0, max = 100),
  Math = runif(12, min = 0, max = 100),
  English = runif(12, min = 0, max = 100)
)

# Truncate every column to integer. The loop is closed with its own brace
# before df is shown: a bare name inside a for body does not auto-print, and
# an unclosed brace makes the script fail to parse.
for (i in seq_along(df)) {
  df[[i]] <- as.integer(df[[i]])
}

df

# 你可能感兴趣的:(r语言,大数据,数据分析)