R数据处理学习

 

 

 

 

 

教程:

1、R for Data Science:  http://r4ds.had.co.nz/introduction.html 

《R数据科学》采用倒金字塔结构:先讲意图和结果呈现,再拆解具体步骤及基础知识

常用包:

(1)tidyverse

library(tidyverse)

tidyverse包涵盖:

可视化ggplot2

数据处理dplyr、tidyr

数据导入readr

函数式编程purrr

其他tibble、stringr、forcats

①dplyr

everything适用于:将部分字段前移

select(flights, time_hour, air_time, everything())

select中常用:

            starts_with("abc")

            ends_with("xyz")

             contains("ijk")

             matches("(.)\\1")

               

rename:重命名

rename(flights, tail_num = tailnum)

 

library(tidyverse)
library(nycflights13)
## 1. filter(): keep only the rows that satisfy every condition
jan1 <- flights %>% filter(month == 1, day == 1)
dec25 <- flights %>% filter(month == 12, day == 25)
# Flights bound for Houston (airport code IAH or HOU)
flights %>% filter(dest %in% c("IAH", "HOU"))

## 2. arrange(): sort rows; NA values are placed last
flights %>% arrange(year, month, day)
flights %>% arrange(desc(dep_delay))

# A tiny tibble with one missing value: the NA row ends up last both in
# ascending and in descending order
df <- tibble(x = c(5, 2, NA))
df %>% arrange(x)
df %>% arrange(desc(x))
    
## 3. select(): pick columns; a leading minus sign excludes columns
flights %>% select(year, month, day)
flights %>% select(-(year:day))
iris %>% select(starts_with("Petal"))
iris %>% select(ends_with("Width"))
# rename(): give a column a new name
flights %>% rename(tail_num = tailnum)
# Move some columns to the front; everything() keeps all remaining columns
flights %>% select(time_hour, air_time, everything())
    
## 4. mutate(): add new variables, appended after the existing columns
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)
flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
# transmute() keeps only the newly created variables
flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )

# 4.1 Cumulative aggregates: cumsum(), cumprod(), cummin(), cummax(), cummean()
x <- 1:10
cumsum(x)
cummean(x)
    
# 5. Ranking helpers. By default NA values take no part in the ranking and
#    stay NA in the result.
y <- c(1, 2, 2, NA, 3, 4)

# Ties share the smallest rank and the following rank is skipped:
# 1 2 2 NA 4 5
y %>% min_rank()

# Same rule on the descending order: 5 3 3 NA 2 1
y %>% desc() %>% min_rank()

# Ties receive distinct consecutive ranks: 1 2 3 NA 4 5
y %>% row_number()

# Ties share a rank and no rank is skipped afterwards: 1 2 2 NA 3 4
y %>% dense_rank()

# min_rank() rescaled to the range [0, 1]: 0.00 0.25 0.25 NA 0.75 1.00
y %>% percent_rank()

# Share of non-missing values less than or equal to each value:
# 0.2 0.6 0.6 NA 0.8 1.0
y %>% cume_dist()
# 6. Aggregation with summarise()
flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE))

# Group by a column first, then aggregate within each group.
# Step-by-step version with named intermediate results:
by_dest <- group_by(flights, dest)
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20 & dest != "HNL")

ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 0.3) +
  geom_smooth(se = FALSE)

# Pipe version: the same steps written left to right and top to bottom, in
# the order one naturally thinks about them.
# na.rm = TRUE drops NA values; without it the aggregate of any group that
# contains an NA would itself be NA.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20 & dest != "HNL")
ggplot(delays, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 0.3) +
  geom_smooth(se = FALSE)
            
            # Mean arrival delay and number of flights per aircraft (tail number),
            # computed only from flights with both delays recorded.
            delay1 <- flights %>%
              filter(!is.na(dep_delay), !is.na(arr_delay)) %>%
              group_by(tailnum) %>%
              summarise(delay = mean(arr_delay, na.rm = TRUE), n = n())

            # Distribution of the per-aircraft mean delay
            ggplot(data = delay1, mapping = aes(x = delay)) +
              geom_freqpoly(binwidth = 10)

            # Flight count against mean delay, restricted to aircraft with more
            # than 25 flights. The filtered rows must reach ggplot() through the
            # pipe: passing `data = delay1` explicitly here would plot the
            # unfiltered table and silently discard the filter() step.
            delay1 %>%
              filter(n > 25) %>%
              ggplot(mapping = aes(x = n, y = delay)) +
              geom_point(alpha = 0.3)
  
            
            # For destinations with more than 365 flights, compute each delayed
            # flight's share of that destination's total arrival delay.
            # ungroup() removes the existing grouping so that later verbs applied
            # to popular_dests are not silently evaluated per destination.
            popular_dests <- flights %>%
              group_by(dest) %>%
              filter(n() > 365) %>%
              filter(arr_delay > 0) %>%
              mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
              select(year:day, dest, arr_delay, prop_delay) %>%
              ungroup()
            

②tibble:打印时只显示前十行,并像str一样显示每列的数据类型;创建时长度为1的输入会被循环补齐

library(tidyverse)

## A tibble prints only its first ten rows and, like str(), shows the type of
## every column; length-1 inputs are recycled when a tibble is built (example 1).
as_tibble(iris)

# Example 1: y = 1 is recycled to the length of x, and z can refer to the
# columns x and y defined just before it
tibble(
  x = 1:5,
  y = 1,
  z = x^2 + y
)

# Example 2: non-syntactic column names are allowed when wrapped in backticks
tb <- tibble(
  `:)` = "smile",
  ` ` = "space",
  `2000` = "number"
)
tb

# tribble(): row-by-row layout; column headers are written as formulas (~name)
tribble(
  ~x, ~y, ~z,
  "a", 2, 3.6,
  "b", 1, 8.5
)

# Example 3: a 1000-row tibble with random date-time, date, integer, double
# and character columns; printing it still shows only the first ten rows
tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)
# Exercises
# 1. How can you tell if an object is a tibble? (Hint: try printing mtcars,
#    which is a regular data frame.)
# For a tibble, the classes reported by str() include tbl_df and tbl in
# addition to data.frame.
str(mpg)

# 2. Compare and contrast the following operations on a data.frame and the
#    equivalent tibble. What is different? Why might the default data frame
#    behaviours cause you frustration?
df <- data.frame(abc = 1, xyz = "a")
df$x                    # partial matching: silently returns the xyz column
df[, "xyz"]             # a single column is dropped to a plain vector
df[, c("abc", "xyz")]   # several columns stay a data frame

df1 <- as_tibble(df)    # as_tibble() replaces the deprecated as.tibble()
df1$x                   # tibbles are stricter: no partial matching
df1[, "xyz"]            # [ on a tibble returns a tibble
df1[, c("abc", "xyz")]

# 3. If you have the name of a variable stored in an object, how can you
#    extract that variable from a tibble?
# $ takes a literal name, so use [[ ]] with the object holding the name.
var <- "manufacturer"
mpg[[var]]
   
      

③readr

  read_csv()逗号分隔

  read_csv2()分号分隔

  read_tsv()  tab分隔

  read_delim()  分隔符分隔

 read_fwf() 固定宽度的文件

   

        # read_csv(): skip = n ignores the first n lines of the input.
        # Each metadata line must be its own line, otherwise skip = 2 would also
        # swallow the "x,y,z" header row.
          read_csv(
            "The first line of metadata\nThe second line of metadata\nx,y,z\n1,2,3",
            skip = 2
          )

        # No header row in the data: col_names = FALSE makes readr generate
        # the column names instead of using the first row
          read_csv("1,2,3\n4,5,6", col_names = FALSE)

        # Supply the column names explicitly
          read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))

        # na = "." declares which string represents a missing value
          read_csv("a,b,c\n1,2,.", na = ".")
          

   

 

 

 

你可能感兴趣的:(R数据处理学习)