Main Contents
- Describe what a data frame is.
- Load external data from a .csv file into a data frame.
- Summarize the contents of a data frame.
- Describe what a factor is.
- Convert between strings and factors.
- Reorder and rename factors.
- Change how character strings are handled in a data frame.
- Format dates.
1. Preparation for a dataframe
1.1 Download file
# Download the joined survey CSV from figshare and save it to the local
# RawData folder.
download.file(
  url = "https://ndownloader.figshare.com/files/2292169",
  destfile = "C:/Users/home/Desktop/Rcourse/DataCarpentry33/RawData/portal_data_joined1.csv"
)
1.2 Load file
# Load the downloaded file in two ways: with read.csv(), and with the more
# general read.table() given an explicit comma separator and a header row.
# Note that read.table() keeps its own defaults for quoting and comment
# characters, so the two results can differ on files containing ' or #.
survey <- read.csv(
  file = "C:/Users/home/Desktop/Rcourse/DataCarpentry33/RawData/portal_data_joined1.csv"
)
surveys <- read.table(
  file = "C:/Users/home/Desktop/Rcourse/DataCarpentry33/RawData/portal_data_joined1.csv",
  header = TRUE,
  sep = ","
)
2. Inspecting dataframe objects
2.1 Contents
- head(): shows the first 6 rows
- tail(): shows the last 6 rows
2.2 Size
- dim(): returns a vector with the number of rows as the first element and the number of columns as the second element (the dimensions of the object)
- nrow(): returns the number of rows
- ncol(): returns the number of columns
2.3 Names
- names(): returns the column names (for data.frame objects, equivalent to colnames())
- colnames(): returns the column names
- rownames(): returns the row names
2.4 Summary
- str(): structure of the object and information about the class, length and content of each column
- summary(): summary statistics for each column
3. Indexing and subsetting data frames
3.1 Indexing for data
v1 <- survey[1, 1]   # row 1, column 1 -> a length-one vector
v2 <- survey[1, ]    # row 1 -> a data frame
v3 <- survey[, 1]    # column 1 -> a vector
v4 <- survey[1]      # column 1 -> a one-column data frame
v5 <- survey[1:3, 6] # elements 1-3 of column 6 -> a vector (original note: the result was not what the author expected)
v6 <- survey[1:6, ]  # first 6 rows, same as head(survey) -> a data frame
3.2 Exclude certain indices of a data frame using the “-” sign
survey[, -1] # the whole table without its first column
# Keep only the first 6 rows (equivalent to head(survey)) by dropping rows
# 7 through the last one. nrow() replaces a hard-coded row count, so this
# works for any table with more than 6 rows.
survey[-(7:nrow(survey)), ]
3.3 Subsetting by calling indices or column names
a <- survey["species_id"]   # a one-column data frame
b <- survey[, "species_id"] # the column as a vector
# NOTE(review): a variable named `c` shadows base::c() by name; calls to c()
# still resolve to the function, but consider renaming.
c <- survey[["species_id"]] # same as above: the column as a vector
d <- survey$species_id      # the column as a vector
4. Factors
4.1 Basic factors
sex <- factor(c("male", "female", "female", "male"))
sex
levels(sex)  # "female" "male" -- by default the levels are in sorted order
nlevels(sex) # 2
# Re-create the factor with an explicit level order. The values themselves do
# not change; only the order of the levels does, which controls the order used
# by plots and summaries.
sex <- factor(sex, levels = c("male", "female"))
sex # same values as before, but the Levels line now reads: male female
4.2 Converting factors
sex_t <- as.character(sex) # convert the factor to a character vector of its labels
sex_t
year_fct <- factor(c(1990, 1983, 1977, 1998, 1990))
# Wrong, and silently so: this returns the factor's internal integer codes,
# not the years.
as.numeric(year_fct)
# Works: convert to the character labels first, then to numbers.
as.numeric(as.character(year_fct))
# The recommended way: convert the levels once, then index them by the codes.
as.numeric(levels(year_fct))[year_fct]
Three steps to convert a factor to numeric:
- We obtain all the factor levels using levels(year_fct).
- We convert these levels to numeric values using as.numeric(levels(year_fct)).
- We then access these numeric values using the underlying integers of the vector year_fct inside the square brackets.
4.3 Renaming factors
# Work on a standalone factor copy of the sex column. Wrapping in factor()
# leaves an existing factor as it is (apart from dropping unused levels) and
# also covers R >= 4.0, where read.csv() keeps strings as character by default.
sex <- factor(survey$sex)
plot(sex) # bar plot of the counts per level, before renaming
sex
head(sex)
levels(sex)
nlevels(sex)
# Rename the first level (presumably the blank label for unrecorded sex --
# TODO confirm against the levels(sex) output above).
# This label must match the one listed in the factor() call below: any level
# not listed there (e.g. a mistyped "determined") is turned into NA and is
# dropped from the final plot.
levels(sex)[1] <- "undetermined"
levels(sex)
head(sex)
# Rename the second and third levels (presumably "F" and "M" -- TODO confirm).
levels(sex)[2:3] <- c("Female", "Male")
levels(sex)
# Reorder the levels so that Female and Male come before undetermined.
sex <- factor(sex, levels = c("Female", "Male", "undetermined"))
levels(sex)
plot(sex) # every renamed level keeps its counts
4.4 Using stringsAsFactors = FALSE
Compare the result of reading string columns as factors versus as character vectors
# Read the same file twice, changing only how string columns are handled.
surveys1 <- read.csv(
  "C:/Users/home/Desktop/Rcourse/DataCarpentry33/RawData/portal_data_joined1.csv",
  stringsAsFactors = TRUE
)
str(surveys1) # string columns are read as factors
surveys2 <- read.csv(
  "C:/Users/home/Desktop/Rcourse/DataCarpentry33/RawData/portal_data_joined1.csv",
  stringsAsFactors = FALSE
)
str(surveys2) # string columns stay character
# Convert only the "plot_type" column into a factor, then inspect again.
surveys2$plot_type <- factor(surveys2$plot_type)
str(surveys2)
Practice
animal_data <- data.frame(
  animal = c("dog", "cat", "sea cucumber", "sea urchin"),
  feel = c("furry", "squishy", "spiny", "sweety"),
  weight = c(45, 8, 1.1, 0.8)
)
# First attempt: some columns mix types, so c() coerces them. temperature and
# northern_hemisphere contain a quoted value and become character;
# has_kangaroo mixes logicals with 1 and becomes numeric.
country_climate <- data.frame(
  country = c("Canada", "Panama", "South Africa", "Australia"),
  climate = c("cold", "hot", "temperate", "hot/temperate"),
  temperature = c(10, 30, 18, "15"),
  northern_hemisphere = c(TRUE, TRUE, FALSE, "FALSE"),
  has_kangaroo = c(FALSE, FALSE, FALSE, 1),
  stringsAsFactors = FALSE
)
str(country_climate)
# Second attempt: every column holds a single type, so temperature is numeric
# and the last two columns are logical.
country_climate <- data.frame(
  country = c("Canada", "Panama", "South Africa", "Australia"),
  climate = c("cold", "hot", "temperate", "hot/temperate"),
  temperature = c(10, 30, 18, 15),
  northern_hemisphere = c(TRUE, TRUE, FALSE, FALSE),
  has_kangaroo = c(FALSE, FALSE, FALSE, TRUE),
  stringsAsFactors = FALSE
)
str(country_climate)
4.5 Formatting Dates
library(tidyverse)
library(lubridate)
# Parse a "year-month-day" string with lubridate's ymd().
my_date <- ymd("2015-01-01")
str(my_date)
# sep indicates the character to use to separate each component
my_date <- ymd(paste("2015", "1", "1", sep = "-"))
str(my_date)
# Build "year-month-day" strings from the three columns, first just to look at them...
paste(survey$year, survey$month, survey$day, sep = "-")
# ...then parsed with ymd()...
ymd(paste(survey$year, survey$month, survey$day, sep = "-"))
# ...and finally stored as a new date column on the data frame.
survey$date <- ymd(paste(survey$year, survey$month, survey$day, sep = "-"))
view(survey) # NOTE(review): lowercase view() is presumably tibble's, attached via tidyverse -- confirm
str(survey)
summary(survey)
summary(survey$date)
# Investigate the rows whose date is missing (the original note says no
# proper fix had been found yet).
is_missing_date <- is.na(survey$date) # TRUE where the date column is NA
date_columns <- c("year", "month", "day")
missing_dates <- survey[is_missing_date, date_columns] # raw date components of those rows
head(missing_dates)
下期预告
Data Manipulation using dplyr and tidyr
Data visualization with ggplot2