1. Download Data

# Fetch the CSV export into a local "data" directory (created on first run)
# and list that directory so the downloaded file can be seen.
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"

if (!file.exists("data")) {
  dir.create("data")
}

# method = "curl" hands the transfer to the external curl program.
download.file(url = fileUrl, destfile = "./data/cameras.csv", method = "curl")

list.files(path = "./data")

2. Reading Local File （.csv）

# Parse the downloaded file as comma-separated text whose first row holds the
# column names, then preview the first rows.
cameraData <- read.table(
  file = "./data/cameras.csv",
  header = TRUE,
  sep = ","
)

head(x = cameraData)

3. Reading Excel File （.xlsx）

library(xlsx)

# Load the first worksheet of the workbook, treating its first row as the
# header.
# NOTE(review): nothing visible here creates ./data/cameras.xlsx — confirm the
# workbook is downloaded before this runs.
cameraData <- read.xlsx(
  file = "./data/cameras.xlsx",
  sheetIndex = 1,
  header = TRUE
)

head(x = cameraData)

## Reading specific rows and columns

# Restrict the import to worksheet columns 2-3 and rows 1-4.
rowIndex <- 1:4
colIndex <- 2:3

cameraDataSubset <- read.xlsx(
  file = "./data/cameras.xlsx",
  sheetIndex = 1,
  colIndex = colIndex,
  rowIndex = rowIndex
)

cameraDataSubset

3. Reading XML and HTML

library(XML)

# Parse a sample XML document and walk its node tree.
fileUrl <- "http://www.w3schools.com/xml/simple.xml"

# The argument is spelled out as `useInternalNodes`; the abbreviated
# `useInternal` only resolved through partial argument matching.
doc <- xmlTreeParse(fileUrl, useInternalNodes = TRUE)

rootNode <- xmlRoot(doc)

# Trailing comments use the ASCII `#`; the full-width `＃` character is not a
# comment marker in R and makes the line fail to parse.
xmlName(rootNode) # name of the root element

names(rootNode) # names of the root's child nodes

rootNode[[1]] # first child node of the root

rootNode[[1]][[1]] # first element inside that first child

xmlSApply(rootNode, xmlValue) # text value of each child of the root

XPath:

/node — top-level node

//node — node at any level

node[@attr-name] — node with an attribute named attr-name

node[@attr-name='bob'] — node whose attribute attr-name equals 'bob'

Information from: http://www.stat.berkeley.edu/~statcur/Workshop2/Presentations/XML.pdf

# Pull the text of every <name> and every <price> node, at any depth, out of
# the parsed XML document.
xpathSApply(rootNode, "//name", xmlValue)

xpathSApply(rootNode, "//price", xmlValue)

# Parse an HTML page and extract list items by their class attribute.
# Each statement sits on its own line: R needs a newline or `;` between
# statements, so running them together on one line is a syntax error.
# NOTE(review): the XPath selectors depend on the page's markup — confirm the
# page still uses the 'score' and 'team-name' classes.
fileUrl <- "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"

doc <- htmlTreeParse(fileUrl, useInternalNodes = TRUE)

scores <- xpathSApply(doc, "//li[@class='score']", xmlValue)

teams <- xpathSApply(doc, "//li[@class='team-name']", xmlValue)

scores

4. Reading JSON

library(jsonlite)

# NOTE(review): the notes had a bare `jsonData` here with no assignment, so
# every line below failed with "object 'jsonData' not found". The fromJSON()
# call is restored from the course's lecture example (GitHub repos API) —
# confirm the URL.
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")

names(jsonData)

jsonData$name

# `owner` is itself a nested structure, so it can be inspected and indexed
# one level further down.
names(jsonData$owner)

jsonData$owner$login

# Writing data frames to JSON

myjson <- toJSON(iris, pretty = TRUE)

cat(myjson)

# Convert back from JSON

iris2 <- fromJSON(myjson)

head(iris2)

5. Data Table

library(data.table)

# Build a data.frame and a data.table with the same column layout.
DF <- data.frame(
  x = rnorm(9),
  y = rep(c("a", "b", "c"), each = 3),
  z = rnorm(9)
)

head(DF, 3)

# Creation and preview are separate statements; fused on one line they do not
# parse.
DT <- data.table(
  x = rnorm(9),
  y = rep(c("a", "b", "c"), each = 3),
  z = rnorm(9)
)

head(DT, 3)

# See all data tables in memory

tables()

# Subsetting rows

DT[2, ]

DT[DT$y == "a", ] # rows where y equals "a"

DT[c(2, 3)] # rows 2 and 3 (a lone index selects rows, not columns)

# Calculating values for variables with expressions

DT[, list(mean(x), sum(z))] # returns two values: mean of x and sum of z

# Adding new columns

DT[, w := z^2]

# Multi-statement expression; tmp is an intermediate variable

DT[, m := {
  tmp <- (x + z)
  log2(tmp + 5)
}]

# plyr-like operations

DT[, a := x > 0] # add a logical column: TRUE where x > 0

DT[, b := mean(x + w), by = a] # mean of x + w computed within each group of a

# Special variable
#
# .N: an integer, length 1, containing the number of elements of a factor
# level. Used with `by`, it gives the row count of each group.

set.seed(123)

DT <- data.table(x = sample(letters[1:3], 1E5, replace = TRUE))

DT[, .N, by = x] # number of rows for each distinct value of x

# Keys (important)

DT <- data.table(x = rep(c("a", "b", "c"), each = 100), y = rnorm(300))

# Declare x as the table's key column.
setkey(DT, x)

# With a key set, a character index is looked up against the key column,
# so this returns the rows whose x is "a".
DT["a"]

# fread: fast file reading

big_df <- data.frame(x = rnorm(1E6), y = rnorm(1E6))

# Write the frame to a temporary tab-separated file. tempfile() and
# write.table() are separate statements; fused on one line they do not parse.
file <- tempfile()

write.table(
  big_df,
  file = file,
  row.names = FALSE,
  col.names = TRUE,
  sep = "\t",
  quote = FALSE
)

# Time how long fread() takes to read the file back in.
system.time(fread(file))

Coursera代码笔记：Getting and cleaning data （1）

1. Download Data

2. Reading Local File （.csv）

3. Reading Excel File （.xlsx）

3. Reading XML and HTML

4. Reading JSON

你可能感兴趣的:(Coursera代码笔记：Getting and cleaning data （1）)