Coursera代码笔记:Getting and cleaning data (1)

1. Download Data

# Make sure the local "data" directory exists before downloading into it.
if (!file.exists("data")) {
  dir.create("data")
}

# CSV export served by data.baltimorecity.gov.
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"

# Fetch the file with the external curl method and store it as cameras.csv.
download.file(
  url = fileUrl,
  destfile = "./data/cameras.csv",
  method = "curl"
)

# Show what is now in the data directory.
list.files(path = "./data")

2. Reading Local File (.csv)

# Load the downloaded CSV: comma-separated, first line holds column names.
cameraData <- read.table(
  file = "./data/cameras.csv",
  header = TRUE,
  sep = ","
)

# Preview the first rows.
head(cameraData)

3. Reading Excel File (.xlsx)

library(xlsx)

# Read the first worksheet of the workbook, treating row 1 as the header.
cameraData <- read.xlsx(
  file = "./data/cameras.xlsx",
  sheetIndex = 1,
  header = TRUE
)

head(cameraData)

# Reading specific rows and columns ----

# Restrict the import to columns 2-3 and rows 1-4 of the sheet.
colIndex <- 2:3
rowIndex <- 1:4

cameraDataSubset <- read.xlsx(
  file = "./data/cameras.xlsx",
  sheetIndex = 1,
  colIndex = colIndex,
  rowIndex = rowIndex
)

cameraDataSubset

3. Reading XML and HTML

library(XML)

# Parse a remote XML document and walk its node tree.
fileUrl <- "http://www.w3schools.com/xml/simple.xml"

# Spell out useInternalNodes rather than relying on partial argument
# matching of the abbreviated name "useInternal".
doc <- xmlTreeParse(fileUrl, useInternalNodes = TRUE)

rootNode <- xmlRoot(doc)

xmlName(rootNode)  # name of the root element

names(rootNode)  # names of the root's child nodes

rootNode[[1]]  # first child of the root

rootNode[[1]][[1]]  # first element inside that first child

xmlSApply(rootNode, xmlValue)  # text value of each child of the root

XPath:

/node — Top level node

//node — Node at any level

node[@attr-name] — Node with an attribute name

node[@attr-name='bob'] — Node with attribute name attr-name='bob'

Information from: http://www.stat.berkeley.edu/~statcur/Workshop2/Presentations/XML.pdf

# Extract the text of every <name> and <price> node at any depth.
xpathSApply(rootNode, "//name", xmlValue)

xpathSApply(rootNode, "//price", xmlValue)

# Scrape an HTML page. The notes had these statements fused onto a single
# line, which does not parse; each statement now sits on its own line.
fileUrl <- "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"

# useInternalNodes is written in full instead of the partial "useInternal".
doc <- htmlTreeParse(fileUrl, useInternalNodes = TRUE)

# Text of all <li> elements whose class attribute is exactly 'score'.
scores <- xpathSApply(doc, "//li[@class='score']", xmlValue)

# Text of all <li> elements whose class attribute is exactly 'team-name'.
teams <- xpathSApply(doc, "//li[@class='team-name']", xmlValue)

scores


4. Reading JSON

library(jsonlite)

# NOTE(review): the original notes never define jsonData, so the lines below
# would fail with "object 'jsonData' not found". The fields accessed (name,
# owner$login) match the GitHub "list repositories for a user" response, so
# this load is assumed -- confirm the intended URL.
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")

names(jsonData)

jsonData$name

# owner is itself a nested data frame inside jsonData.
names(jsonData$owner)

jsonData$owner$login

# Writing data frames to JSON

myjson <- toJSON(iris, pretty = TRUE)

cat(myjson)

# Convert the JSON text back into a data frame

iris2 <- fromJSON(myjson)

head(iris2)

5. Data Table

library(data.table)

# Build a plain data.frame: two random numeric columns and a grouping
# column y with three rows each of "a", "b", "c".
DF <- data.frame(x = rnorm(9), y = rep(c("a", "b", "c"), each = 3), z = rnorm(9))

head(DF, 3)

# Same construction as a data.table. The notes had the constructor and the
# head() call fused on one line, which does not parse; they are split here.
DT <- data.table(x = rnorm(9), y = rep(c("a", "b", "c"), each = 3), z = rnorm(9))

head(DT, 3)

# List all data.tables currently in memory ----

tables()

# Subsetting rows ----

DT[2]  # second row

DT[y == "a"]  # rows where y equals "a"

DT[2:3]  # a single index vector selects rows (here rows 2 and 3)

# Calculating values for variables with expressions ----

DT[, .(mean(x), sum(z))]  # one row: the mean of x and the sum of z

# Adding new columns ----

DT[, w := z^2]

# Multi-step expression: tmp is an intermediate variable ----

DT[, m := {
  tmp <- x + z
  log2(tmp + 5)
}]

# plyr-like operations ----

DT[, a := x > 0]  # add a logical (TRUE/FALSE) column

DT[, b := mean(x + w), by = a]  # mean of x + w computed within each value of a

# Special variable .N ----
# .N: an integer, length 1, containing the number of elements of a factor
# level. (This description was an uncommented prose line in the notes, which
# does not parse as R; it is kept here as a comment.)

set.seed(123)

# 1e5 draws, with replacement, from the letters "a", "b", "c".
DT <- data.table(x = sample(letters[1:3], size = 1E5, replace = TRUE))

DT[, .N, by = x]  # row count for each distinct value of x

# Keys (important) ----

# 300 rows: 100 each of "a", "b", "c" in x, with random values in y.
DT <- data.table(
  x = rep(c("a", "b", "c"), each = 100),
  y = rnorm(300)
)

# Declare x as the table's key.
setkey(DT, x)

# With a key set, a bare character value in i is looked up against the key.
DT["a"]

# fread: fast file reading ----

# One million rows of random data to write out and read back.
big_df <- data.frame(x = rnorm(1E6), y = rnorm(1E6))

# The notes had tempfile() and write.table() fused on one line, which does
# not parse; they are separate statements here.
file <- tempfile()

# Tab-separated, with a header row, no row names and no quoting.
write.table(
  big_df,
  file = file,
  row.names = FALSE,
  col.names = TRUE,
  sep = "\t",
  quote = FALSE
)

# Time how long fread takes to load the file.
system.time(fread(file))

你可能感兴趣的:(Coursera代码笔记:Getting and cleaning data (1))