Coursera Code Notes: Getting and Cleaning Data (2)

1. Reading from MySQL

Step 1 - Install MySQL

Step 2 - Install RMySQL - install.packages("RMySQL")

Connecting and listing databases

ucscDb<-dbConnect(MySQL(),user="genome",host="genome-mysql.cse.ucsc.edu")

# create a connection handle

result<-dbGetQuery(ucscDb,"show databases;"); 

# assign the query result to result

dbDisconnect(ucscDb);

Connecting to hg19 and listing tables

hg19<-dbConnect(MySQL(),user="genome",db="hg19",host="genome-mysql.cse.ucsc.edu")   #connect to the hg19 database

allTables<-dbListTables(hg19)

length(allTables)  #count how many tables the database contains

dbListFields(hg19,"affyU133Plus2")  #list the columns (fields) of one table

dbGetQuery(hg19,"select count(*) from affyU133Plus2")  #count the rows of the table

Read from the table

affyData<-dbReadTable(hg19,"affyU133Plus2")

head(affyData)

Select a specific subset

query<-dbSendQuery(hg19,"select * from affyU133Plus2 where misMatches between 1 and 3")

affyMis<-fetch(query); 

quantile(affyMis$misMatches)


affyMisSmall<-fetch(query,n=10);  # fetch only 10 rows; note the full fetch above already consumed the result set, so re-send the query first if this comes back empty

dbClearResult(query);

Don't forget to close the connection!

dbDisconnect(hg19)
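Since it is easy to forget the disconnect, here is a minimal sketch (my addition, not from the course) that wraps a query in a function and uses on.exit() so the connection is closed even if the query fails:

queryHg19<-function(sql){
  con<-dbConnect(MySQL(),user="genome",db="hg19",host="genome-mysql.cse.ucsc.edu")
  on.exit(dbDisconnect(con))  # runs when the function exits, even on error
  dbGetQuery(con,sql)
}

queryHg19("select count(*) from affyU133Plus2")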


2. HDF5 (Hierarchical Data Format)

R HDF5 package (installed through Bioconductor; on current Bioconductor releases, BiocManager::install("rhdf5") replaces the older biocLite() shown below)

source("http://bioconductor.org/biocLite.R")

biocLite("rhdf5")

library(rhdf5)

created=h5createFile("example.h5")

created

Create groups

created=h5createGroup("example.h5","foo")

created=h5createGroup("example.h5","baa")

created=h5createGroup("example.h5","foo/foobaa")

h5ls("example.h5")

Write to groups

A=matrix(1:10,nr=5,nc=2)

h5write(A,"example.h5","foo/A")

B=array(seq(0.1,2.0,by=0.1),dim=c(5,2,2))

attr(B,"scale")<-"liter"

h5write(B,"example.h5","foo/foobaa/B")

h5ls("example.h5")

Write a data set

df=data.frame(1L:5L,seq(0,1,length.out=5),  c("ab","cde","fghi","a","s"),stringsAsFactors=FALSE)

h5write(df,"example.h5","df")

h5ls("example.h5")

Reading data

readA=h5read("example.h5","foo/A")

readB=h5read("example.h5","foo/foobaa/B")

readdf=h5read("example.h5","df")

readA

Writing and reading chunks

h5write(c(12,13,14),"example.h5","foo/A",index=list(1:3,1))

h5read("example.h5","foo/A")


3. Webscraping (HTML)

Getting data off webpages - readLines()

con=url("http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en")

htmlCode=readLines(con)

close(con)

htmlCode

Parsing with XML

library(XML)

url<-"http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"

html<-htmlTreeParse(url,useInternalNodes=T)

xpathSApply(html,"//title",xmlValue)

xpathSApply(html,"//td[@id='col-citedby']",xmlValue)


GET from the httr package

library(httr);

html2=GET(url)

content2=content(html2,as="text")

parsedHtml=htmlParse(content2,asText=TRUE)

xpathSApply(parsedHtml,"//title",xmlValue)

Accessing websites with passwords

pg2=GET("http://httpbin.org/basic-auth/user/passwd",authenticate("user","passwd"))

pg2

Response [http://httpbin.org/basic-auth/user/passwd]

Status: 200

Content-type: application/json

{

"authenticated": true,

"user": "user"

}
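For contrast (my addition, not in the original notes), the same URL without authenticate() is refused, since httpbin only accepts the matching credentials:

pg1=GET("http://httpbin.org/basic-auth/user/passwd")

pg1  # Status: 401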

Using handles (a handle preserves cookies and settings, so once you authenticate, later requests through the same handle stay authenticated)

google=handle("http://google.com")

pg1=GET(handle=google,path="/")

pg2=GET(handle=google,path="search")

R Bloggers has a number of examples of web scraping: http://www.r-bloggers.com/?s=Web+Scraping


4. APIs (Application Programming Interfaces)

Step 1. Creating an application

Step 2. Accessing Twitter from R

myapp=oauth_app("twitter",key="yourConsumerKeyHere",secret="yourConsumerSecretHere")

sig=sign_oauth1.0(myapp,token="yourTokenHere",token_secret="yourTokenSecretHere")

homeTL=GET("https://api.twitter.com/1.1/statuses/home_timeline.json",sig)

Converting the json object

json1=content(homeTL)

json2=jsonlite::fromJSON(jsonlite::toJSON(json1))

json2[1,1:4]

httr allows GET, POST, PUT, and DELETE requests if you are authorized, and it works well with Facebook, Google, Twitter, GitHub, etc.
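As a quick sketch of a non-GET verb (my addition; httpbin.org simply echoes what it receives, and the body fields are made up):

library(httr)

resp=POST("http://httpbin.org/post",body=list(course="getdata",week=2))

content(resp,"parsed")  # httpbin echoes the form fields back as JSON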


5. Reading from Other Sources

STATA, SPSS, SAS (see the foreign package sketch after this list)

Image

GIS

Music
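For the statistical formats, the foreign package covers the common cases; a minimal sketch (my addition, with hypothetical file names):

library(foreign)

stataData<-read.dta("example.dta")  # Stata

spssData<-read.spss("example.sav",to.data.frame=TRUE)  # SPSS

sasData<-read.xport("example.xpt")  # SAS XPORT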
