基于RCurl包的爬虫技术

library(RCurl)
library(XML)
# HTTP request headers sent with every page download so that the request
# looks like it comes from a regular desktop browser.
myHttpheader <- c(
  "User-Agent"      = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept"          = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection"      = "keep-alive",
  "Accept-Charset"  = "GB2312,utf-8;q=0.7,*;q=0.7"
)
# Load the three lists of identifiers to look up (one identifier per row,
# first column). The values are read as character strings on purpose: with
# read.table()'s default type guessing they would become numbers, which drops
# leading zeros and can turn long values into scientific notation when they
# are later pasted into URLs and file names.
norm <- read.table("正常号码.txt", colClasses = "character", stringsAsFactors = FALSE)
unorm <- read.table("逾期号码.txt", colClasses = "character", stringsAsFactors = FALSE)
code58 <- read.table("58号码.txt", colClasses = "character", stringsAsFactors = FALSE)
# Global placeholders used by the scraping steps below.
# `url` starts out empty (NULL) and is filled with the search URLs later.
url <- NULL
# `doc` holds one slot of scraped text per looked-up identifier;
# 100 empty (NULL) slots are reserved up front.
doc <- vector("list", 100)
# Build one Baidu search URL per row of `file`.
#
# Args:
#   file: data frame (or matrix) whose first column holds the search terms,
#         e.g. phone numbers.
#
# Returns:
#   A character vector with one URL per row of `file`, in row order;
#   character(0) when `file` has no rows.
#
# The previous implementation filled a function-local copy of `url` inside a
# for loop and returned the value of the loop itself (NULL), so callers never
# received the URLs.
find.msg <- function(file) {
  words <- file[, 1]
  # Numeric columns: avoid scientific notation such as "1e+10" in the query.
  if (is.numeric(words)) {
    words <- format(words, scientific = FALSE, trim = TRUE)
  }
  words <- as.character(words)

  # paste0() treats zero-length arguments as "", so handle empty input here.
  if (length(words) == 0L) {
    return(character(0))
  }

  # Percent-encode each term so spaces or non-ASCII text cannot break the URL.
  words <- vapply(
    words,
    function(w) utils::URLencode(w, reserved = TRUE),
    character(1),
    USE.NAMES = FALSE
  )

  paste0(
    "http://www.baidu.com/baidu?tn=56060048_4_pg&ie=utf-8&word=",
    words,
    "&searchRadio=on"
  )
}
# ---- Download and parse the search result page of every identifier ----

# Data set to look up. The script previously called find.msg(file), but no
# variable named `file` is assigned anywhere in the visible script, so the
# name resolved to the base function file() and the call could not work.
# Switch to `unorm` or `code58` to process the other lists.
numbers <- norm
url <- find.msg(numbers)
if (length(url) == 0L) {
  warning("No search URLs were generated; nothing will be downloaded.", call. = FALSE)
}

# Raw HTML of each result page, one element per URL.
# NOTE(review): an `encoding=` argument passed through `...` is handled as a
# curl option (Accept-Encoding) rather than as the text encoding of the
# response; `.encoding` is getURL()'s own argument for that. Confirm against
# the RCurl documentation.
temp <- lapply(url, function(x) {
  getURL(x, httpheader = myHttpheader, .encoding = "UTF-8")
})

# Parse each page into an internal XML tree. Parser errors are deliberately
# ignored because real-world HTML is rarely well-formed.
pagetree <- lapply(temp, function(x) {
  htmlTreeParse(
    x,
    encoding = "utf-8",
    error = function(...) {},
    useInternalNodes = TRUE,
    trim = TRUE
  )
})

# Result titles (<h3 class="t">) and abstracts (<div class="c-abstract">).
news_name <- lapply(pagetree, function(x) {
  xpathSApply(x, "//*/h3[@class='t']", xmlValue)
})
news_body <- lapply(pagetree, function(x) {
  xpathSApply(x, "//*/div [@class='c-abstract']", xmlValue)
})

# Combine title and abstract of each hit as "title\nabstract", one character
# vector per page. Titles and abstracts are paired by position only.
# NOTE(review): if a hit has no abstract, later hits on that page are paired
# with the wrong abstract; verify against the actual page structure.
doc <- Map(
  function(titles, bodies) {
    # xpathSApply() returns an empty list when nothing matches.
    titles <- as.character(unlist(titles))
    bodies <- as.character(unlist(bodies))
    if (length(titles) == 0L) {
      return(character(0))
    }
    paired <- bodies[seq_along(titles)]
    # A missing abstract becomes an empty string instead of the text "NA".
    paired[is.na(paired)] <- ""
    paste(titles, paired, sep = "\n")
  },
  news_name,
  news_body
)
# Write the scraped text of every looked-up identifier to its own file in
# the "正常号码" directory.
#
# For row i of `file`, element i of `docs` is written with write.table() to
# "正常号码/<file[i, 1]>.txt".
#
# Args:
#   file: data frame (or matrix) whose first column supplies the file names.
#   docs: list of scraped results, aligned by position with the rows of
#         `file`. Defaults to the global `doc`, so f(file) works as before.
#
# Returns:
#   NULL, invisibly; the function is called for its side effect.
f <- function(file, docs = doc) {
  n <- nrow(file)
  # 1:n would iterate over c(1, 0) for an empty table; stop early instead.
  if (n == 0L) {
    return(invisible(NULL))
  }
  if (length(docs) < n) {
    stop(
      "`docs` has ", length(docs), " element(s) but `file` has ", n, " row(s).",
      call. = FALSE
    )
  }

  out_dir <- "正常号码"
  # write.table() cannot create a missing directory itself.
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

  for (i in seq_len(n)) {
    write.table(docs[[i]], file.path(out_dir, paste0(file[i, 1], ".txt")))
  }
  invisible(NULL)
}
# Write the scraped text of every looked-up identifier to its own file in
# the "预期号码" directory.
#
# For row i of `file`, element i of `docs` is written with write.table() to
# "预期号码/<file[i, 1]>.txt".
#
# NOTE(review): the input list is read from "逾期号码.txt" (overdue), while
# this directory is spelled "预期号码" (expected). This looks like a typo, but
# the name is kept unchanged because existing data may live there; confirm.
#
# Args:
#   file: data frame (or matrix) whose first column supplies the file names.
#   docs: list of scraped results, aligned by position with the rows of
#         `file`. Defaults to the global `doc`, so f1(file) works as before.
#
# Returns:
#   NULL, invisibly; the function is called for its side effect.
f1 <- function(file, docs = doc) {
  n <- nrow(file)
  # 1:n would iterate over c(1, 0) for an empty table; stop early instead.
  if (n == 0L) {
    return(invisible(NULL))
  }
  if (length(docs) < n) {
    stop(
      "`docs` has ", length(docs), " element(s) but `file` has ", n, " row(s).",
      call. = FALSE
    )
  }

  out_dir <- "预期号码"
  # write.table() cannot create a missing directory itself.
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

  for (i in seq_len(n)) {
    write.table(docs[[i]], file.path(out_dir, paste0(file[i, 1], ".txt")))
  }
  invisible(NULL)
}
# Write the scraped text of every looked-up identifier to its own file in
# the "58号码" directory.
#
# For row i of `file`, element i of `docs` is written with write.table() to
# "58号码/<file[i, 1]>.txt".
#
# Args:
#   file: data frame (or matrix) whose first column supplies the file names.
#   docs: list of scraped results, aligned by position with the rows of
#         `file`. Defaults to the global `doc`, so f2(file) works as before.
#
# Returns:
#   NULL, invisibly; the function is called for its side effect.
f2 <- function(file, docs = doc) {
  n <- nrow(file)
  # 1:n would iterate over c(1, 0) for an empty table; stop early instead.
  if (n == 0L) {
    return(invisible(NULL))
  }
  if (length(docs) < n) {
    stop(
      "`docs` has ", length(docs), " element(s) but `file` has ", n, " row(s).",
      call. = FALSE
    )
  }

  out_dir <- "58号码"
  # write.table() cannot create a missing directory itself.
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

  for (i in seq_len(n)) {
    write.table(docs[[i]], file.path(out_dir, paste0(file[i, 1], ".txt")))
  }
  invisible(NULL)
}










你可能感兴趣的:(R)