# RCurl包批量爬取美女图片

# ---- Timing: remember when the script started ----
timestart <- Sys.time()
# Print the start time.
print(timestart)
# ---- The code being timed goes below ----


# Query the current working directory; the value is only visible when
# top-level results are auto-printed.
getwd()
# Move into the tutorial folder (the path is relative to the directory above).
# NOTE(review): setwd() changes the working directory for the rest of the
# session and errors if the folder is missing -- files opened by relative
# name later on will be created here; confirm the folder exists beforehand.
setwd("./我的R/RCurl包学习/")
library(RCurl)
library(XML)
library(stringr)
# Browser-like HTTP request headers, stored as a named character vector
# (header name -> header value).
myheader <- c(
  `User-Agent`      = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  `Accept`          = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  `Accept-Language` = "en-us",
  `Connection`      = "keep-alive",
  `Accept-Charset`  = "GB2312,utf-8;q=0.7,*;q=0.7"
)
# Entry page of the gallery section, and the site root that is prepended to
# the hrefs found on that page to build absolute URLs.
url <- "http://www.zbjuran.com/mei/"
url_ori <- "http://www.zbjuran.com"

# Download the index page, sending the browser-like headers in `myheader`
# (previously the headers were defined but never attached to any request).
# NOTE(review): the page is fetched with .encoding = "gb2312" but parsed as
# UTF-8; only href attributes are read below -- confirm the encodings before
# relying on any page text.
wp <- getURL(url, httpheader = myheader, .encoding = "gb2312",
             followlocation = TRUE)
doc <- htmlParse(wp, asText = TRUE, encoding = "UTF-8")

# Select only anchors that actually carry an href: an <a> without one makes
# xmlGetAttr() return NULL, which turns the result into a list. unlist() +
# as.character() also normalises the "no match" case to character(0).
sex_url <- as.character(unlist(
  xpathSApply(doc, path = "//a[@href]", xmlGetAttr, "href")
))

# Keep links containing a run of four digits (the gallery page links).
sex_url <- sex_url[!is.na(sex_url) & str_detect(sex_url, "[0-9]{4}")]

# Make the links absolute. paste0() recycles a zero-length vector to "", so
# an empty link set must be handled explicitly or it would become the bare
# site root.
sex_url <- if (length(sex_url) > 0) paste0(url_ori, sex_url) else character(0)

# Sorted, de-duplicated URLs (same result as the factor/levels round trip).
sex_url <- sort(unique(sex_url))

# Every gallery is assumed to span up to 30 pages named <base>_2.html ...
# <base>_30.html; build those candidate URLs for each gallery link.
page_suffixes <- paste0("_", 2:30, ".html")

# fixed() makes ".html" a literal match. As a regular expression the dot
# matches any character, so a URL containing e.g. "xhtml" earlier in its
# path would have been rewritten in the wrong place.
# lapply() + unlist() keeps the original ordering (all "_2" pages, then all
# "_3" pages, ...) without growing a vector inside a loop.
x <- unlist(lapply(page_suffixes, function(suffix) {
  str_replace(sex_url, pattern = fixed(".html"), replacement = suffix)
}))

# Candidate follow-up pages first, the original first pages last.
sex_url <- c(x, sex_url)
  



# Visit every candidate gallery page and save the picture(s) found inside
# <div class="picbox"> into the current working directory.
for (i in seq_along(sex_url)) {
  page_url <- sex_url[i]

  # A failure on one page (network error, unparsable HTML, unwritable file)
  # is reported and skipped instead of aborting the whole crawl.
  tryCatch({
    # Many of the generated "_N.html" candidates do not exist; skip those.
    if (url.exists(page_url)) {
      wp1 <- getURL(page_url, httpheader = myheader, .encoding = "gb2312",
                    followlocation = TRUE)
      doc1 <- htmlParse(wp1, asText = TRUE, encoding = "UTF-8")

      # Zero, one or several images may match. Normalising to a character
      # vector lets the inner loop simply do nothing on a page without a
      # picture, where paste() would otherwise have produced the bare site
      # root and an NA file name.
      img_src <- as.character(unlist(
        xpathSApply(doc1, path = "//div[@class='picbox']//img[@src]",
                    xmlGetAttr, "src")
      ))

      for (src in img_src) {
        # Site-relative sources get the site root prepended; sources that
        # are already absolute are used unchanged.
        url_pic <- if (grepl("^https?://", src)) src else paste0(url_ori, src)

        # Fetch the raw bytes of the image.
        temp <- getBinaryURL(url_pic, httpheader = myheader)

        # Name the file after the last path component of the image URL
        # rather than a hard-coded segment index. Given a file name,
        # writeBin() opens the file in binary mode and closes it itself, so
        # no connection is left open if the write fails.
        writeBin(temp, basename(url_pic))
      }

      # Optional politeness delay between pages:
      # Sys.sleep(time = 30)
    }
  }, error = function(e) {
    message("Skipping ", page_url, ": ", conditionMessage(e))
  })
}
 
  


# ---- Timing: record when the script finished ----
timeend <- Sys.time()
# Print the end time.
print(timeend)
# Elapsed time between start and end, as a difftime.
runningtime <- difftime(timeend, timestart)
# Report how long the run took.
print(runningtime)
  

# 你可能感兴趣的:(RCurl包批量爬取美女图片)