# Batch crawl and download files - httr: async-loaded data + POST form submission + cookie login
# Asynchronously loaded pages usually return their data as JSON strings
setwd("F:/...")
url_yuan <- "http://www.zjsfgkw.cn"                           # site root, used to initialise the cookie handle
url_post <- "http://www.zjsfgkw.cn/document/JudgmentSearch"   # search endpoint the page POSTs to for JSON results
library("httr")
library("dplyr")
library("jsonlite")
library("curl")
library("RCurl")
library("XML")
library("downloader")
h <- handle(url_yuan)                 # shared handle: cookies persist across requests made with it
GET(handle = h, config = verbose())   # GET url_yuan once so the session cookies are saved on the handle
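# (Optional sketch) To check that session cookies were actually captured, keep
# the landing-page response instead of discarding it and inspect it with
# cookies(); 'r0' is an illustrative name, not part of the original script.
r0 <- GET(handle = h, config = verbose())
cookies(r0)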
# Construct the browser request headers:
headers <- c('Content-Type' = 'application/x-www-form-urlencoded; charset=UTF-8',
             'User-Agent' = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
             'Referer' = 'http://www.zjsfgkw.cn/Document/JudgmentBook',
             'Host' = 'www.zjsfgkw.cn',
             'Origin' = 'http://www.zjsfgkw.cn',
             'Connection' = 'keep-alive')
# Construct the form parameters for the POST body - must be passed as a list
# (a pagination sketch follows the list below)
payload <- list(
  'pageno' = 1,            # page number of the result set
  'ajlb' = "民事",          # case type: civil
  'pagesize' = 200,        # records per page
  'jarq1' = 20080101,      # date range start (yyyymmdd)
  'jarq2' = 20180812       # date range end (yyyymmdd)
)
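# (Optional sketch, assumption: the endpoint pages its results via 'pageno'
# with otherwise identical form fields.) Build one payload per page and loop
# the POST over them if more than one page is needed; 'payloads' is illustrative.
payloads <- lapply(1:5, function(p) modifyList(payload, list(pageno = p)))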
# Single request - fetch the basic metadata of the judgment documents
r <- POST(url_post,
          add_headers(.headers = headers),
          body = payload, encode = "form",
          handle = h,
          use_proxy('219.141.153.41', 80),
          verbose())
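# Optional sanity check: stop early on any non-2xx HTTP status before parsing.
stop_for_status(r)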
result_list <- r %>% content() %>% '[['(1)                  # parsed JSON; the first element holds the record list
mydata <- do.call(rbind, result_list) %>% as.data.frame()   # one row per document
rm(result_list)
rm(r)
# Flatten the list columns into character vectors - keep the basic document metadata
mydata <- data.frame(DocumentId = as.character(mydata$DocumentId),
                     JARQ = as.character(mydata$JARQ),
                     AJLB = as.character(mydata$AJLB),
                     CourtName = as.character(mydata$CourtName),
                     AH = as.character(mydata$AH),
                     CreateTime = as.character(mydata$CreateTime),
                     stringsAsFactors = FALSE)
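# (Optional) Persist the parsed metadata so the download step can be re-run
# without repeating the POST; the file name is illustrative.
write.csv(mydata, "judgment_metadata.csv", row.names = FALSE)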
# Download the documents one by one (a more defensive variant is sketched after the loop)
for (i in 1:nrow(mydata)) {
  documentid <- mydata$DocumentId[i]
  AH <- mydata$AH[i]
  CourtName <- mydata$CourtName[i]
  JARQ <- mydata$JARQ[i]
  url <- paste0("http://www.zjsfgkw.cn/document/JudgmentDetail/", documentid)
  # Parse the detail page and extract the relative download link (first <a> under <h6>)
  rd <- getURL(url, .encoding = "utf-8") %>%
    htmlParse() %>%
    getNodeSet("//h6//a") %>%
    '[['(1) %>%
    xmlGetAttr(name = 'href')
  url_download <- paste(url_yuan, rd, sep = "")          # assemble the absolute download URL
  Encoding(url_download) <- "UTF-8"
  url_download <- iconv(url_download, "UTF-8", "CP936")  # re-encode to CP936 so Windows handles the Chinese characters
  download(url_download, paste(AH, "_", CourtName, "_", JARQ, ".pdf", sep = ""), mode = "wb")
  Sys.sleep(runif(1, 0.5, 0.8))                          # random pause to avoid hammering the server
}
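# (Optional sketch) A more defensive variant of the download call: wrap it in
# tryCatch so one failed document does not abort the whole loop.
# 'safe_download' is an illustrative helper, not part of the original script.
safe_download <- function(u, dest) {
  tryCatch(download(u, dest, mode = "wb"),
           error = function(e) message("skipped ", dest, ": ", conditionMessage(e)))
}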