# 传送:RSelenium基本用法总结
# Python与R协同完成【中国裁判文书网】文书内容爬取
#以执业机构(即律师事务所)作为条件输入
system("java -jar F:/.../SeleniumSever/selenium-server-standalone-3.8.1.jar",wait=F)
system("java -Dwebdriver.chrome.driver=F:/.../SeleniumSever/chromedriver.exe",wait=F)
setwd("F:/...")
library("readxl")
library("magrittr")
library("RSelenium")
library("rvest")
library("dplyr")
#读取之前爬的律师事务所名称
law_firm_info<-read_xls("law_firm_info.xls") %>% select(c(1,4)) %>% unique()
remDr<-remoteDriver(browserName="chrome")
remDr$open()
url<-"http://wenshu.court.gov.cn/"
#函数1:审理程序+文书名称+审理法院+文书字号+审理日期
get_info_one<-function(tian){
trial_procedure<-tian %>% '[['(1) %>% read_html() %>% html_nodes(xpath="//*[@id='resultList']/div/div[1]") %>% html_text() #审理程序
document_name<-tian %>% '[['(1) %>% read_html() %>% html_nodes(xpath="//*[@id='resultList']/div/table/tbody/tr[1]/td/div/a[2]") %>% html_text() #文书名称
tmp<-tian %>% '[['(1) %>% read_html() %>% html_nodes(xpath="//*[@id='resultList']/div/table/tbody/tr[2]/td/div") %>% html_text() #审理法院+文书字号+审理日期
tmp_2<-data.frame()
for (j in seq_along(tmp)) {
tmp_1<-unlist(strsplit(tmp[j],"\u00A0\u00A0\u00A0\u00A0"))
tmp_1<-data.frame(court_name=tmp_1[1],document_number=tmp_1[2],date_trial=tmp_1[3],stringsAsFactors=F)
tmp_2<-rbind(tmp_2,tmp_1)
rm(tmp_1)
}
gg<-cbind(trial_procedure,document_name,tmp_2,stringsAsFactors=F)
return(gg)
}
#函数2:窗口切换,只允许在两个窗口间不断切换
window_switch_gettext<-function(tian) {
data_frame_huizong<-data.frame()
for (j in seq_along(as.character(tian$document_number))) {
css_tmp<-paste0("#resultList > div:nth-child(",j,") > table > tbody > tr:nth-child(1) > td > div > a:nth-child(4)")
btn<-remDr$findElement(using='css selector',value=css_tmp)
btn$clickElement() #此时两个窗口
Sys.sleep(runif(1,9,15))
rm(css_tmp)
#将窗口切换至新打开的窗口-即第二个窗口-使用窗口句柄函数
remDr$switchToWindow(windowId=remDr$getWindowHandles()[[2]])
#在第二个打开窗口-获取文书的详细信息-字符串格式
text_tmp<-remDr$getPageSource() %>% '[['(1) %>% read_html() %>% html_nodes("#DivContent div") %>% html_text()
data_frame_tmp<-as.data.frame(cbind(as.character(tian$document_number)[j],text_tmp),stringsAsFactors=F)
rm(text_tmp)
colnames(data_frame_tmp)<-c("document_number","text")
data_frame_tmp<-data_frame_tmp[which(nchar(as.character(data_frame_tmp$text))>1),]
data_frame_huizong<-rbind(data_frame_huizong,data_frame_tmp)
rm(data_frame_tmp)
#将第二个窗口关闭
remDr$closeWindow()
#将窗口切换到初始窗口
remDr$switchToWindow(windowId=remDr$getWindowHandles()[[1]])
}
return(data_frame_huizong)
}
document_basic_info<-data.frame() #存储文书基本信息
document_text_info<-data.frame() #存储文书详细信息
remDr$navigate(url)
btn<-remDr$findElement(using='css selector',value='#head_maxsearch_btn') #点击高级搜索
btn$clickElement()
#定位律所文本框+输入文本
btn<-remDr$findElement(using='css selector',value='#adsearch_LS')
btn$sendKeysToElement(as.list(as.character(law_firm_info[1,1])))
#定位搜索按钮+点击
btn<-remDr$findElement(using='css selector',value='#list_btnmaxsearch')
btn$clickElement()
Sys.sleep(runif(1,5,10))
rm(btn)
#判断搜索结果是否有数据,如果没有数据则进入下个事务所(这个语句要写在循环里面,作为一个条件判断)
if ((remDr$getPageSource() %>% '[['(1) %>% read_html() %>% html_nodes(xpath="//*[@id='resultList']") %>% html_text())=="无符合条件的数据...")
next
#搜索结果有数据:获取当前界面的文书基本信息
tmp_tian<-get_info_one(remDr$getPageSource())
tmp_tian$law_firm<-as.character(law_firm_info[1,1])
document_basic_info<-rbind(document_basic_info,tmp_tian)
#获取文书详细内容
document_text_tmp<-window_switch_gettext(tmp_tian)
document_text_info<-rbind(document_text_info,document_text_tmp)
rm(tmp_tian)
rm(document_text_tmp)
gc()
#以上信息均是首页数据
#循环点击下一页爬取数据
j=1
while (1==1) {
j<-j+1
trycatch_value_2<-tryCatch(
{
#定位[下一页]按钮,仍然为一个窗口
btn<-remDr$findElement(using='css selector',value='#pageNumber > a.next');
btn$clickElement();1+1
},
error=function(e) return(paste0(law_firm_info[1,1],"-律师事务所的第",j,"页,在点击下一页时出现错误,错误信息:",e$message))
)
if (trycatch_value_2 != 2) {print(trycatch_value_2);break} else {
Sys.sleep(runif(1,10,17))
#判断页面是否刷新出来--当前页的页码 ?=j
trycatch_value_3<-tryCatch(
{
m<-0;
for (n in 1:10) {
m<-m+1
page_tmp<-remDr$getPageSource() %>% '[['(1) %>% read_html() %>% html_nodes("#pageNumber > span") %>% html_text()
if (as.numeric(page_tmp)==j) break else
{Sys.sleep(runif(1,10,17)+m);next}
};1+1
},
error=function(e) return(paste0(law_firm_info[1,1],"-律师事务所的第",j,"页,页面未刷新出来,错误信息:",e$message))
)
if (trycatch_value_3 != 2) {print(trycatch_value_3);break} else {
trycatch_value_1<-tryCatch(
{
tmp_tian<-get_info_one(remDr$getPageSource());
tmp_tian$law_firm<-as.character(law_firm_info[1,1]);
document_basic_info<-rbind(document_basic_info,tmp_tian);
document_text_tmp<-window_switch_gettext(tmp_tian);
document_text_info<-rbind(document_text_info,document_text_tmp);
rm(tmp_tian);
rm(document_text_tmp);
1+1},
error=function(e) return(paste0(law_firm_info[1,1],"-律师事务所的第",j,"页在抓取详细信息时出现错误,错误信息:",e$message))
)
if (trycatch_value_1 != 2) {print(trycatch_value_1);break}
}
}
}
#数据存储
write.table(document_basic_info,"document_basic_info.csv",append = F,col.names = T,row.names = F,sep=",")
write.table(document_text_info,"document_text_info.csv",append = F,col.names = T,row.names = F,sep=",")