Getting Started with R Web Scraping Using the rvest/XML/RCurl Packages

CSS selector reference: https://www.runoob.com/cssref/css-selectors.html
XPath selector reference: https://www.runoob.com/xpath/xpath-tutorial.html
Finding nodes with XPath: https://www.cnblogs.com/txwen/p/7999485.html
Video tutorials: https://www.bilibili.com/video/av30320885 https://www.bilibili.com/video/av36907341?from=search&seid=12650656636924845868 https://www.bilibili.com/video/av39807071?p=7
Reference article: https://zhuanlan.zhihu.com/p/22916652
An entertaining explanation of GET vs. POST: https://zhuanlan.zhihu.com/p/22536382
A walkthrough of RCurl: https://blog.csdn.net/kMD8d5R/article/details/78933384
HTML tags and basic HTTP knowledge: https://www.w3school.com.cn/tags/html_ref_byfunc.asp
POST/GET formatting tool: http://coolaf.com
How to inspect the parameters of a POST request: https://jingyan.baidu.com/article/d45ad1487f057669552b8030.html
A beginner-friendly scraping tutorial: https://www.jianshu.com/p/0c0cb9867b44
Hands-on scraping of several sites: https://blog.csdn.net/hill_night/article/details/45789655?locationNum=12&fps=1
Hands-on scraping of stock data from a finance site: http://blog.sina.com.cn/s/blog_685d10480102wyn9.html
Scraping Sina Weibo by simulating login with postForm: http://www.dataguru.cn/article-873-1.html
A case study of simulated login to scrape an academic-affairs site: https://blog.csdn.net/kMD8d5R/article/details/78737442
Simulating login on pages that require a username and password: https://www.zhihu.com/question/65799576
An explanation of the cookies a simulated login may need: https://www.jianshu.com/p/6fc9cea6daa2
Simulating login by replaying cookies (a minimal sketch follows this list): https://www.cnblogs.com/huahuayu/p/8207037.html
When a request is submitted via GET, Chinese characters are automatically percent-encoded in the URL; an introduction to this kind of encoding (see the second sketch after this list): https://www.cnblogs.com/niuyaomin/p/11788732.html
URL-encoding converter: http://web.chacuo.net/charsetbase64
A nice scraping example: https://ask.hellobi.com/blog/R_shequ/33920
rvest simulating browsing behavior: https://blog.csdn.net/weixu22/article/details/79237512 and https://blog.csdn.net/Joyliness/article/details/78722317
Advanced scraping tutorial: https://www.jianshu.com/p/1fc6a6817160
rvest simulating clicks on a page: https://www.jb51.cc/html/224799.html
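
A minimal sketch of the cookie-replay approach from the login links above. It assumes you copy the Cookie header out of a logged-in browser session (DevTools -> Network); the URL and cookie string below are placeholders, not real values:

library(RCurl)
my_cookie <- "SESSIONID=xxxx; token=yyyy"  # hypothetical value copied from the browser
page <- getURL("https://example.com/protected",
               httpheader = c("User-Agent" = "Mozilla/5.0",
                              "Cookie"     = my_cookie))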
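
On the URL-encoding point: base R's URLencode()/URLdecode() handle this percent-encoding. Note that the Tmall URL used below encodes 眼睫毛 ("eyelashes") as GBK bytes (%D1%DB%BD%DE%C3%AB), whereas URLencode() works on UTF-8 strings, so the two results differ:

URLencode("眼睫毛", reserved = TRUE)      # "%E7%9C%BC%E7%9D%AB%E6%AF%9B" (UTF-8 bytes)
URLdecode("%E7%9C%BC%E7%9D%AB%E6%AF%9B")  # back to "眼睫毛"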

library(rvest)
url_eye<-"https://list.tmall.com/search_product.htm?q=%D1%DB%BD%DE%C3%AB&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton"
url_eye
html_session(url_eye)#quick reachability check; returns a session object
web_eye<-read_html(url_eye,encoding = "GBK")#Tmall pages are GBK-encoded
pic<-html_nodes(web_eye,xpath = '//div[@class="view grid-nosku "]//div[@class="productImg-wrap"]//img')
pic
pic_dir<-html_attr(pic,"src")#mostly NA: the images are lazy-loaded
pic_dir<-html_attr(pic,"data-ks-lazyload")#the real image URLs live in this attribute
pic_dir<-pic_dir[!is.na(pic_dir)]#drop nodes without a lazy-load URL
pic_dir<-paste("http:",pic_dir,sep = "")#the attribute holds protocol-relative URLs
for (i in seq_along(pic_dir)) {#loop over however many images were found, not a hard-coded 55
  download.file(pic_dir[i],paste(i,".jpg",sep = ""),mode = "wb")#mode="wb" keeps binary files intact on Windows
}
#Scrape the sample accessions and titles from GEO!!!
#Notes: 1. Always check your XPath against the HTML file actually downloaded, because what the browser renders can differ from the raw source!
#2. If an XPath query returns a NULL node set, try a trivial query such as '/div' to verify that the parsed HTML document is valid!
library(XML)
library(RCurl)
library(rvest)
url<-"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056"
myheader<-c("User-Agent"= "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36")
myheader
temp<-getURL(url,httpheader=myheader,.encoding = "utf-8")#fetch the raw HTML with a browser-like User-Agent
temp
temp<-htmlParse(temp)#parse the character string into an HTML document
temp
nodes<-getNodeSet(temp,path = '//table[@style="position:relative;top:-5px;left:-5px"]/tr')#rows of the sample table
nodes
value<-sapply(nodes,xmlValue)#text of each row: sample accession plus title
value
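#A hedged alternative for the same table using rvest's html_table(); this assumes the <table> XPath above still matches the live page:
web<-read_html(url)
tbl<-html_nodes(web,xpath = '//table[@style="position:relative;top:-5px;left:-5px"]')
samples<-html_table(tbl[[1]],fill = TRUE)#fill = TRUE tolerates ragged rows
head(samples)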

#Scrape listings from Dianping. Notes: 1. Taobao and Tmall require login before they can be scraped (or perhaps fail for some other reason). 2. If you suspect the downloaded HTML is wrong, save it as an .html file and open it in a browser to inspect!
library(RCurl)
library(XML)
url<-"http://www.dianping.com"
myheader<-c(
  "User-Agent"="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language"="en-us",
  "Connection"="keep-alive",
  "Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)
temp<-getURL(url,httpheader=myheader,.encoding = "utf-8")
temp
writeLines(temp,"temp.html")#writeLines keeps the raw HTML; write.table would add quoting and row names
temp<-htmlParse(temp)
temp<-getNodeSet(temp,'//div[@class="shop-item"]//img')
temp
temp<-lapply(temp,xmlAttrs)#extract all attribute values of each node
temp<-sapply(temp,function(x){x=x[names(x)=="lazy-src"];return(x)})#keep only the "lazy-src" attribute, which holds the real image URL
temp


library(rvest)
url<-"http://www.dianping.com"
temp<-read_html(url,encoding = "utf-8")
node<-html_nodes(temp,xpath = '//div[@class="shop-item"]//img')%>%html_attr(name = "src")#note: for lazy-loaded images the real URL may sit in "lazy-src", as in the RCurl example above
node
url<-"http://www.baidu.com"
#rvest

web<-read_html(url,encoding = "utf-8")
web
#html_document
temp1<-html_nodes(web,xpath = "//img")
temp1
#[1] "xml_nodeset"
attrs<-html_attrs(temp1)
#no mojibake (garbled characters)


#RCurl+XML
library(RCurl)
library(XML)
# html_session(url)  # rvest leftover; not needed for the RCurl+XML approach
web<-getURL(url = url,.encoding = "utf-8")
web
#character
temp<-htmlParse(web)
class(temp)
#"HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument"  "XMLAbstractDocument"
#temp<-htmlTreeParse("./1688.html",encoding = "UTF-8")
#temp
#"XMLDocumentContent"
temp<-getNodeSet(temp,"//img")
temp
#[1] "XMLNodeSet"
attrs2<-lapply(temp,xmlAttrs)
#mojibake (garbled characters) appears
iconv(attrs2[[4]],from = "UTF-8")
#converting from UTF-8 to the native encoding removes the mojibake
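#To apply the same fix to every node at once (a small sketch over the attrs2 list built above):
attrs2_clean<-lapply(attrs2,iconv,from = "UTF-8")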
#Parsing a local HTML file with the scraper
#Note: Taobao, Tmall, JD, etc. currently cannot be scraped either from the URL or from a locally saved copy
#rvest
web<-read_html("./dazhongdianping.html",encoding = "utf-8")
web
nodes<-html_nodes(web,xpath = "//img")
nodes


#RCurl+XML
web1<-htmlParse("./dazhongdianping.html",encoding = "utf-8")
web1
nodes1<-getNodeSet(web1,"//img")
nodes1
#Important!!!! Per the documentation of getNodeSet() and html_nodes(), both currently support only XPath 1.0 selectors. Also, an exact match like [@class="xxxxxx xxxxx"] is not recognized when the class attribute holds multiple space-separated values; use //div[contains(@class,'...')] instead, and note the single quotes!!!!
url<-"https://sh.lianjia.com/ershoufang/"
web2<-read_html(url,encoding = "utf-8")
web2<-html_nodes(web2,xpath = "//ul[@class='sellListContent']/li[contains(@class,'clear')]/a")#this also runs fine, but the inner quotes must be single quotes!!
web2
#scraped successfully
#an exact match such as //ul[@class="sellListContent"]/li[@class="clear  LOGCLICKDATA"]/a fails!!
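#A natural follow-up (a sketch using the nodeset above): pull each listing's URL out of the matched <a> nodes
links<-html_attr(web2,"href")#one href per matched listing
head(links)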
