Author: Zhang Changchang
Dynamic pages come in two flavors: (1) pages that require user interaction, such as the familiar login flow; (2) pages whose content is generated dynamically via JS/AJAX, e.g. an HTML page that ships as a mostly empty shell and is filled in by JS after it loads.
Method 1: the direct URL approach
(1) Study the page structure carefully and identify which requests the JS actions trigger;
(2) Use Firefox's Firebug to capture the request URL that a JS click actually issues;
(3) Feed that asynchronous request URL to Scrapy as a start_url, or yield a Request for it, and crawl it directly (a minimal sketch follows this list).
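For instance, if Firebug shows that the click fires a paginated JSON endpoint, the endpoint can be crawled without any browser at all. The sketch below is hypothetical: ajax_url, the rows/nextPage fields, and the response shape all stand in for whatever your own page analysis uncovers:
# -*- coding: utf-8 -*-
import json
from scrapy import Request
from scrapy.spiders import Spider

class AjaxSpider(Spider):
    name = "ajax_spider"
    # Hypothetical endpoint discovered with Firebug; substitute the real one.
    ajax_url = "http://example.com/food/list.json?page=%d"
    start_urls = [ajax_url % 1]

    def parse(self, response):
        # The async endpoint returns JSON directly, so no JS rendering is needed.
        data = json.loads(response.body)
        for row in data.get('rows', []):
            yield {'name': row.get('name'), 'url': row.get('url')}
        # Keep requesting the same endpoint until the server stops paging.
        next_page = data.get('nextPage')
        if next_page:
            yield Request(self.ajax_url % next_page, callback=self.parse)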
Method 2: using Selenium
Selenium, together with its WebDriver API, simulates real user actions in a browser, which makes it good at handling AJAX. It supports multiple browsers (Safari, IE, Firefox, Chrome) and runs on multiple operating systems. Selenium drives the browser through its automation API: it opens a browser window and replays the actions scripted in the program inside it.
The example below crawls food data from the China Food and Drug Administration website. The site is used as the start_url; through Selenium the spider locates, under the first-level category (food), the second-level category option (value 120,145275419693611287728573704379) and simulates a click on it. After parsing the results it keeps clicking the next-page button until the button goes dead, waiting after each click for the corresponding elements to finish rendering.
# -*- coding: utf-8 -*-
import codecs
import json
import sys
import time

from scrapy.crawler import CrawlerProcess
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings
from scrapy.utils.response import get_base_url
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from etao.items import EtaoItem

reload(sys)
sys.setdefaultencoding('utf-8')

class ProductSpider(Spider):
    name = "product1_spider"
    allowed_domains = ["gov.cn"]
    start_urls = [
        "http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=120&tableName=TABLE120&title=%CA%B3%C6%B7%C9%FA%B2%FA%D0%ED%BF%C9%BB%F1%D6%A4%C6%F3%D2%B5%28SC%29&bcId=145275419693611287728573704379",
    ]

    def __init__(self):
        self.file = codecs.open(
            "webdata" + time.strftime('%Y-%m-%d %X', time.localtime()).replace(':', '-') + ".json",
            'w', encoding='utf-8')
        self.driver = webdriver.Firefox()

    def parse(self, response):
        print get_base_url(response)
        self.driver.get(response.url)
        # Find the second-level <select> and click the target category option
        all_options = self.driver.find_element_by_id("s20").find_elements_by_tag_name("option")
        for option in all_options:
            print "Value is: %s" % option.get_attribute("value")
            if option.get_attribute("value") == "120,145275419693611287728573704379":
                option.click()
        # get_item() waits until the result table has rendered, then scrapes it
        self.get_item()
        # Page through the results until the next-page button is disabled
        while True:
            tables = self.driver.find_elements_by_xpath("//div[@id='content']/table")
            pagedown = tables[3].find_elements_by_xpath("descendant::img")[2]
            # A missing onclick attribute means this is already the last page
            if pagedown.get_attribute("onclick") is None:
                break
            pagedown.click()
            self.get_item()
        self.driver.close()

    def close(self, spider):
        self.file.close()

    def get_item(self):
        # Block (up to 10s) until the result tables have been rendered
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[@id='content']/table")))
        tables = self.driver.find_elements_by_xpath("//div[@id='content']/table")
        links = tables[1].find_elements_by_xpath("descendant::a")
        for a in links:
            item = EtaoItem()
            item['name'] = a.text
            contents = a.get_attribute('href').split(",")
            item['url'] = "http://app1.sfda.gov.cn/datasearch/face3/" + contents[1]
            line = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.file.write(line)

if __name__ == '__main__':
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(ProductSpider)
    process.start()
Takeaways:
(1) Selenium's element-access methods: find_elements_by_xpath(), find_element_by_xpath() and get_attribute().
(2) How to use WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@id='content']/table"))) to block until the target elements have rendered.
(3) The drawback of this approach is that it has to open a real browser, so it is not suited to large-scale crawling.
Method 3: scrapyjs (Splash)
Splash is a JavaScript rendering service created by ScrapingHub, the company behind Scrapy. It is a lightweight browser that exposes an HTTP API, implemented in Python on top of Twisted and QT: Twisted gives the service asynchronous processing so it can exploit WebKit's concurrency and render several pages in parallel. It can execute custom JavaScript code and speed up rendering by turning off images or blocking ads.
http://ae.yyuap.com/pages/viewpage.action?pageId=919763
https://docs.docker.com/engine/installation/linux/rhel/
scrapyjs acts as the glue that integrates Splash into Scrapy.
Install Splash:
http://splash.readthedocs.org/en/stable/install.html
yum update
curl -fsSL https://get.docker.com/ | sh
service docker start
Start Splash:
docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash
(ports: 8050 HTTP, 8051 HTTPS, 5023 telnet)
Or, exposing only the HTTP port:
docker run -p 8050:8050 scrapinghub/splash
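Once the container is up, it is worth a quick smoke test against Splash's documented render.html endpoint before wiring it into Scrapy. A minimal check in Python, assuming Splash is listening on localhost:8050 (the target URL here is just an example):
# -*- coding: utf-8 -*-
import urllib
import urllib2

# Ask Splash to fetch the page, run its JS for 0.5s, and return the rendered HTML
params = urllib.urlencode({
    'url': 'http://app1.sfda.gov.cn/datasearch/face3/base.jsp',
    'wait': 0.5,
})
html = urllib2.urlopen('http://localhost:8050/render.html?' + params).read()
print html[:200]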
Install scrapyjs:
pip install scrapyjs
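Before the spider below can talk to Splash, Scrapy needs a few settings. A minimal settings.py sketch based on the scrapyjs README; the middleware priority 725 and the Splash-aware dupefilter are its recommended values, so verify them against the version you actually install:
# settings.py -- wire scrapyjs/Splash into Scrapy
SPLASH_URL = 'http://localhost:8050'  # address of the Splash container

DOWNLOADER_MIDDLEWARES = {
    'scrapyjs.SplashMiddleware': 725,  # routes tagged requests through Splash
}

# Deduplicate on the Splash arguments as well as the plain URL
DUPEFILTER_CLASS = 'scrapyjs.SplashAwareDupeFilter'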
Again using the CFDA food data as the example:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings

class SplashSpider(Spider):
    name = "splash_spider"
    allowed_domains = ["gov.cn"]
    start_urls = [
        "http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=120&tableName=TABLE120&title=%CA%B3%C6%B7%C9%FA%B2%FA%D0%ED%BF%C9%BB%F1%D6%A4%C6%F3%D2%B5%28SC%29&bcId=145275419693611287728573704379",
    ]

    def start_requests(self):
        # Lua script for Splash's /execute endpoint: load jQuery, inject two
        # helper functions, fill in the search keyword, click search, then
        # collect the rendered result rows.
        script = """
        function main(splash)
            splash:autoload("http://code.jquery.com/jquery-1.4.1.min.js")
            splash:autoload([[
                function set_keyword_click(){
                    $('#keyword').val('山东');
                    $('#keyword').next().click();
                }
                function get_content(){
                    var content = '';
                    $('#content table tr a').each(function(){
                        var array = $(this).attr('href').split(',');
                        content += $(this).text() + array[1] + ' ';
                    });
                    return content;
                }
            ]])
            splash:go("http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=120&tableName=TABLE120&title=%CA%B3%C6%B7%C9%FA%B2%FA%D0%ED%BF%C9%BB%F1%D6%A4%C6%F3%D2%B5%28SC%29&bcId=145275419693611287728573704379")
            splash:wait(0.5)
            splash:evaljs("set_keyword_click()")
            splash:wait(1)
            local content = splash:evaljs("get_content()")
            return content
        end
        """
        meta = {
            'splash': {
                'endpoint': 'execute',
                'args': {'lua_source': script}
            }
        }
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, meta=meta)

    def parse(self, response):
        # The response body is whatever the Lua script returned
        content = response.body_as_unicode()
        print content

if __name__ == '__main__':
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(SplashSpider)
    process.start()