python scrapy selenium phantomJS爬取动态网页

之前用selenium和phantomJS单线程爬取tyc的对外投资信息,无奈爬取速度太慢,单个企业的抓取耗时大概在30-60s。这还不是最关键的,最令人崩溃的是刚抓取一会就出bug,导致程序中断:爬取程序会卡在某个部分不动,经检查也没发现bug在哪,所以爬虫一直处于手动启动-手动中断-继续爬取的状态。今天学了scrapy,果断用scrapy+selenium+phantomJS来爬。
先上代码

#coding:utf-8
from selenium.webdriver.common.keys import Keys  
import time
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymongo
import xlrd
import time 

import scrapy
from tyc.items import TycItem
import logging
from scrapy.http import Request


class TycSpider(scrapy.Spider):
    """Scrape outbound-investment records of companies from tianyancha.com.

    Company names are read from the first column of ``Sheet1`` in the Excel
    workbook at ``fname`` when the class body is executed; each name becomes
    one search URL in ``start_urls``.  Pages are rendered with PhantomJS, so
    the Scrapy response passed to each callback is only used for its URL.
    """

    name = 'tyc'
    allowed_domains = ['tianyancha.com']
    fname = "C:\\Users\\Administrator\\Desktop\\test.xlsx"
    workbook = xlrd.open_workbook(fname)
    sheet = workbook.sheet_by_name('Sheet1')
    urls = list()
    cols = sheet.col_values(0)
    # URLs to crawl: one search-result page per company name.
    start_urls = ['http://www.tianyancha.com/search?key={}&checkFrom=searchBox'.format(col) for col in cols]

    # User-agent sent by PhantomJS for every page load (shared by both
    # callbacks so the two cannot drift apart).
    _PHANTOMJS_USER_AGENT = (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Mobile Safari/537.36"
    )

    def _new_browser(self):
        """Start and return a PhantomJS driver configured with the user-agent.

        The caller owns the returned driver and must call ``quit()`` on it.
        """
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = self._PHANTOMJS_USER_AGENT
        return webdriver.PhantomJS(desired_capabilities=dcap)

    def parse(self, response):
        """Resolve a search-result page to the company's detail page.

        Yields a single ``Request`` for the first search hit (handled by
        ``parse_detail``), or nothing when the search has no hit.
        """
        # Local import: the file's import header is left untouched.
        from selenium.common.exceptions import NoSuchElementException

        browser = self._new_browser()
        try:
            browser.get(response.url)
            # Fixed pause to give the page's JavaScript time to render.
            time.sleep(4)
            try:
                url = browser.find_element_by_class_name('query_name').get_attribute('href')
            except NoSuchElementException:
                self.logger.info('经查询没有这个企业!')
                return
        finally:
            # Always terminate PhantomJS; otherwise every failed lookup
            # leaves an orphaned browser process behind.
            browser.quit()

        if not url:
            # The hit exists but carries no link, so there is nothing to follow.
            self.logger.info('经查询没有这个企业!')
            return

        self.logger.info('成功搜索到 %s', url)
        yield Request(url=url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract the outbound-investment table from a company detail page.

        Yields one ``TycItem`` per investment row.  Rows with fewer cells
        than expected are skipped instead of aborting the remaining rows.
        """
        browser = self._new_browser()
        try:
            browser.get(response.url)
            self.logger.info('url %s', response.url)
            # Fixed pause to give the page's JavaScript time to render.
            time.sleep(3)
            soup = BeautifulSoup(browser.page_source, 'lxml')
        finally:
            browser.quit()

        headers = soup.select('.base-company')
        if not headers:
            # Without the company header the page did not render as expected.
            self.logger.warning('未找到企业名称 %s', response.url)
            return
        name = headers[0].text.split(' ')[0]
        self.logger.info('企业名 %s', name)

        rows = soup.select('#nav-main-outInvestment .m-plele')
        self.logger.debug('investment rows: %d', len(rows))
        if not rows:
            self.logger.info('这个企业没有对外投资!')
            return

        for row in rows:
            cells = row.select('div')
            if len(cells) < 6:
                # Cells 0 and 2-5 are read below, so at least six are needed.
                self.logger.warning('跳过格式异常的对外投资记录 %s', response.url)
                continue
            # A fresh item per row: reusing one instance would make every
            # yielded item alias the same object.
            item = TycItem()
            item['company'] = name
            item['enterprise_name'] = cells[0].text
            item['legal_person_name'] = cells[2].text
            item['industry'] = cells[3].text
            item['status'] = cells[4].text
            item['reg_captial'] = cells[5].text
            yield item
有几处需要注意:
  • 虽然用selenium模拟浏览器了,但是仍然要添加headers(这里指User-Agent),否则拿到的网页代码还是不全。
  • 现在速度是有些提升了,不过面对海量的数据,还是要利用分布式爬虫scrapy-redis或者scrapyd。
后续继续学习scrapy分布式......

你可能感兴趣的:(python scrapy selenium phantomJS爬取动态网页)