Python + Scrapy + Selenium Crawler

  1. For setting up a Python venv, see the earlier steps. Roughly, the required packages are scrapy and selenium.
    If you use a database, then for MongoDB you need pymongo, MongoDB itself installed on the system, and a GUI such as Robomongo;
    or for SQLite you need sqlite3 plus a viewer such as Navicat for SQLite.
    With Selenium you can drive Firefox, Chrome, etc., or use PhantomJS.
    Note that webdriver.Chrome() requires downloading chromedriver.exe and placing it in the Scripts directory of the corresponding Python interpreter; the PhantomJS executable goes in the same directory.

  2. For how to use the Scrapy framework, refer to the official Scrapy getting-started tutorial.

Using my walkthrough of someone else's demo as an example, here is the version without Selenium first.

  1. Create a venv (requires virtualenv and virtualenvwrapper)
mkvirtualenv [-i package] [-r requirements_file] [virtualenv options] ENVNAME

(By default this creates the virtual environment under C:\Users\username\Envs\; the driver executables mentioned above, such as chromedriver.exe, then need to be copied into its Scripts folder. You can also create the environment directly in PyCharm.)
Afterwards you can enter the virtual environment with workon venv1. A concrete example is sketched below.
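
For example, assuming a requirements.txt that lists the packages mentioned above (the environment name crawler_env is just a placeholder), the contents of requirements.txt could be:

scrapy
selenium
pymongo

and the environment could be created and entered like this (on Windows the mkvirtualenv/workon commands come from the virtualenvwrapper-win package):

pip install virtualenv virtualenvwrapper-win
mkvirtualenv -r requirements.txt crawler_env
workon crawler_env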
2. Create a new Scrapy project

scrapy startproject TestSpider
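
This generates a project skeleton roughly like the following (newer Scrapy versions also add a middlewares.py):

TestSpider/
    scrapy.cfg            # deployment configuration
    TestSpider/
        __init__.py
        items.py          # item definitions
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spider modules go here
            __init__.py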

3. Open the project directory in PyCharm, configure the interpreter, and set up a run configuration via Run -> Edit Configurations.
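One common way to run the spider from PyCharm is a small entry script in the project root that calls Scrapy's command line; the file name main.py is just a placeholder, and the spider name bbsSpider matches the spider defined later in this demo:

# main.py - set this file as the script in Run -> Edit Configurations
from scrapy import cmdline

# equivalent to running "scrapy crawl bbsSpider" in a terminal
cmdline.execute("scrapy crawl bbsSpider".split())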

4. In items.py:

import scrapy


class TestspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    forum = scrapy.Field()
    poster = scrapy.Field()
    content = scrapy.Field()

Create a new XXX.py in the spiders folder; it contains the spider's main logic:

import scrapy
from scrapy.linkextractors import LinkExtractor
from TestSpider.items import TestspiderItem


class BbsSpider(scrapy.Spider):
    name = "bbsSpider"
    allowed_domains = ["bbs.sjtu.edu.cn"]
    start_urls = (
        'https://bbs.sjtu.edu.cn/bbsall',
    )
    # link extractors for board pages, pagination links, and post pages
    link_extractor = {
        'page': LinkExtractor(allow=r'bbsdoc,board,\w+\.html$'),
        'page_down': LinkExtractor(allow=r'bbsdoc,board,\w+,page,\d+\.html$'),
        'content': LinkExtractor(allow=r'bbscon,board,\w+,file,M\.\d+\.A\.html$'),
    }
    # XPath expressions for the fields extracted from a post page
    _x_query = {
        'page_content': '//pre/text()[2]',
        'poster': '//pre/a/text()',
        'forum': '//center/text()[2]',
    }

    def parse(self, response):
        for link in self.link_extractor['page'].extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_page)

    def parse_page(self, response):
        for link in self.link_extractor['page_down'].extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_page)

        for link in self.link_extractor['content'].extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_content)

    def parse_content(self, response):
        sel = scrapy.Selector(response)
        items = []
        item = TestspiderItem()
        item['url'] = str(response.url)
        forum = sel.xpath(self._x_query['forum']).extract()
        poster = sel.xpath(self._x_query['poster']).extract()
        page_content = sel.xpath(self._x_query['page_content']).extract()
        item['forum'] = forum
        item['poster'] = poster
        item['content'] = page_content
        items.append(item)
        return items

In pipelines.py:

from scrapy import signals
from scrapy.exporters import XmlItemExporter  # scrapy.contrib.exporter is deprecated in newer Scrapy

# class TestspiderPipeline(object):
#     def process_item(self, item, spider):
#         return item
class XmlWritePipeline(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # open the output file and start the XML exporter when the spider starts
        self.file = open('bbsData.xml', 'wb')
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # process the crawled data, define and call dataProcess function
        # dataProcess('bbsData.xml', 'text.txt')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

In settings.py, make sure to set:

ITEM_PIPELINES = {
   'TestSpider.pipelines.XmlWritePipeline': 300,
}

Other settings, for example:

BOT_NAME = 'TestSpider'

SPIDER_MODULES = ['TestSpider.spiders']
NEWSPIDER_MODULE = 'TestSpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'TestSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
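
With the pipeline enabled, the spider can be run from the project root; the XmlWritePipeline above writes the results to bbsData.xml:

scrapy crawl bbsSpider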

With that, a crawler demo for static pages is done. Next is an introduction to using Selenium.

You can refer to the YouTube link (a VPN is needed to access it).
You can also install the Selenium IDE add-on in Firefox first; it records your browser actions, which makes it easy to look at some examples.
A very simple demo:

from selenium import webdriver

base_url = "https://www.baidu.com/"
driver = webdriver.Chrome()
driver.implicitly_wait(3)
driver.get(base_url)

driver.find_element_by_id("kw").click()
driver.find_element_by_id("kw").send_keys("123")
driver.find_element_by_id("su").click()

Note that chromedriver.exe is required and must be placed in the Scripts folder. I originally used Firefox, which did not need a separate driver executable, but reportedly due to a version issue it could not be driven properly.
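
For reference, recent Firefox versions do need a separate driver (geckodriver); a minimal sketch, assuming geckodriver.exe has been downloaded and placed in the same Scripts folder (or anywhere on the PATH):

from selenium import webdriver

# requires geckodriver.exe on the PATH for recent Firefox versions
driver = webdriver.Firefox()
driver.get("https://www.baidu.com/")
driver.quit()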

————————————

A demo that uses Selenium to log in to Zhihu and print some titles from the front page:

It mainly uses Selenium for the login step, passes the resulting cookies to Scrapy, and then parses the page with Scrapy's built-in XPath support. Strictly speaking, the Selenium login should probably live in a downloader middleware (see the sketch after the demo), but for now the goal is just to get the feature working.

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver



class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    start_urls = (
        'https://www.zhihu.com/',
    )



    def get_cookies(self):
        driver = webdriver.Chrome()
        driver.get(self.start_urls[0])
        driver.find_element_by_link_text(u"登录").click()
        driver.find_element_by_name("account").clear()
        driver.find_element_by_name("account").send_keys("your username")  # replace with your own username
        driver.find_element_by_name("password").clear()
        driver.find_element_by_name("password").send_keys("keys")  # replace with your own password
        sign_in_url = u"https://www.zhihu.com/#signin"
        try:
            # if a captcha is present, wait until the login is completed manually
            # (i.e. until the browser leaves the sign-in page)
            if driver.find_element_by_id('captcha'):
                while sign_in_url == driver.current_url:
                    pass
        finally:
            # no captcha (or login already done): submit the form if still on the sign-in page,
            # then hand the session cookies back to Scrapy; the return here also swallows the
            # NoSuchElementException raised when no captcha element exists
            if sign_in_url == driver.current_url:
                driver.find_element_by_css_selector("button.sign-button.submit").click()
            cookies = driver.get_cookies()
            driver.close()
            print(cookies)
            return cookies

    def after_login(self, response):
        sel = scrapy.Selector(response)
        # print(response.body)
        for i in range(1, 10):
            xpath = r'//*[@id="feed-%d"]/div[1]/div[2]/div[2]/h2/a/text()' % i
            title = sel.xpath(xpath).extract()
            if len(title):
                print(title[0])

    def parse(self, response):
        return scrapy.Request(url=self.start_urls[0], cookies=self.get_cookies(), callback=self.after_login)
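
As noted above, a cleaner home for the Selenium login would be a downloader middleware, so that every request automatically carries the login cookies. The following is only a rough sketch of that idea; the class name SeleniumLoginMiddleware, the reuse of the spider's get_cookies method, and the settings entry are assumptions, not part of the original demo:

# middlewares.py (sketch)
class SeleniumLoginMiddleware(object):
    """Attach cookies obtained from a one-off Selenium login to every request."""

    def __init__(self):
        self.cookies = None

    def process_request(self, request, spider):
        if self.cookies is None and hasattr(spider, 'get_cookies'):
            # reuse the spider's Selenium login to fetch the cookies once
            self.cookies = {c['name']: c['value'] for c in spider.get_cookies()}
        if self.cookies:
            request.cookies = self.cookies
        return None

# settings.py (sketch)
# DOWNLOADER_MIDDLEWARES = {
#     'TestSpider.middlewares.SeleniumLoginMiddleware': 543,
# }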

Because of Zhihu's anti-crawler measures, a User-Agent is required; add the following to settings.py:

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'

After that, you can add a user-agent pool (see the sketch below).
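
A minimal sketch of such a pool, done as a downloader middleware that picks a random User-Agent per request; the USER_AGENT_POOL entries and the class name are illustrative assumptions:

# middlewares.py (sketch)
import random

USER_AGENT_POOL = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a random User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENT_POOL)

# settings.py (sketch)
# DOWNLOADER_MIDDLEWARES = {
#     'TestSpider.middlewares.RandomUserAgentMiddleware': 400,
# }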
