Python Crawlers

Tools

Downloader: urllib3
HTML parsers: html.parser, lxml
HTML parsing library: beautifulsoup4
Crawler framework: scrapy
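
All of them install from PyPI; lxml is optional here but gives BeautifulSoup a faster parser backend:

pip install urllib3 beautifulsoup4 lxml scrapy # package names as published on PyPI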

urllib3 + beautifulsoup4

import urllib3
import bs4

if __name__ == '__main__':   # run only when this module is the entry point
    http = urllib3.PoolManager(num_pools=5, headers={})
    response = http.request('GET', 'http://www.baidu.com')    # send the request
    print(response.data.decode('utf-8'))                      # read the body (the page is UTF-8)

    soup = bs4.BeautifulSoup(response.data, 'html.parser', from_encoding='utf-8')
    node = soup.find('a')    # find the first <a> node
    print(node.name)         # read the node's tag name
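
A common follow-up is collecting every link on the page; a minimal sketch reusing the soup object above (the href filter is just illustrative):

for link in soup.find_all('a'):            # iterate over all <a> tags
    href = link.get('href')                # None when the tag has no href attribute
    if href:
        print(link.get_text(strip=True), href)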

scrapy

Install and set up
pip install scrapy # make sure the scrapy executable is on Path
scrapy startproject douban # create the project (Scrapy rejects a spider named exactly like its project)
scrapy genspider douban_provider movie.douban.com # run inside the spiders directory; generates a spider file with the given start domain
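
genspider writes a skeleton roughly like the one below (the exact template varies by Scrapy version):

import scrapy

class DoubanProviderSpider(scrapy.Spider):
    name = 'douban_provider'                    # the name used by scrapy crawl
    allowed_domains = ['movie.douban.com']      # requests outside this domain are filtered
    start_urls = ['http://movie.douban.com/']   # initial requests fed to the scheduler

    def parse(self, response):
        pass                                    # parsing logic goes here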

Run
scrapy crawl douban_provider # run the spider
scrapy crawl douban_provider -o test.json # run and export items to JSON
scrapy crawl douban_provider -o test.csv # run and export items to CSV
main.py

from scrapy import cmdline

cmdline.execute('scrapy crawl douban_provider'.split())

Running main.py is equivalent to scrapy crawl douban_provider, which makes it possible to start the spider from an IDE or debugger.
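
An alternative that skips the shell entirely is Scrapy's CrawlerProcess API; a minimal sketch, assuming it runs from the project root so settings.py is picked up:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # load the project's settings.py
process.crawl('douban_provider')                   # spider name, as with scrapy crawl
process.start()                                    # blocks until the crawl finishes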

Directory structure
scrapy.cfg # project configuration
settings.py # crawler settings: USER_AGENT, SPIDER_MIDDLEWARES, ITEM_PIPELINES, etc.
items.py # target data items and their field definitions
pipelines.py # item pipelines that post-process scraped items (clean, validate, persist); see the sketch after this list
middlewares.py # middlewares, e.g. random proxy IPs, random user_agent values
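
A minimal pipelines.py sketch (the missing-field check is illustrative, not part of the original project):

from scrapy.exceptions import DropItem

class StockPipeline:
    def process_item(self, item, spider):
        if not item.get('stock_id'):           # discard incomplete items
            raise DropItem('missing stock_id')
        return item                            # hand the item to the next pipeline

It only runs once enabled in settings.py, e.g. ITEM_PIPELINES = {'stock.pipelines.StockPipeline': 300} (lower numbers run earlier).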

items.py

import scrapy

class StockItem(scrapy.Item):
    stock_id = scrapy.Field()             # declare the target fields
    stock_name = scrapy.Field()
    establish = scrapy.Field()
    publish = scrapy.Field()

spider.py

import scrapy
from stock.items import StockItem
import re

class StockSpider(scrapy.Spider):
    name = 'stock_spider'
    allowed_domains = ['quote.eastmoney.com', 'www.qcc.com']
    start_urls = ['http://quote.eastmoney.com/stock_list.html']

    def parse(self, response):
        # print(response.text)    # response is a scrapy HtmlResponse, parsed with lxml under the hood
        stock_list = response.xpath("//div[@class='quotebody']//li")   # query with an XPath expression
        for item in stock_list:
            # EXSLT re:match lets the XPath predicate filter link text by regex; keep the pattern in a raw string
            stock = item.xpath(r".//a[re:match(text(),'(\(000)|(\(600)|(\(601)|(\(603)')]/text()").extract_first()
            if stock:
                result = re.search(r'(.*)\((.*)\)', stock)     # split "name(code)" with a regex
                stock_item = StockItem()
                stock_item['stock_name'] = result.group(1)
                stock_item['stock_id'] = result.group(2)
                yield scrapy.Request('https://www.qcc.com/search?key=' + stock_item['stock_name'],
                                     meta={'stock_item': stock_item}, callback=self.parse_result)  # follow up, passing the item along via meta

    def parse_result(self, response):
        stock_item = response.meta['stock_item']    # retrieve the item passed from the previous step
        company_href = response.xpath("//table[@class='m_srchList']//tr//a/@href").extract_first()
        if company_href:
            yield scrapy.Request('https://www.qcc.com' + company_href, meta={'stock_item': stock_item},
                                 callback=self.parse_company)
        else:
            yield stock_item    # hand the item to the pipeline

    def parse_company(self, response):
        stock_item = response.meta['stock_item']
        publish_info = response.xpath("//section[@id='ipoPublish']")
        if publish_info:
            stock_item['establish'] = publish_info.xpath(".//tr[1]//td[2]/text()").extract_first()
            stock_item['publish'] = publish_info.xpath(".//tr[1]//td[4]/text()").extract_first()
        yield stock_item    # yield the item whether or not the IPO section was found
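
meta works, but Scrapy also uses it internally; since Scrapy 1.7 the documented way to pass values between callbacks is cb_kwargs. A minimal sketch of the same hand-off (the placeholder item and search URL are illustrative):

import scrapy

class StockCbSpider(scrapy.Spider):
    name = 'stock_cb_spider'
    start_urls = ['http://quote.eastmoney.com/stock_list.html']

    def parse(self, response):
        stock_item = {'stock_name': 'demo'}                   # placeholder item
        yield scrapy.Request('https://www.qcc.com/search?key=demo',
                             cb_kwargs={'stock_item': stock_item},  # passed as keyword arguments
                             callback=self.parse_result)

    def parse_result(self, response, stock_item):             # arrives directly, no response.meta
        yield stock_item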

Anti-crawling and countermeasures

1. IP blocking --- rotate IPs, e.g. via the Abuyun (阿布云) proxy service (HTTP tunnel)
2. user_agent blocking --- randomize the user_agent; a middleware sketch follows this list
3. CAPTCHAs
4. Session control
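
For point 2, a minimal downloader middleware sketch (the USER_AGENT_LIST pool is hypothetical; in practice load it from settings or a file), enabled through DOWNLOADER_MIDDLEWARES in settings.py:

import random

USER_AGENT_LIST = [    # hypothetical pool of browser signatures
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
]

class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # overwrite the User-Agent header before the request is downloaded
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)
        return None    # None means: continue normal downloader processing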
