Tools
Downloader: urllib3
HTML parsers: html.parser, lxml
HTML parsing library: beautifulsoup4
Crawler framework: scrapy
urllib3 + beautifulsoup4
import urllib3
import bs4

if __name__ == '__main__':  # only runs when this module is the entry point
    http = urllib3.PoolManager(num_pools=5, headers={})
    response = http.request('GET', 'http://www.baidu.com')  # send the request
    print(response.data.decode('utf-8'))  # read the body (the page is UTF-8; unicode_escape would garble it)
    soup = bs4.BeautifulSoup(response.data, 'html.parser', from_encoding='utf-8')
    node = soup.find('a')  # find the first matching node
    print(node.name)  # read the node (its tag name)
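A short follow-up sketch: the same soup object can return every matching node, not just the first; find_all and get are standard BeautifulSoup calls.

    for a in soup.find_all('a'):  # iterate over every <a> tag
        print(a.get_text(strip=True), a.get('href'))  # link text and target URL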
scrapy
Install and set up
pip install scrapy  # make sure the Scripts directory is on Path
scrapy startproject douban_provider  # create the project
scrapy genspider douban_provider movie.douban.com  # run inside the spiders directory; generates the spider file with the given start domain
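For orientation, genspider produces a skeleton roughly like this (a sketch of Scrapy's default template; the exact boilerplate varies by version):

    import scrapy

    class DoubanProviderSpider(scrapy.Spider):
        name = 'douban_provider'
        allowed_domains = ['movie.douban.com']
        start_urls = ['http://movie.douban.com/']

        def parse(self, response):
            pass  # parsing logic goes here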
Run
scrapy crawl douban_provider  # run the spider
scrapy crawl douban_provider -o test.json  # run and export to JSON
scrapy crawl douban_provider -o test.csv  # run and export to CSV
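Exported JSON escapes non-ASCII characters by default, which makes Chinese text unreadable; FEED_EXPORT_ENCODING (a standard Scrapy setting) fixes that:

    # settings.py
    FEED_EXPORT_ENCODING = 'utf-8'  # write UTF-8 instead of \uXXXX escapes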
main.py
from scrapy import cmdline
cmdline.execute('scrapy crawl douban_provider'.split())  # same as typing the command in a shell
Running main.py is equivalent to scrapy crawl douban_provider (handy for launching from an IDE).
Directory structure
scrapy.cfg  # project configuration
settings.py  # crawler settings: USER_AGENT, SPIDER_MIDDLEWARES, ITEM_PIPELINES, etc.
items.py  # target data items and their field definitions
pipelines.py  # item pipelines: clean, validate, and persist the scraped items
middlewares.py  # middleware: random IP proxies, random user_agent, etc.
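A minimal settings.py sketch wiring these pieces together; the class paths are hypothetical placeholders for whatever the project actually defines:

    # settings.py (sketch; class paths are hypothetical)
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    DOWNLOADER_MIDDLEWARES = {
        'stock.middlewares.RandomUserAgentMiddleware': 543,
    }
    ITEM_PIPELINES = {
        'stock.pipelines.StockPipeline': 300,
    }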
items.py
import scrapy

class StockItem(scrapy.Item):
    stock_id = scrapy.Field()  # declare a target field
    stock_name = scrapy.Field()
    establish = scrapy.Field()
    publish = scrapy.Field()
spider.py
import re

import scrapy
from stock.items import StockItem

class StockSpider(scrapy.Spider):
    name = 'stock_spider'
    allowed_domains = ['quote.eastmoney.com', 'www.qcc.com']
    start_urls = ['http://quote.eastmoney.com/stock_list.html']

    def parse(self, response):
        # print(response.text)  # response is a scrapy HtmlResponse; its selectors are backed by lxml
        stock_list = response.xpath("//div[@class='quotebody']//li")  # query with an XPath expression
        for item in stock_list:
            # EXSLT regex predicate: keep only codes starting with 000/600/601/603
            stock = item.xpath(
                r".//a[re:test(text(), '(\(000)|(\(600)|(\(601)|(\(603)')]/text()").extract_first()
            if stock:
                result = re.search(r'(.*)\((.*)\)', stock)  # split "name(code)" with a regular expression
                stock_item = StockItem()
                stock_item['stock_name'] = result.group(1)
                stock_item['stock_id'] = result.group(2)
                yield scrapy.Request('https://www.qcc.com/search?key=' + stock_item['stock_name'],
                                     meta={'stock_item': stock_item},
                                     callback=self.parse_result)  # follow-up request, passing the item along

    def parse_result(self, response):
        stock_item = response.meta['stock_item']  # item passed from the previous step
        company_href = response.xpath("//table[@class='m_srchList']//tr//a/@href").extract_first()
        if company_href:
            yield scrapy.Request('https://www.qcc.com' + company_href,
                                 meta={'stock_item': stock_item},
                                 callback=self.parse_company)
        else:
            yield stock_item  # hand the item off to the pipeline

    def parse_company(self, response):
        stock_item = response.meta['stock_item']
        publish_info = response.xpath("//section[@id='ipoPublish']")
        if publish_info:
            stock_item['establish'] = publish_info.xpath(".//tr[1]//td[2]/text()").extract_first()
            stock_item['publish'] = publish_info.xpath(".//tr[1]//td[4]/text()").extract_first()
        yield stock_item
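The yielded items land in the pipeline. A minimal pipelines.py sketch; the class name and the JSON-lines persistence are assumptions, since Scrapy only requires a process_item method:

    import json

    class StockPipeline:  # hypothetical name; enable it via ITEM_PIPELINES
        def open_spider(self, spider):
            self.file = open('stocks.jl', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item  # pass the item on to any later pipeline

        def close_spider(self, spider):
            self.file.close()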
Crawler countermeasures
1. IP blocking --- rotate IPs, e.g. through an Abuyun proxy (HTTP tunnel); see the middleware sketch below
2. user_agent blocking --- rotate the user_agent; also sketched below
3. Captchas
4. Session control
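A minimal downloader-middleware sketch covering points 1 and 2; the proxy address, credentials, and user_agent strings are placeholder assumptions, not values from these notes:

    # middlewares.py (sketch)
    import base64
    import random

    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    ]

    class RandomUserAgentMiddleware:
        def process_request(self, request, spider):
            request.headers['User-Agent'] = random.choice(USER_AGENTS)

    class ProxyTunnelMiddleware:
        # hypothetical endpoint and credentials; real values come from the proxy provider's console
        proxy_server = 'http://http-dyn.abuyun.com:9020'
        proxy_auth = base64.b64encode(b'user:password').decode()

        def process_request(self, request, spider):
            request.meta['proxy'] = self.proxy_server
            request.headers['Proxy-Authorization'] = 'Basic ' + self.proxy_auth

Both classes still need entries in DOWNLOADER_MIDDLEWARES in settings.py to take effect.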