- Crawling Toutiao (今日头条) news data with the Scrapy framework. The main features implemented are:
- Storing the data in MongoDB
- Downloading images
- Randomly switching the User-Agent
- Hooking up an IP proxy pool
- Sending a notification email
1. First, press F12 to open the developer tools, as shown in the figure:
- Since Toutiao loads its data dynamically with JS, we need to find the API endpoint that loads the data and analyze it; the data API URL is shown in the figure above.
- First, look at the Headers:
Among the Query String Parameters there are two parameters, as and cp. They are one of Toutiao's anti-scraping measures, and we need an algorithm (found online) to generate them.
- Next, look at the Response, i.e. the data that comes back:
We can copy the data into an online JSON formatter to make it easier to analyze.
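The screenshots are not reproduced here, but judging from the fields the spider reads below, the response has roughly the following shape (an illustrative sketch with placeholder values, not a verbatim capture):

# Simplified shape of one feed response (values are placeholders)
{
    "data": [
        {
            "abstract": "...",
            "chinese_tag": "...",                       # category tag
            "title": "...",
            "source": "...",
            "image_list": [{"url": "//<image-host>/<path>"}],  # protocol-relative URLs
        },
        # ... more news entries ...
    ],
    "next": {"max_behot_time": 1575440000},             # feed this value into the next request
}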
- As the data above shows, the response contains the full information for each news item. At the very end there is also a max_behot_time value; by feeding it back into the next request we can crawl the news feed in a loop.
- The spider code is as follows:
# -*- coding: utf-8 -*-
import scrapy
import json
import time
import hashlib
import random
from datetime import datetime

from ..emailsend import EmailSend
from toutiao_two.items import ToutiaoTwoItem


class ToutiaoSpiderSpider(scrapy.Spider):
    name = 'toutiao_spider'
    allowed_domains = ['www.toutiao.com']
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Host': 'www.toutiao.com',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
    cookies = {'tt_webid': '6722356446824613389'}
    start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='
    max_behot_time = '0'
    D = {'hot_time': '0'}

    def get_as_cp(self):
        # Generate the as and cp parameters. The algorithm is adapted from
        # Toutiao's obfuscated JS file home_4abea46.js.
        now = round(time.time())            # current timestamp in seconds
        e = hex(int(now)).upper()[2:]       # timestamp as an uppercase hex string
        a = hashlib.md5()                   # md5 of the timestamp, uppercase hex digest
        a.update(str(int(now)).encode('utf-8'))
        i = a.hexdigest().upper()
        if len(e) != 8:
            return {'as': '479BB4B7254C150',
                    'cp': '7E0AC8874BB0985'}
        n = i[:5]
        a = i[-5:]
        r = ''
        s = ''
        for i in range(5):
            s = s + n[i] + e[i]
        for j in range(5):
            r = r + e[j + 3] + a[j]
        zz = {
            'as': 'A1' + s + e[-3:],
            'cp': e[0:3] + r + 'E1'
        }
        return zz

    def start_requests(self):
        global start_time
        start_time = datetime.now()
        ascp = self.get_as_cp()
        yield scrapy.FormRequest(
            url=self.start_url + self.max_behot_time + '&max_behot_time_tmp=' + self.max_behot_time
                + '&tadrequire=true&as=' + ascp['as'] + '&cp=' + ascp['cp'],
            method='GET',
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse,
        )

    def parse(self, response):
        json_result = json.loads(response.text)
        item = ToutiaoTwoItem()
        infos = json_result['data']
        for info in infos:
            image_url_list = []
            item['abstract'] = info['abstract'] if info.get('abstract') else ''
            item['chinese_tag'] = info['chinese_tag'] if info.get('chinese_tag') else ''
            item['title'] = info['title'] if info.get('title') else ''
            item['source'] = info['source'] if info.get('source') else ''
            image_urls = info['image_list'] if info.get('image_list') else ''
            for image_url in image_urls:
                url = 'https:' + image_url['url']
                image_url_list.append(url)
            item['image_url'] = image_url_list
            yield item
        time.sleep(random.randint(1, 4))  # random delay between pages
        # Follow the next page by feeding max_behot_time from the response back into the URL.
        if json_result.get('next'):
            next_info = json_result['next']
            if next_info.get('max_behot_time'):
                max_behot_time = str(next_info['max_behot_time'])
                self.D.update({'hot_time': max_behot_time})
                ascp = self.get_as_cp()
                yield scrapy.FormRequest(
                    url=self.start_url + max_behot_time + '&max_behot_time_tmp=' + max_behot_time
                        + '&tadrequire=true&as=' + str(ascp['as']) + '&cp=' + str(ascp['cp']),
                    method='GET',
                    headers=self.headers,
                    cookies=self.cookies,
                    callback=self.parse,
                )

    def closed(self, reason):
        # Called automatically when the spider closes: report run time and close reason by email.
        email = EmailSend()
        use_time = datetime.now() - start_time
        close_time = 'toutiao spider started at {}; finished at {}; total time: {}'.format(
            start_time, datetime.now(), use_time)
        content = 'Spider close reason: {}'.format(reason)
        email.send_text_email('sender-email@qq.com', 'recipient-email@qq.com', close_time, content)
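The post does not show items.py, but based on the fields assigned in parse() above, a minimal item definition consistent with the spider would look something like this (a sketch, not the original file):

# toutiao_two/items.py -- fields match what the spider assigns
import scrapy


class ToutiaoTwoItem(scrapy.Item):
    abstract = scrapy.Field()
    chinese_tag = scrapy.Field()
    title = scrapy.Field()
    source = scrapy.Field()
    image_url = scrapy.Field()

With everything in place, the crawl is started as usual with scrapy crawl toutiao_spider (the name defined on the spider).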
Hook up the IP proxy pool by adding the following code to middlewares.py and configuring it in settings:
# Hook up the IP proxy pool
import logging

import requests


class ProxyMiddleware(object):
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # Ask the proxy pool service for one random proxy ("ip:port").
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                return response.text
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # Only switch to a proxy when the request is being retried.
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )
- settings
# Endpoint that hands out a usable proxy IP
PROXY_URL = 'http://localhost:5555/random'
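PROXY_URL assumes a proxy pool service running locally that returns a single ip:port string per request; the middleware simply prefixes it with https://. A quick way to sanity-check that assumption outside of Scrapy:

import requests

# Expected output: a bare "ip:port" string; anything else means the proxy pool
# service is not running or returns a different format.
print(requests.get('http://localhost:5555/random').text)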
Set up random User-Agent switching, also in middlewares.py, and configure it in settings:
# Randomly switch the User-Agent
import random


class RandomUserAgent(object):
    """Randomly rotate user agents based on a list of predefined ones."""

    def __init__(self, agents):
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', random.choice(self.agents))
- settings
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
DOWNLOADER_MIDDLEWARES = {
'toutiao_two.middlewares.RandomUserAgent': 543,
'toutiao_two.middlewares.ProxyMiddleware': 550,
}
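One caveat: Scrapy's built-in UserAgentMiddleware runs at priority 400, i.e. before RandomUserAgent at 543, and it already fills in the User-Agent header, so the setdefault() call in RandomUserAgent may never take effect (the hard-coded User-Agent in the spider's headers has the same effect). If the rotation does not seem to work, a common companion setting is to disable the built-in middleware:

DOWNLOADER_MIDDLEWARES = {
    'toutiao_two.middlewares.RandomUserAgent': 543,
    'toutiao_two.middlewares.ProxyMiddleware': 550,
    # Disable the default middleware so the random User-Agent is not pre-empted.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}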
Implement image downloading in pipelines.py:
import copy

import pymongo
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


# Download images
class ToutaioImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name each file after the last segment of its URL.
        url = request.url
        file_name = url.split('/')[-1] + '.jpg'
        return file_name

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed')
        return item

    def get_media_requests(self, item, info):
        for image_url in item['image_url']:
            yield Request(image_url)
- Configure the image storage path in settings:
IMAGES_STORE = './images'
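Note that Scrapy's ImagesPipeline relies on Pillow for image processing, so Pillow needs to be installed (pip install Pillow) for the downloads above to work.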
- Data storage in MongoDB is implemented in pipelines.py as well:
# MongoDB storage
class ToutiaoTwoMongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        """
        Problem: a large number of duplicate records appeared in the database.
        Cause / fix: the spider reuses a single item instance, so by the time the
        pipeline writes a record the item may already have been overwritten.
        Making a deep copy of the item before inserting, and operating only on the
        copy, ensures the values are not modified underneath us. The fix only
        touches process_item(): copy.deepcopy(item) (hence the copy import).
        """
        asynItem = copy.deepcopy(item)
        infos = {'chinese_tag': asynItem['chinese_tag'], 'title': asynItem['title'],
                 'source': asynItem['source'], 'image_url': asynItem['image_url']}
        self.db.toutiao.insert_one(infos)  # insert_one replaces the deprecated Collection.insert
        return item
- Configure the pipelines and the database connection in settings:
ITEM_PIPELINES = {
'toutiao_two.pipelines.ToutiaoTwoMongoPipeline': 300,
'toutiao_two.pipelines.ToutaioImagePipeline': 300,
}
MONGO_URI = 'localhost'
MONGO_DB = 'scrapy_toutiao'
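To confirm that records are actually landing in MongoDB, a quick check with pymongo (same connection details as the settings above, and the toutiao collection the pipeline writes to) might look like:

import pymongo

# Connect to the same database the pipeline writes to and inspect what was stored.
client = pymongo.MongoClient('localhost')
db = client['scrapy_toutiao']
print(db.toutiao.count_documents({}))   # number of stored news records
print(db.toutiao.find_one())            # one sample document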
Implement email sending by creating an emailsend.py file in the same directory as settings.py, with the following content:
# -*- coding: utf-8 -*-
'''
---------------
Description of this file
:author: Luopeng
:date created: 2019-12-04
:python version: 3.6
---------------
'''
import smtplib
from email.mime.text import MIMEText
import logging


class EmailSend(object):
    def __init__(self):
        self.logging = logging.getLogger('Warning')
        self.email_host = 'smtp.qq.com'
        self.email_port = 465
        self.email_pass = '*********'  # your own SMTP authorization code

    def send_text_email(self, from_addr, to_addrs, subject, content):
        message_text = MIMEText(content, 'plain', 'utf8')
        message_text['From'] = from_addr
        message_text['To'] = to_addrs
        message_text['Subject'] = subject
        try:
            # Connect to the mail server when the client object is created.
            client = smtplib.SMTP_SSL(host=self.email_host, port=self.email_port)
            login_result = client.login(from_addr, self.email_pass)
            if login_result and login_result[0] == 235:
                print('Login succeeded')
                client.sendmail(from_addr, to_addrs, message_text.as_string())
                print('Email sent successfully')
            else:
                print('Email sending failed:', login_result[0], login_result[1])
        except Exception as e:
            self.logging.error('Error connecting to the mail server: {}'.format(e))

    def send_image_email(self):
        pass

    def send_word_email(self):
        pass

    def send_video_email(self):
        pass
- For how EmailSend is called, see the closed() method in the spider code above.
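For a quick standalone test outside the spider (assuming the authorization code above has been filled in; the addresses are placeholders):

from emailsend import EmailSend

# Send a simple plain-text test message.
email = EmailSend()
email.send_text_email('sender-email@qq.com', 'recipient-email@qq.com', 'test subject', 'test body')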