Links to every part of this project:
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 1. Project Introduction
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 2. Scrapy Configuration
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 3. Scraping with Scrapy
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 4. Distributed Spider Deployment with Gerapy
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 5. Data Cleaning and Processing
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 6. Django News Display
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 7. Django Admin Backend Configuration
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 8. Django Admin Backend Management Modules
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 9. Django Frontend Web Deployment
Python Information System (Distributed Scrapy + Django Frontend/Backend) - 10. Django Frontend HTML Features
As shown in the previous article, this project's crawler is designed to scrape news content from the major news sites, so the target sites need to be managed systematically to ease later maintenance. The rough workflow is as follows:
scrapy startproject <project_name>
scrapy genspider <spider_name> "<target_domain>"
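For instance (the project, spider, and domain names below are illustrative, not the project's actual ones):

scrapy startproject news_scrapy
cd news_scrapy
scrapy genspider news_spider "news.example.com"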
# items.py - add the fields you want to scrape, per your own needs
import scrapy

class NewsItem(scrapy.Item):  # class name is a placeholder; keep the one Scrapy generated
    title = scrapy.Field()        # news title
    url = scrapy.Field()          # news link
    intro = scrapy.Field()        # news summary
    thumbImg = scrapy.Field()     # news cover image
    author = scrapy.Field()       # news author
    publishTime = scrapy.Field()  # publish time shown on the news page
    content = scrapy.Field()      # news body
    typeId = scrapy.Field()       # news category ID
    typeName = scrapy.Field()     # news category name
    py_name = scrapy.Field()      # script name (handy for debugging)
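For reference, a minimal sketch of how a spider callback might fill these fields (the NewsItem name and the CSS selectors are assumptions; every target site differs):

def parse(self, response):
    item = NewsItem()
    item['title'] = response.css('h1::text').get()  # adjust the selector per site
    item['url'] = response.url
    item['intro'] = response.css('meta[name=description]::attr(content)').get()
    item['py_name'] = self.name                     # record which spider produced it
    yield item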
# Adjust the robots.txt setting (Scrapy's default is True; set it to False here
# so the crawler is not blocked by the target sites' robots.txt rules)
ROBOTSTXT_OBEY = False
# Enable (uncomment) the ITEM_PIPELINES dict
ITEM_PIPELINES = {
    '<your_project>.pipelines.<YourProjectPipeline>': 300,
}
# Add the MongoDB storage settings
MONGODB_HOST = "<your_host>"
MONGODB_PORT = 27017  # must be an int: pymongo rejects a string port
MONGODB_DBNAME = "<your_database>"
MONGODB_SHEETNAME = "<your_news_collection>"
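Before running the spider, a quick standalone check that these MongoDB settings actually connect can save time (host and port are the placeholders above):

import pymongo
client = pymongo.MongoClient(host="<your_host>", port=27017)
print(client.server_info()["version"])  # raises ServerSelectionTimeoutError if unreachable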
# Add a pool of browser User-Agent headers
USER_AGENT_LIST = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Add DOWNLOADER_MIDDLEWARES (replace <your_project> with your project name)
DOWNLOADER_MIDDLEWARES = {
    # the old 'scrapy.contrib...' path was removed; this is the current one
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    '<your_project>.middlewares.RotateUserAgentMiddleware': 400,  # User-Agent rotation priority
    # '<your_project>.middlewares.MyProxyMiddleware': 300,  # uncomment if you have an IP proxy pool
}
# Optional: JS rendering via scrapy-splash (some pages need it)
SPLASH_URL = 'http://localhost:8050'  # Splash server address
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'  # Splash-aware dedup filter
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,  # supports cache_args (optional)
}
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
# Note: scrapy-splash's README also calls for its downloader middlewares:
#   'scrapy_splash.SplashCookiesMiddleware': 723,
#   'scrapy_splash.SplashMiddleware': 725,
#   'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
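With the settings above in place, a spider requests JS-rendered pages through SplashRequest; a minimal sketch (URL and wait time are illustrative):

from scrapy_splash import SplashRequest

def start_requests(self):
    # render the page in Splash and wait 2s for JS to finish
    yield SplashRequest('https://news.example.com', self.parse, args={'wait': 2})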
# middlewares.py - add the User-Agent rotation and IP pool classes
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

class RotateUserAgentMiddleware(UserAgentMiddleware):
    def process_request(self, request, spider):
        # pick a random User-Agent from the pool defined in settings.py
        user_agent = random.choice(settings.get('USER_AGENT_LIST'))
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)
            print(f"User-Agent: {user_agent}")
# Optional proxy middleware, in the same middlewares.py
from .my_proxies import PROXY

PROXY_list = PROXY()

class MyProxyMiddleware(object):  # spelled "MyProxyMidleware" in the original; fixed here
    def process_request(self, request, spider):
        # route each request through a random proxy from the pool
        request.meta['proxy'] = random.choice(PROXY_list)
# my_proxies.py - list your purchased proxy IPs here
def PROXY():
    PROXY_list = [
        "http://111.72.25.213:9999",
        "http://60.167.102.236:9999",
        "http://218.21.230.156:808",
    ]
    return PROXY_list
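A stale proxy fails silently, so it can be worth checking an entry before trusting it; a rough sketch using the requests library (not otherwise part of this project):

import requests
proxy = "http://111.72.25.213:9999"
resp = requests.get("https://httpbin.org/ip",
                    proxies={"http": proxy, "https": proxy}, timeout=5)
print(resp.json())  # should report the proxy's IP, not yours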
# pipelines.py - copy this file in wholesale
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

class NewsPipeline(object):  # keep the pipeline class name Scrapy generated for you
    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        # create the MongoDB connection
        client = pymongo.MongoClient(host=host, port=port)
        # select the database
        mydb = client[dbname]
        # collection that the news items are written to
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        # pymongo removed insert(); insert_one() is the current API
        self.post.insert_one(data)
        return item
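After a crawl, the stored items can be verified from a Python shell (names are the placeholders from settings.py):

import pymongo
db = pymongo.MongoClient("<your_host>", 27017)["<your_database>"]
print(db["<your_news_collection>"].count_documents({}))  # number of stored news items
print(db["<your_news_collection>"].find_one())           # peek at one document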