【Scrapy 五分钟撸网站】挑战全网爬虫的爬虫,全部文章目录索引
全部内容采用 Scrapy 框架,每篇文章都附带对应网站的全套数据抓取教程和经验指导。只要我还有力气,每周都会更新爬虫,粉丝也可以留言定制各种网站的爬虫脚本。
中国煤炭新闻网 是煤,煤炭,煤矿,煤炭价格,煤炭市场,煤炭运输,中国煤炭新闻网,煤炭新闻,煤炭网,二手设备,煤炭供求,煤矿机电设备,煤矿新闻,煤矿人才,煤业技术,校友录,煤价旬报,技术…
1. 不了解5分钟快速抓网站思路的小伙伴先看
【Scrapy 五分钟撸网站】全站数据必备基础知识
2. 不了解数据抓取业务管理整理的小伙伴先看
【Scrapy 五分钟撸网站】爬虫目标整理和数据准备
3. 不了解Scrapy模板量产的小伙伴先看(必看)
【Scrapy 五分钟撸网站】数据抓取项目框架通用模板
1. 创建spider项目
scrapy genspider www_cwestc_com " "
2. 整理全站css样式
先来看下页面的CSS样式,全站统一两种样式。
3. 修改www_cwestc_com.py的内容
这里将需要修改的地方进行说明,其他地方参考模板,不需修改。
# Spider configuration for www.cwestc.com and its regional sub-sites.
#
# allowed_domains is left empty. NOTE(review): per the Scrapy docs an empty
# list disables offsite filtering, which lets one spider follow the several
# *.cwestc.com hosts listed below — confirm this is intended.
allowed_domains = []
web_name = "中国煤炭新闻网"

# start_menu is a list of groups; each group is a list of channels and each
# channel is a dict with a display name ("channel_name") and the URL of its
# list page ("url"). Groups are ordered: main site first, then the four
# regional channels.
#
# The "局矿快报" entries previously carried a trailing space in channel_name,
# which made the label inconsistent with every other channel; the stray
# whitespace has been removed.
start_menu = [
    # Main site
    [
        {"channel_name": "煤炭新闻", "url": "http://www.cwestc.com/MroeNews.aspx"},
        {"channel_name": "政策法规-2008年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=17&id=4"},
        {"channel_name": "政策法规-2007年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=18&id=4"},
        {"channel_name": "政策法规-2006年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=19&id=4"},
        {"channel_name": "政策法规-2005年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=20&id=4"},
        {"channel_name": "政策法规-2004年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=21&id=4"},
        {"channel_name": "政策法规-2003年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=22&id=4"},
        {"channel_name": "政策法规-2002年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=23&id=4"},
        {"channel_name": "政策法规-2001年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=24&id=4"},
        {"channel_name": "政策法规-2000年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=25&id=4"},
        {"channel_name": "政策法规-98-99年政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=26&id=4"},
        {"channel_name": "政策法规-97年前的政策法规", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=27&id=4"},
        {"channel_name": "新闻写作", "url": "http://www.cwestc.com/MroeNews.aspx?gd=35"},
        {"channel_name": "技术论文-开采方法", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=1&id=1"},
        {"channel_name": "技术论文-通风和安全", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=2&id=1"},
        {"channel_name": "技术论文-2006年煤业技术", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=6&id=1"},
        {"channel_name": "技术论文-开拓与掘进", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=7&id=1"},
        {"channel_name": "技术论文-地测与资环", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=8&id=1"},
        {"channel_name": "技术论文-矿山机械", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=9&id=1"},
        {"channel_name": "技术论文-洗选与综合利用", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=10&id=1"},
        {"channel_name": "技术论文-矿山电工", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=11&id=1"},
        {"channel_name": "技术论文-经济管理", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=14&id=1"},
        {"channel_name": "技术论文-信息与新技术", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=15&id=1"},
        {"channel_name": "技术论文-其他", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=16&id=1"},
        {"channel_name": "矿山安全-安全救护信息", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=28&id=3"},
        {"channel_name": "矿山安全-事故处理分析", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=29&id=3"},
        {"channel_name": "矿山安全-煤矿安全标准", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=30&id=3"},
        {"channel_name": "事故案例-顶板事故", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=31&id=2"},
        {"channel_name": "事故案例-瓦斯事故", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=32&id=2"},
        {"channel_name": "事故案例-运输事故", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=33&id=2"},
        {"channel_name": "事故案例-机电事故", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=34&id=2"},
        {"channel_name": "事故案例-放炮事故", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=35&id=2"},
        {"channel_name": "事故案例-水害事故", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=36&id=2"},
        {"channel_name": "事故案例-其他事故", "url": "http://www.cwestc.com/ShowAllContentMian.aspx?sendid=37&id=2"},
        {"channel_name": "煤市分析", "url": "http://www.cwestc.com/MroeNews.aspx?gd=33"},
        {"channel_name": "煤价行情", "url": "http://www.cwestc.com/MroeNews.aspx?gd=44"},
    ],
    # North China channel
    [
        {"channel_name": "华北频道-每日头条", "url": "http://huabei.cwestc.com/news/3.html"},
        {"channel_name": "华北频道-企业风采", "url": "http://huabei.cwestc.com/news/5.html"},
        {"channel_name": "华北频道-行业动态", "url": "http://huabei.cwestc.com/news/6.html"},
        {"channel_name": "华北频道-局矿快报", "url": "http://huabei.cwestc.com/news/4.html"},
        {"channel_name": "华北频道-矿山文学", "url": "http://huabei.cwestc.com/news/9.html"},
        {"channel_name": "华北频道-企业镜像", "url": "http://huabei.cwestc.com/news/8.html"},
        {"channel_name": "华北频道-人物专访", "url": "http://huabei.cwestc.com/news/11.html"},
        {"channel_name": "华北频道-行业先锋", "url": "http://huabei.cwestc.com/news/10.html"},
        {"channel_name": "华北频道-党群工作", "url": "http://huabei.cwestc.com/news/7.html"},
    ],
    # Northwest channel
    [
        {"channel_name": "西北频道-每日头条", "url": "http://sx.cwestc.com/news/3.html"},
        {"channel_name": "西北频道-企业风采", "url": "http://sx.cwestc.com/news/5.html"},
        {"channel_name": "西北频道-行业动态", "url": "http://sx.cwestc.com/news/6.html"},
        {"channel_name": "西北频道-局矿快报", "url": "http://sx.cwestc.com/news/4.html"},
        {"channel_name": "西北频道-矿山文学", "url": "http://sx.cwestc.com/news/9.html"},
        {"channel_name": "西北频道-企业镜像", "url": "http://sx.cwestc.com/news/8.html"},
        {"channel_name": "西北频道-人物专访", "url": "http://sx.cwestc.com/news/11.html"},
        {"channel_name": "西北频道-行业先锋", "url": "http://sx.cwestc.com/news/10.html"},
        {"channel_name": "西北频道-党群工作", "url": "http://sx.cwestc.com/news/7.html"},
    ],
    # Central China channel
    [
        {"channel_name": "华中频道-每日头条", "url": "http://huazhong.cwestc.com/news/3.html"},
        {"channel_name": "华中频道-企业风采", "url": "http://huazhong.cwestc.com/news/5.html"},
        {"channel_name": "华中频道-行业动态", "url": "http://huazhong.cwestc.com/news/6.html"},
        {"channel_name": "华中频道-局矿快报", "url": "http://huazhong.cwestc.com/news/4.html"},
        {"channel_name": "华中频道-矿山文学", "url": "http://huazhong.cwestc.com/news/9.html"},
        {"channel_name": "华中频道-企业镜像", "url": "http://huazhong.cwestc.com/news/8.html"},
        {"channel_name": "华中频道-人物专访", "url": "http://huazhong.cwestc.com/news/11.html"},
        {"channel_name": "华中频道-行业先锋", "url": "http://huazhong.cwestc.com/news/10.html"},
        {"channel_name": "华中频道-党群工作", "url": "http://huazhong.cwestc.com/news/7.html"},
    ],
    # Northeast channel
    [
        {"channel_name": "东北频道-每日头条", "url": "http://dongbei.cwestc.com/news/3.html"},
        {"channel_name": "东北频道-企业风采", "url": "http://dongbei.cwestc.com/news/5.html"},
        {"channel_name": "东北频道-行业动态", "url": "http://dongbei.cwestc.com/news/6.html"},
        {"channel_name": "东北频道-局矿快报", "url": "http://dongbei.cwestc.com/news/4.html"},
        {"channel_name": "东北频道-矿山文学", "url": "http://dongbei.cwestc.com/news/9.html"},
        {"channel_name": "东北频道-企业镜像", "url": "http://dongbei.cwestc.com/news/8.html"},
        {"channel_name": "东北频道-人物专访", "url": "http://dongbei.cwestc.com/news/11.html"},
        {"channel_name": "东北频道-行业先锋", "url": "http://dongbei.cwestc.com/news/10.html"},
        {"channel_name": "东北频道-党群工作", "url": "http://dongbei.cwestc.com/news/7.html"},
    ],
]
整体网站的数据列表有多少种样式,就要编写多少个 parseX 解析函数,并把它们添加到 parse_list 中(每个分组对应一个解析函数):
# List-page callbacks, one entry per channel group (labelled on each line).
# The main site uses parse1; every regional channel reuses parse2.
# NOTE(review): entries are presumably matched to the start_menu groups by
# index — confirm against the project template before reordering.
parse_list = [
self.parse1, # Main site
self.parse2, # North China channel
self.parse2, # Northwest channel
self.parse2, # Central China channel
self.parse2, # Northeast channel
]
# NOTE(review): both pairs below assign the same names; presumably each pair
# belongs in its own parseX callback rather than running back to back —
# confirm against the project template.
# Main-site layout: list-page extraction
Item_title = response.xpath('//td[@align="left"]/b/strong/a/text()').extract() # list of article titles
Item_url = response.xpath('//td[@align="left"]/b/strong/a/@href').extract() # list of article links
# Other sub-site layout: list-page extraction
Item_title = response.xpath('//ul[@class="n-list"]/li/h2/a/text()').extract() # list of article titles
Item_url = response.xpath('//ul[@class="n-list"]/li/h2/a/@href').extract() # list of article links
# Detail page: the article body is stored with its markup, so the whole
# container element is extracted rather than its text nodes.
#
# Containers are tried in priority order. A later container is only consulted
# while the content gathered so far is shorter than 5 characters.
# NOTE(review): None2Str is defined elsewhere; it presumably maps None to ""
# so that len() is safe when extract_first() finds nothing — confirm.
item['content'] = ""
content_containers = (
    ('class="newsContent"', '//td[@class="newsContent"]'),
    ('class="content"', '//div[@class="content"]'),
    ('class="entry"', '//div[@class="entry"]'),
)
for marker, container_xpath in content_containers:
    # An earlier container already produced usable content; stop looking.
    if len(None2Str(item['content'])) >= 5:
        break
    # Only run the XPath when the raw page text contains the class attribute.
    if marker in response.text:
        item['content'] = response.xpath(container_xpath).extract_first()