scrapy多级页面抓取

文章转载:https://blog.csdn.net/qq_32942549/article/details/79663855
今天实战为大家介绍多级页面的抓取,以车质网投诉为例

1.准备工作

首先明确要抓取的字段:列表页中的投诉编号,投诉品牌,车型等,以及点击投诉简述后进入的详情页中的日期,详细投诉内容和回复。这些数据分布在两级页面上。

通过查看页面可以发现,"典型问题"一栏是通过JS动态加载的,页面源码里只有问题编码。这里我们先把编码与问题描述的对应关系整理成字典格式,抓取时再根据编码查出对应的问题。

调整后的数据是这样的,保存在与items.py同级的目录下,文件命名为chezhi.py

2.项目开始

创建一个项目

scrapy startproject chezhi
明确要抓取的字段

import scrapy

class ChezhiItem(scrapy.Item):
    """Container for one complaint record scraped from the listing and detail pages."""

    # Fields filled from the listing page.
    id = scrapy.Field()
    brand = scrapy.Field()
    car = scrapy.Field()
    car_style = scrapy.Field()
    content = scrapy.Field()
    question = scrapy.Field()
    state = scrapy.Field()
    # Fields filled from the detail page.
    time = scrapy.Field()
    detail_content = scrapy.Field()
    Reply = scrapy.Field()
编写spider代码

# -*- coding: utf-8 -*-

import scrapy
from chezhi.items import ChezhiItem
from chezhi.chezhi import listcode

class TousuSpider(scrapy.Spider):
    """Two-level spider: scrape complaint listing pages, then each complaint's detail page.

    The listing callback fills the summary fields of a ``ChezhiItem`` and hands
    the partially-filled item to the detail callback through ``Request.meta``.
    """

    name = 'tousu'
    allowed_domains = ['www.12365auto.com']

    @staticmethod
    def _first(selector, path):
        """Return the first string matched by *path* under *selector*, or '' if none."""
        return selector.xpath(path).extract_first(default='')

    def start_requests(self):
        """Yield one request for each listing page, numbered 1 through 50."""
        url = 'http://www.12365auto.com/zlts/0-1878-0-0-0-0_0-0-{0}.shtml'
        for i in range(1, 51):
            yield scrapy.Request(url.format(i), callback=self.pare)

    def pare(self, response):
        """Parse one listing page and yield a detail-page request per table row.

        Missing cells are stored as empty strings instead of raising, so one
        malformed row no longer aborts the rest of the page.
        """
        rows = response.xpath('//table//tr')
        # The first row is skipped (presumably the table header -- confirm).
        for row in rows[1:]:
            href = self._first(row, "td[5]/a/@href")
            if not href:
                # Without a detail link there is nothing to follow for this row.
                self.logger.warning("row without detail link on %s", response.url)
                continue

            items = ChezhiItem()
            items['id'] = self._first(row, "td[1]/text()")
            items['brand'] = self._first(row, "td[2]/text()")
            items['car'] = self._first(row, "td[3]/text()")
            items['car_style'] = self._first(row, "td[4]/text()")
            items['content'] = self._first(row, "td[5]/a/text()")

            # The cell holds comma-separated problem codes; each code is mapped
            # to its description through ``listcode``. The last element is
            # dropped (presumably the string ends with a trailing comma).
            codes = self._first(row, "td[6]//text()").split(',')
            items['question'] = str([listcode.get(j) for j in codes][:-1])
            items['state'] = self._first(row, "td[8]/em/text()")

            yield scrapy.Request(
                url=response.urljoin(href),  # tolerate relative hrefs
                meta={'items': items},
                callback=self.pare_detail,
                dont_filter=True,
            )

    def pare_detail(self, response):
        """Complete the item carried in ``response.meta`` with detail-page fields.

        Each field falls back to an empty string when its node is absent, so
        the item is still emitted rather than lost to an IndexError.
        """
        items = response.meta['items']
        items['time'] = self._first(response, '//div[@class="jbqk"]/ul/li[5]/text()')
        items['detail_content'] = self._first(response, '//div[@class="tsnr"]/p/text()')
        items['Reply'] = self._first(response, '//div[@class="tshf"]/p/text()')
        yield items

pipelines

import pymysql
import pymysql.cursors
class ChezhiPipeline(object):
    """Item pipeline that inserts every scraped item into the MySQL table ``chezhi``."""

    def process_item(self, item, spider):
        """Insert *item* as one row and return it unchanged.

        Connection parameters are read from the ``DBKWARGS`` setting. A failed
        insert is reported and rolled back; the item is still returned so that
        later pipelines keep running.
        """
        db_kwargs = spider.settings.get('DBKWARGS')
        sql = 'insert into chezhi values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        lis = (
            item['id'], item['brand'], item['car'], item['car_style'],
            item['content'], item['question'], item['state'], item['time'],
            item['detail_content'], item['Reply'],
        )

        # NOTE: a connection is opened for each item, as in the original code.
        con = pymysql.connect(**db_kwargs)  # connect to the database
        try:
            cur = con.cursor()  # cursor
            try:
                cur.execute(sql, lis)  # insert the row
            except Exception as e:
                print("insert err:", e)
                con.rollback()  # undo the failed insert
            else:
                con.commit()
            finally:
                cur.close()  # always release the cursor
        finally:
            con.close()  # always release the connection
        return item
settings文件编写

# Scrapy settings for the chezhi project.

BOT_NAME = 'chezhi'

SPIDER_MODULES = ['chezhi.spiders']
NEWSPIDER_MODULE = 'chezhi.spiders'

# Keyword arguments passed to pymysql.connect() by the item pipeline.
DBKWARGS = {
    'db': 'test',
    'user': 'root',
    'passwd': '12345',
    'host': 'localhost',
    'use_unicode': True,
    'charset': 'utf8',
}

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
]

# Headers sent with every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    # The trailing "*/*" wildcard was missing from the Accept value.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
    'Connection': 'keep-alive',
    'Cookie': 'uuid=384279260814621; UM_distinctid=1624818bb6a404-0fe2919d52c83c-546d3974-100200-1624818bb9730a; uuid=384279260814621; ASP.NET_SessionId=eyhvs1zwzzbeonr05b41o455; CNZZDATA2115974=cnzz_eid%3D1359293862-1521625094-null%26ntime%3D1521684499; Hm_lvt_d08ac25fd4c45e85e8d9f1c32e97a0eb=1521628003,1521684855; Hm_lpvt_d08ac25fd4c45e85e8d9f1c32e97a0eb=1521689053',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Host': 'www.12365auto.com',
    'Referer': 'http://www.12365auto.com/zlts/0-1878-0-0-0-0_0-0-1.shtml',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'chezhi (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
到这里就可以运行啦,有兴趣的朋友可以再优化一下。

查看下抓取到的数据

你可能感兴趣的:(scrapy多级页面抓取)