Scrapy spiders framework — POST requests and two-step page navigation (following a second link from the first response)

# -*- coding:utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from p1.items import P1Item
import json
import lxml
class XiaoHuarSpider(scrapy.Spider):
    """Spider demonstrating a POST (form) request followed by a second GET
    request per record — a two-step crawl.

    Flow:
        start_requests -> POST listing pages -> parse (JSON listing)
        -> GET one detail page per record -> two_parse -> yield P1Item

    NOTE(review): `headers` is referenced throughout but never defined in
    this chunk of the file — it must be a module-level dict defined
    elsewhere; confirm before running.
    """

    name = "keche"

    def start_requests(self):
        """Issue the initial POST (form) requests for each listing page.

        For a POST use scrapy.FormRequest(url=..., headers=..., callback=...);
        for a GET use scrapy.Request(...). When no callback is given, the
        default callback is `parse`.
        """
        for page in range(1, 2):
            # '****' is a redacted/placeholder base URL; the page number is
            # appended as a query suffix.
            url = '****' + str(page)
            yield scrapy.FormRequest(url=url, headers=headers)

    def parse(self, response):
        """Parse the first (JSON) page and schedule a GET per record.

        The response body is JSON with the records under the 'data' key.
        """
        records = json.loads(response.text)['data']
        # Iterate records directly instead of the original index loop.
        for record in records:
            # NOTE(review): a1..a6 were computed but never used in the
            # original code (presumably meant to be passed to two_parse via
            # request.meta) — kept for parity, confirm intent.
            a1 = record['cah']
            a2 = record['cajlb']
            a3 = record['cbt']
            a4 = record['cslfyMc']
            a5 = record['cygMc']
            a6 = response.url
            # Detail-page link built from the record id ('***' is a
            # redacted/placeholder base URL).
            detail_url = '***' + record['cBh']
            # BUG FIX: the original line had a stray '}' after
            # `headers=headers`, which was a SyntaxError.
            yield scrapy.Request(url=detail_url, headers=headers,
                                 callback=self.two_parse)

    def two_parse(self, response):
        """Parse the detail (second) page and yield the populated item."""
        soup = BeautifulSoup(response.text, 'lxml')
        container = soup.find('div', class_='fd-fix')
        a1 = container.find('h2').text
        a2 = container.find('h5').text
        a3 = container.find('div', class_='fd-alt-all').text
        # a4..a6 are intentionally left empty, matching the original.
        item = P1Item(a1=a1, a2=a2, a3=a3, a4='', a5='', a6='')
        yield item  # hand the scraped item to the Scrapy pipeline

You may also be interested in: (Scrapy spiders framework — POST requests and two-step page navigation)