Scraping Sina Weibo keyword-search results with Scrapy
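The two files below form a small Scrapy project (here called weibosearch) that POSTs keyword searches to the mobile endpoint https://weibo.cn/search/mblog, follows each result to its detail page, and extracts the author, text, repost/comment/like counts, and posting time. A logged-in weibo.cn cookie is required.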

# weibo.py
# -*- coding: utf-8 -*-
import re  # only used by the commented-out id extraction in parse_detail

from scrapy import Spider, Request, FormRequest

from weibosearch.items import WeiboItem


class WeiboSpider(Spider):
    name = "weibo"
    allowed_domains = ["weibo.cn"]
    search_url = 'https://weibo.cn/search/mblog'
    max_page = 100

    # Paste your own logged-in weibo.cn cookie string here (copied from the browser).
    cookie_raw = ''
    # Header values copied from the browser; the stray leading space that comes
    # with copying raw "Name: value" lines must be stripped, or it is sent verbatim.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',  # 'br' dropped: Scrapy cannot decode brotli without an extra package
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'weibo.cn',
        'Origin': 'https://weibo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }


    def start_requests(self):
        keyword = '重庆公交'  # search keyword
        url = '{url}?keyword={keyword}&sort=hot'.format(url=self.search_url, keyword=keyword)

        # Turn the raw 'k1=v1; k2=v2' cookie string into the dict Scrapy expects;
        # split('=', 1) keeps values that themselves contain '='.
        cookie = {}
        for pair in self.cookie_raw.split(';'):
            if '=' in pair:
                key, value = pair.split('=', 1)
                cookie[key.strip()] = value.strip()

        # Result pages are 1-indexed; 'mp' is the total page count the form expects.
        for page in range(1, self.max_page + 1):
            data = {
                'mp': str(self.max_page),
                'page': str(page),
            }
            yield FormRequest(url, callback=self.parse_index, cookies=cookie,
                              headers=self.headers, formdata=data)

    def parse_index(self, response):
        weibos = response.xpath('//div[@class="c" and contains(@id, "M_")]')
        self.logger.debug('%d weibos on %s', len(weibos), response.url)
        for weibo in weibos:
            # Forwarded posts carry a span.cmt block and link to the original
            # post's comments as "原文评论[n]" rather than "评论[n]".
            is_forward = bool(weibo.xpath('.//span[@class="cmt"]').extract_first())
            if is_forward:
                detail_url = weibo.xpath('.//a[contains(., "原文评论[")]/@href').extract_first()
            else:
                detail_url = weibo.xpath('.//a[contains(., "评论[")]/@href').extract_first()
            if detail_url:
                yield Request(response.urljoin(detail_url), callback=self.parse_detail)

    def parse_detail(self, response):
        url = response.url
        content = ''.join(response.xpath('//div[@id="M_"]//span[@class="ctt"]//text()').extract())
        # id = re.search(r'comment/(.*?)\?', response.url).group(1)
        comment_count = response.xpath('//span[@class="pms"]//text()').re_first(r'评论\[(.*?)\]')
        forward_count = response.xpath('//a[contains(., "转发[")]//text()').re_first(r'转发\[(.*?)\]')
        like_count = response.xpath('//a[contains(., "赞[")]//text()').re_first(r'赞\[(.*?)\]')
        posted_at = response.xpath('//div[@id="M_"]//span[@class="ct"]//text()').extract_first(default=None)
        user = response.xpath('//div[@id="M_"]/div[1]/a/text()').extract_first()

        weibo_item = WeiboItem()
        # weibo_item['id'] = id
        weibo_item['user'] = user
        weibo_item['content'] = content
        weibo_item['forward_count'] = forward_count
        weibo_item['comment_count'] = comment_count
        weibo_item['like_count'] = like_count
        weibo_item['posted_at'] = posted_at
        weibo_item['url'] = url
        yield weibo_item
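With a valid cookie pasted into cookie_raw, the spider runs like any other Scrapy spider; assuming the project was created as weibosearch (matching the import above), from the project root:

    scrapy crawl weibo -o weibos.jl

The -o flag dumps every yielded WeiboItem to a JSON-lines file.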

# items.py

from scrapy import Item, Field


class WeiboItem(Item):
    # table_name = 'weibo'

    # id = Field()
    user = Field()
    content = Field()
    forward_count = Field()
    comment_count = Field()
    like_count = Field()
    posted_at = Field()
    url = Field()
    crawled_at = Field()
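
The crawled_at field above is declared but never filled by the spider. A natural place to set it is an item pipeline; the sketch below is an assumption, not part of the original post (the WeiboTimestampPipeline name is invented for illustration):

# pipelines.py (hypothetical sketch)
from datetime import datetime


class WeiboTimestampPipeline:
    # Stamp each item with the time it was processed.
    def process_item(self, item, spider):
        item['crawled_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return item

Enable it in settings.py:

ITEM_PIPELINES = {
    'weibosearch.pipelines.WeiboTimestampPipeline': 300,
}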

 
