BUPT Data Warehouse: Data Acquisition

Data Acquisition

  • This post covers crawling Sina news with the Scrapy framework. The crawl collected over two million articles in total, although two of the categories yielded relatively little data. Below is the data-acquisition part of the crawler code; if you need the complete project, message me and I will put it on GitHub.
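Before the spider code, it helps to see the data source it hits. The spider pages through Sina's news roll API at feed.mix.sina.com.cn/api/roll/get, which returns a JSON list of article URLs for a given channel id (lid) and time window, 50 articles per page. The snippet below queries a single page of that API with requests; the parameter meanings are inferred from how the spider builds its URLs, so treat them as assumptions rather than documented behaviour.

import time
import datetime
import requests

# One-day window ending at the reference date, mirroring start_requests below.
start_time = datetime.datetime(2020, 10, 23)
end_time = start_time - datetime.timedelta(days=1)
params = {
    'pageid': 153,                                      # roll-news page id used by the spider
    'lid': 2514,                                        # channel id (2514 = military, per the list in the spider)
    'etime': int(time.mktime(end_time.timetuple())),    # earlier edge of the window (assumed meaning)
    'stime': int(time.mktime(start_time.timetuple())),  # later edge of the window (assumed meaning)
    'ctime': int(time.mktime(start_time.timetuple())),
    'date': end_time.strftime('%Y-%m-%d'),
    'k': '',
    'num': 50,                                          # articles per page
    'page': 1,
}
resp = requests.get('https://feed.mix.sina.com.cn/api/roll/get', params=params,
                    headers={'user-agent': 'Mozilla/5.0'})
for entry in resp.json()['result']['data']:
    print(entry['url'])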
import scrapy
import re
import time
import datetime
import copy
from lxml import etree
import json
import requests
# Request headers for Sina (the cookie is a captured session value). They are not
# referenced directly in this snippet, but can be applied via DEFAULT_REQUEST_HEADERS
# in the project settings or passed to individual requests.
headers = {
    'cookie': 'SINAGLOBAL=219.143.103.186_1574762579.560816; SUB=_2AkMqu5Muf8NxqwJRmPoWxGPlZYt2zgvEieKc52L1JRMyHRl-yD9jqlI5tRB6ATu9wbi9kLo8OxoHjeZApvuLo5C-313s; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WF-3HVKvVX-Q5jxCyV-czu2; UOR=news.hao123.com,news.sina.com.cn,; lxlrttp=1572512346; SGUID=1576150846061_21239829; UM_distinctid=16ef9e965c42b-086800c19d1649-32365f08-100200-16ef9e965c52e6; U_TRS1=000000ba.defc709d.5df245d9.4120fff8; rotatecount=2; Apache=220.202.152.119_1576653423.258154; FEED-MIX-SINA-COM-CN=; ULV=1576653263608:18:18:10:220.202.152.119_1576653423.258154:1576653218143; co=10.13.64.57_1576653.469',
    'referer': 'https://news.sina.com.cn/roll/',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
class DatacrawlSpider(scrapy.Spider):
    name = 'datacrawl'
    allowed_domains = ['news.sina.com.cn']
    start_urls = ['http://news.sina.com.cn/']
    def start_requests(self):
        # Walk backwards one day at a time from a fixed reference date and
        # request the Sina roll API for each daily window.
        s_static_time = '2020-10-23 0:0:0'
        static_time = datetime.datetime.strptime(s_static_time, '%Y-%m-%d %H:%M:%S')
        for j in range(1000, 1500):
            # Daily window [end_time, start_time], j days before the reference date.
            start_time = static_time - datetime.timedelta(days=j)
            end_time = start_time - datetime.timedelta(days=1)
            end_date = datetime.datetime.strftime(end_time, "%Y-%m-%d")
            # Convert both edges of the window to Unix timestamps for the API.
            mk_start_time = str(int(time.mktime(start_time.timetuple())))
            mk_end_time = str(int(time.mktime(end_time.timetuple())))
            print("Day offset " + str(j))
            url = ('https://feed.mix.sina.com.cn/api/roll/get?pageid=153'
                   '&etime=' + mk_end_time + '&stime=' + mk_start_time +
                   '&ctime=' + mk_start_time + '&date=' + end_date)
            # Channel ids: domestic, international, society, sports, entertainment,
            # military, technology, finance, stock market, US stocks
            # Lid_list = [2510, 2511, 2669, 2512, 2513, 2514, 2515, 2516, 2517, 2518]
            Lid_list = [2514]

            for index, lid in enumerate(Lid_list):
                lid = str(lid)
                # Fetch up to 10 pages of 50 articles for this channel and day.
                for k in range(0, 10):
                    surl = url + '&lid=' + lid + '&k=&num=50&page=' + str(k)
                    print(surl)
                    yield scrapy.Request(url=surl, callback=self.parse_url,
                                         meta={'index': copy.deepcopy(index)},
                                         dont_filter=True)
    def parse_url(self, response):
        # Parse one page of the roll API and follow every article URL in it.
        index = response.meta['index']
        ret = response.json()['result']['data']
        if ret == []:
            print("No articles on this page")
            return
        for li in ret:
            url = li['url']
            yield scrapy.Request(url=url, callback=self.parse,
                                 meta={'index': copy.deepcopy(index),
                                       'url': copy.deepcopy(url)},
                                 dont_filter=True)
    def parse(self, response):
        # Extract the article id, title, and full text from an article page.
        item = {}
        index = response.meta['index']
        url = response.meta['url']
        news_id = re.findall(r'/([\w+_-]+)\.shtml', url)[0]
        # Newer and older article templates use different title/body markup,
        # so fall back through the known variants.
        title = response.xpath('//h1[@class="main-title"]/text()').extract_first()
        if title is None:
            title = response.xpath('//h1[@id="artibodyTitle"]/text()').extract_first()
        con = response.xpath('//div[@id="article"]')
        if not con:
            con = response.xpath('//div[@id="artibody"]')
        if not con:
            con = response.xpath('//div[@class="article"]')
        content = con.xpath('string(.)').extract_first()
        item["news_id"] = news_id
        item["class_num"] = index   # position of the lid in Lid_list, i.e. the news category
        item["title"] = title
        item["content"] = content
        yield item
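To run the spider, drop it into a standard Scrapy project and export the items, for example with scrapy crawl datacrawl -o news.jl for JSON Lines output. The settings below are a minimal sketch under that assumption; they wire the referer and user-agent from the headers dict above into every request and add a small delay so a crawl spanning hundreds of daily archives stays polite.

# settings.py -- minimal sketch, assuming a default Scrapy project layout
DOWNLOAD_DELAY = 0.5       # pause between requests; the crawl covers hundreds of days
CONCURRENT_REQUESTS = 16
DEFAULT_REQUEST_HEADERS = {
    'referer': 'https://news.sina.com.cn/roll/',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}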
