Sina Domestic News: A Python Web Scraper in Practice

This crawler uses regular expressions, OrderedDict, and pandas, and adds a time.sleep call so the crawl does not run fast enough for the server to ban the IP. There is no new technique this time; the main addition is a bit of data cleaning. A natural next step would be proxy IPs plus multithreading (see the sketch at the end of this post). And pandas really is a superb library for data mining and analysis!
I have a hunch that if we scraped more news, we could draw many interesting conclusions, such as trending social topics, the political climate, and so on.
Alright, straight to the code.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
from collections import OrderedDict

newsAPI = 'http://api.roll.news.sina.com.cn/zt_list'
CommentsAPI='http://comment5.news.sina.com.cn/page/info'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
           'Accept-Language':'zh-CN,zh;q=0.8'}
paramsNewsAPI={'channel':'news','cat_1':'gnxw',
        'cat_2':'=gdxw1||=gatxw||=zs-pl||=mtjj',
        'level':'=1||=2',
        'show_ext':1,
        'show_all':1,
        'show_num':22,
        'tag':'1',
        'format':'json',
        'page':1,
        }
paramsCommentsAPI={'version':'1',
        'format':'json',
        'channel':'gn',
        'newsid':'comos-ifymenmt5700372',
        'group':None,
        'compress':0,
        'ie':'utf-8',
        'oe':'utf-8',
        'page':1,
        'page_size':20
        }
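
# Illustration: requests serializes each dict above into the query string.
# Building the request without sending it shows the final URL, e.g.:
#   requests.Request('GET', newsAPI, params=paramsNewsAPI).prepare().url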


def getNewsInfo(newslist,curSession):
    newsInfolist = []
    for news in newslist['result']['data']:
        newsInfo = OrderedDict()
        newsInfo['title'] = news['title']
        newsInfo['source'] = news['media_name']    
        newsInfo['link'] = news['url']
        # the doc id is the part of the URL between 'doc-' and '.shtml'
        newsInfo['newsid'] = re.search(r'doc-(.*)\.shtml',news['url']).group(1)
        newsInfo['keywords'] = news['keywords'].split(',')
        getNewsContent(newsInfo,curSession)
        getNewsComments(newsInfo,curSession)
        # throttle a little so the crawl does not get the IP banned
        time.sleep(0.1)
        newsInfolist.append(newsInfo)

    return newsInfolist
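
# Each record in the returned list ends up shaped roughly like this
# (values are illustrative placeholders, not real data):
#   OrderedDict([('title', '...'), ('source', '...'), ('link', '...'),
#                ('newsid', 'ifymenmt5700372'), ('keywords', ['...']),
#                ('date', '...'), ('editor', [...]),
#                ('comments', 0), ('commenter', 0)])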


def getNewsContent(newsInfo,curSession):
    # use the shared session that was passed in, not a global
    resp = curSession.get(newsInfo['link'])
    resp.encoding='utf-8'
    resp.raise_for_status()
    bsobj = BeautifulSoup(resp.text,'html.parser')
    # the first child of the time-source span is the publication date
    newsInfo['date'] = bsobj.find('span',{'class':'time-source'}).contents[0].strip()
    # note: the editor line on the page may use the fullwidth colon '：' rather than ':'
    newsInfo['editor'] = bsobj.find('p',{'class':'article-editor'}).string.split(':')
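
# Some article pages lack these nodes, in which case find() returns None and
# the attribute access above raises AttributeError. A minimal defensive
# variant (a sketch; getNewsContentSafe is a hypothetical name and is not
# called by the script):
def getNewsContentSafe(newsInfo,curSession):
    resp = curSession.get(newsInfo['link'])
    resp.encoding = 'utf-8'
    resp.raise_for_status()
    bsobj = BeautifulSoup(resp.text,'html.parser')
    time_source = bsobj.find('span',{'class':'time-source'})
    newsInfo['date'] = time_source.contents[0].strip() if time_source else None
    editor = bsobj.find('p',{'class':'article-editor'})
    newsInfo['editor'] = editor.string.split(':') if editor and editor.string else None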


def getNewsComments(newsInfo,curSession):
    # the comment API expects 'comos-' plus the doc id minus its leading channel letter
    paramsCommentsAPI['newsid'] = "comos-{}".format(newsInfo['newsid'][1:])
    resp = curSession.get(CommentsAPI,params=paramsCommentsAPI)
    result = resp.json()
    if result['result']['status']['code'] != 0:
        # no comment data available for this article
        newsInfo['comments'] = 0
        newsInfo['commenter'] = 0
    else:
        newsInfo['comments'] = result['result']['count']['show']
        newsInfo['commenter'] = result['result']['count']['total']
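
# The fields read above imply a comment-API payload shaped roughly like this
# (inferred from the code, not from official docs; numbers are illustrative):
#   {"result": {"status": {"code": 0},
#               "count":  {"show": 123, "total": 456}}}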


newsInfo=[]
with requests.Session() as s:
    s.headers.update(headers)
    # adjust the page range as needed
    for page in range(1,3):
        paramsNewsAPI['page'] = page  # request the current page; otherwise every call fetches page 1
        resp = s.get(newsAPI,params=paramsNewsAPI)
        resp.encoding = 'utf-8'
        newsInfo.extend(getNewsInfo(resp.json(),s))

df = pd.DataFrame(newsInfo)
df.to_excel('新浪新闻.xlsx')
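
That is the whole scraper. As a first taste of the analysis hinted at above, keyword frequencies fall out of the DataFrame almost for free. A minimal sketch, assuming the df built by this script (DataFrame.explode needs pandas 0.25+):

# count how often each keyword appears across the scraped articles
keyword_counts = df.explode('keywords')['keywords'].value_counts()
print(keyword_counts.head(10))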

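As for the proxy-IP-plus-multithreading direction mentioned at the top, it might start out like the sketch below. The proxy addresses and the fetch_page helper are hypothetical placeholders, not part of the script above:

import itertools
from concurrent.futures import ThreadPoolExecutor

# hypothetical proxy pool; swap in real, working proxies
proxies_pool = itertools.cycle([
    {'http': 'http://127.0.0.1:8001'},
    {'http': 'http://127.0.0.1:8002'},
])

def fetch_page(page):
    # route each request through the next proxy in the pool
    params = dict(paramsNewsAPI, page=page)
    resp = requests.get(newsAPI, params=params, headers=headers,
                        proxies=next(proxies_pool), timeout=10)
    resp.encoding = 'utf-8'
    return resp.json()

with ThreadPoolExecutor(max_workers=4) as pool:
    pages = list(pool.map(fetch_page, range(1, 6)))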