Scraping information from news web pages

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import json

'''
Wrap the comment-count scraping logic in a function.
'''

commentURL = ('http://comment.sina.com.cn/page/info?version=1&'
              'format=json&channel=gn&newsid=comos-{}&group=0&compress=0&'
              'ie=gbk&oe=gbk&page=1&page_size=3&t_size=3&h_size=10')

# Fetch the comment count of a news article
def getCommentCounts(newsurl):
    # Extract the news id from the article URL
    m = re.search('doc-i(.*).shtml', newsurl)
    newsid = m.group(1)
    # Substitute the news id into commentURL
    setid = commentURL.format(newsid)
    # Request the comment API and pull the total count out of the JSON
    req = requests.get(setid)
    jd = json.loads(req.text)['result']['count']['total']
    return jd
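
# For instance (using the sample article URL from the main block below):
# newsurl = 'https://news.sina.com.cn/c/xl/2019-02-21/doc-ihrfqzka7703333.shtml'
# gives newsid = 'hrfqzka7703333', so the request targets the comment API with
# newsid=comos-hrfqzka7703333; the response is assumed to be JSON shaped like
# {'result': {'count': {'total': ...}}}, which is what the code above reads.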

'''
Wrap the article-detail scraping logic in a function.
'''

def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    html = BeautifulSoup(res.text, 'html.parser')
    # Title
    result['title'] = html.select('.main-title')[0].text
    # Publication time
    strdate = html.select('.date')[0].text.strip()
    result['date'] = datetime.strptime(strdate, '%Y年%m月%d日 %H:%M')
    # Source
    result['source'] = html.select('.date-source a')[0].contents[0]
    # Body text: join every paragraph except the last one (the editor line)
    result['article'] = ' '.join(p.text.strip() for p in html.select('.article p')[:-1])
    # Author: drop the "责任编辑:" prefix from the editor line
    result['author'] = html.select('.show_author')[0].text.strip().replace('责任编辑:', '')
    # Comment count
    result['comments'] = getCommentCounts(newsurl)
    return result
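
# Sketch of the dict this returns (field values are illustrative only,
# not taken from a real page):
# {'title': '...', 'date': datetime(2019, 2, 21, 9, 5), 'source': '...',
#  'article': '...', 'author': '...', 'comments': 123}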

'''
Batch-scrape news articles from the paginated roll feed.
'''

def getNewsUrl(url):
    newsDetails = []
    req = requests.get(url)
    req.encoding = 'utf-8'
    # The feed JSON carries the article list under result.data
    fd = json.loads(req.text)['result']['data']
    for newsurls in fd:
        newsDetails.append(getNewsDetail(newsurls['url']))
    return newsDetails
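
# The roll feed is assumed to respond with JSON like
# {'result': {'data': [{'url': 'https://news.sina.com.cn/...shtml', ...}, ...]}};
# only each entry's 'url' field is used here.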

if "__name__" == "__main__":

    ''' 第一二部分用

    #新闻链接

    news = 'https://news.sina.com.cn/c/xl/2019-02-21/doc-ihrfqzka7703333.shtml'

    getCommentCounts(news)

    '''

    newsurl = 'https://feed.sina.com.cn/api/roll/get?pageid=121&\

                lid=1356&num=20&versionNumber=1.2.4&\page=2&encode=utf-8'

    getNewsUrl(newsurl)
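    # A hypothetical way to inspect the results (the print loop is
    # illustrative only):
    # for item in getNewsUrl(newsurl):
    #     print(item['date'], item['title'], item['comments'])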

From the video course: https://study.163.com/course/courseMain.htm?courseId=1003285002
