Python 3 code for scraping Sina News article content

import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from datetime import datetime
import re

# URL template for each paginated list page (Sina's roll-news JSON API)
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}'
# Build the links for multiple pages, then parse every article link on each page
# and fetch each article's details.

# Comment-feed URL for a single article (a JSONP endpoint)
commenturl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=sh&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3'
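# Both endpoints return JSONP: JSON wrapped in a callback such as
# newsloadercallback({...}); or jsonp_1515479412878({...}).
# A minimal, self-contained sketch of unwrapping such a payload
# (the sample string below is made up for illustration):
_sample = 'jsonp_1515479412878({"result": {"count": {"total": 25}}})'
assert json.loads(re.search(r'\((.*)\)', _sample, re.S).group(1))['result']['count']['total'] == 25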


## Get the total number of comments for one article
def getcomments(newsurl):
    m = re.search(r'doc-i(.*)\.shtml', newsurl)
    newsid = m.group(1)
    comments = requests.get(commenturl.format(newsid))
    # Strip the JSONP wrapper to get valid JSON; a regex is more reliable than
    # str.lstrip/rstrip, which strip character sets rather than exact prefixes.
    jd1 = json.loads(re.search(r'\((.*)\)', comments.text, re.S).group(1))
    return jd1['result']['count']['total']
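# Usage sketch (the article URL below is a hypothetical example of the
# doc-i<newsid>.shtml pattern the regex expects):
#   getcomments('http://news.sina.com.cn/c/nd/2018-01-09/doc-ifyqkarr0000000.shtml')
# would return an integer comment total for that article.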

## Get the full details of one article page
def getnewsdetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    # Title (strip() removes surrounding whitespace such as the \u3000 full-width space)
    result['title'] = soup.select('.main-title')[0].text.strip()

    # Publication time: parse the page's Chinese date string, reformat as YYYY-MM-DD
    timesource = soup.select('.date')[0].text.strip()
    dt = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
    result['time'] = dt.strftime('%Y-%m-%d')
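    # e.g. '2018年01月09日 14:25' (a made-up sample) matches the format string
    # above and reformats to '2018-01-09'.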

    # News source
    result['newssource'] = soup.select('.source')[0].text

    # Comment count (via the comment API above)
    result['comments'] = getcomments(newsurl)

    #url
    result['url'] = newsurl


    # Body text: join every paragraph with newlines, dropping the last
    # paragraph (typically the trailing editor byline)
    result['context'] = '\n'.join([p.text.strip() for p in soup.select('#article p')[:-1]])

    return result
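# A self-contained illustration of the paragraph join in getnewsdetail; the
# HTML snippet is made up, but mirrors the #article structure of the real pages
# ([:-1] drops the final paragraph, typically the editor byline):
_demo = BeautifulSoup('<div id="article"><p>para1</p><p>para2</p><p>责任编辑:X</p></div>', 'html.parser')
assert '\n'.join(p.text.strip() for p in _demo.select('#article p')[:-1]) == 'para1\npara2'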

## Parse one list page: fetch its JSONP feed, then scrape every article in it
def parselistlinks(url):
    newsdetails = []
    res = requests.get(url)
    # Strip the JSONP wrapper newsloadercallback(...); what remains is valid JSON
    # holding every article link (plus metadata) on this list page.
    jd = json.loads(re.search(r'\((.*)\)', res.text, re.S).group(1))

    for ent in jd['result']['data']:  # each entry holds one article's metadata
        newsdetails.append(getnewsdetail(ent['url']))
    return newsdetails
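# Usage sketch: parselistlinks(url.format(1)) returns a list of dicts, one per
# article on list page 1, each with the keys built in getnewsdetail
# ('title', 'time', 'newssource', 'comments', 'url', 'context').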

news_total = []
# Walk the first 10 list pages
for i in range(1, 11):
    newsurl = url.format(i)            # one paginated list URL
    newsary = parselistlinks(newsurl)  # all article details on that page
    news_total.extend(newsary)

df = pd.DataFrame(news_total)
# print(df.head())
df.to_csv('新浪微博.csv')  # to_csv returns None when given a path; the file is written as a side effect
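# Note: if the CSV will be opened in Excel, Chinese text can show up garbled
# under plain UTF-8; passing encoding='utf_8_sig' adds a BOM Excel recognizes
# (an optional tweak, not part of the original script):
# df.to_csv('新浪微博.csv', encoding='utf_8_sig')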