Scraping all historical articles and replies from the 缠中说禅 Sina blog with Python (Part 3)

Earlier posts in this series:
Part 1, scraping all historical articles and replies from 缠师's earliest Tianya forum threads: https://www.jianshu.com/p/81a5da4fa161
Part 2, scraping all historical articles and replies from the 缠中说禅 Kaidi forum: https://www.jianshu.com/p/2bcfab6c906b

Preparation

  • Target address: 缠中说禅's Sina blog (the base_url in the code below)
  • Note: the first article, 无话可说, has a huge number of comments under it; because 缠师 stopped updating, everyone kept leaving comments on that one article. The volume is so large that it is best scraped separately.
  • Volume: full crawl (all 1,073 articles)
  • Output: doc or txt, sorted by time, with each file named after its article

Scraping approach

Pagination

[Figure 1: 翻页.png, the list-page pagination controls]
  • Grab the details from each list page:
    • Each article's title. When generating the txt file, characters such as * + ? are illegal in filenames and have to be stripped. One gotcha: 缠师 sometimes mixes full-width and half-width punctuation, so the safest approach is to match both the half-width and full-width form of every illegal character (see the sketch after this list).
    • Each article's URL
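
A minimal sketch of both steps. The blog ID and the 22-page count come from the full listing below; the illegal-character set in the regex is my own assumption, a superset of the characters Windows forbids plus their full-width forms:

# -*- coding: utf-8 -*-
import re

# Characters that are illegal in Windows filenames, plus their full-width
# forms (this exact set is an assumption; extend it as needed).
ILLEGAL = re.compile(u'[\\\\/:*?"<>|：＊？“”《》｜]')

def safe_filename(atr_time, atr_name):
    # Prefix the timestamp so an alphabetical sort is a chronological sort.
    return atr_time.replace(':', '-') + ILLEGAL.sub('', atr_name) + '.txt'

# The list pages themselves are plain numbered URLs:
base_url = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_'
list_pages = [base_url + str(i) + '.html' for i in range(1, 23)]  # 22 pages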

Single-page details

  • Clicking "load more" under the comments fires an AJAX request
  • From the first page's AJAX response, use page_size and the total to work out how many pages of comments there are


    [Figure 2: 单页的ajax.png, the comment AJAX request on a single article page]
  • Parse each comment by walking the JSON fields you need (see the sketch after this section)


    [Figure 3: parsing the comment JSON]
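
A condensed sketch of the two steps above, using the same comment endpoint and JSON fields as the full listing below; the ceiling division is how the page count falls out of page_size and the total:

import requests

COMMENT_API = ('http://comment5.news.sina.com.cn/page/info?channel=blog'
               '&newsid=%s&page_size=%d&oe=utf-8&list=asc&thread=1&page=%d')
page_size = 50

def iter_comments(atr_id):
    # First request: read the total number of shown comments, then derive
    # the number of pages with a ceiling division.
    first = requests.get(COMMENT_API % (atr_id, page_size, 1)).json()
    total = first['result']['count']['show']
    pages = (total + page_size - 1) // page_size
    for page in range(1, pages + 1):
        data = requests.get(COMMENT_API % (atr_id, page_size, page)).json()
        for c in data['result']['cmntlist']:
            yield c['nick'], c['time'], c['area'], c['ip'], c['content']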

Summary

  • Sina is fairly easy to deal with; the logic is not complicated
  • Watch out for illegal characters when naming the txt files
  • Remember to use multithreading, remember to use multithreading, remember to use multithreading!!! My code is single-threaded and took a very long time to run; worse, while debugging, hitting an illegal character meant starting over from scratch. A sketch follows this list.
  • When I ran this in January, all 22 list pages could still be opened; now only 20 can, since a lot has been censored. My archive is not complete either; it is on Baidu Cloud for anyone who needs it.
  • If this helped you, please give it a like
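
A hedged sketch of the multithreading suggestion: a small thread pool that crawls several list pages in parallel. get_artInfo and file_path refer to the listing below, and the pool size of 4 is an arbitrary, polite choice:

from multiprocessing.dummy import Pool  # thread pool behind the Pool API

def crawl_list_page(i):
    # Assumes get_artInfo and file_path from the listing below are in scope.
    url = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_%d.html' % i
    get_artInfo(url, file_path)

pool = Pool(4)
pool.map(crawl_list_page, range(1, 23))
pool.close()
pool.join()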

Code

#!/usr/bin/python
# -*- coding: utf-8 -*-

from pyquery import PyQuery as pq
import codecs
import re
import os
import requests
import time

headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.82 Safari/537.36'}
page_size=50


def makedir(dirName):
    # Create the output directory on the G: drive (adjust for your machine).
    targetPath = 'G://' + dirName
    if not os.path.exists(targetPath):
        os.makedirs(targetPath)
    else:
        print('Path already exists!')
    return targetPath

def get_artList(file_path):
    # Walk all 22 list pages of the blog (the original test run only did
    # range(22, 23), i.e. the last page).
    base_url = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_'
    for i in range(1, 23):
        url = base_url + str(i) + '.html'
        print url, file_path
        get_artInfo(url, file_path)

def get_artInfo(url, file_path):
    # Parse one list page: extract each article's URL, title, time and id.
    atr_html = requests.get(url, headers=headers)
    atr_html.encoding = 'utf-8'
    atr_html = atr_html.text
    page_doc = pq(atr_html)
    for atr in page_doc('div.SG_connBody > div.article_blk > div.articleList').items('div[class="articleCell SG_j_linedot1"]'):
        atr_url = atr('p>span>a').attr('href')
        # Strip filename-illegal characters in both half- and full-width forms.
        atr_name = atr('p>span>a').text().replace('/', '-').replace('\\', '-').replace(':', '-').replace('*', '').replace('?', '').replace(u'"', '').replace('<', '(').replace('>', ')').replace(u'？', '').replace(u'：', '')
        atr_time = atr('p[class="atc_info"]').text().replace(':', '-')
        atr_info = atr('p[class="atc_info"]>span[class="atc_data"]')
        atr_id = atr_info.attr('id').replace('count_', '')
        if atr_name == u'无话可说':
            # The first article has far too many comments; scrape it separately.
            print u'Skipping the first article, 无话可说'
        else:
            print atr_url, atr_name, atr_time, atr_id
            get_atrDetail(atr_url, atr_name, atr_time, atr_id, file_path)

def get_atrDetail(atr_url, atr_name, atr_time, atr_id, file_path):
    # Fetch one article page and save its body plus all comments to a txt file.
    page_file = requests.get(atr_url, headers=headers)
    page_file.encoding = 'utf-8'
    page_file = page_file.text
    doc = pq(page_file)
    content = doc('#sina_keyword_ad_area2').text()
    post_title = doc('h2').text()
    print post_title
    try:
        post_tag = u'Tags: ' + doc('#sina_keyword_ad_area > table >tr>td').text().replace('\n', '').split(u'标签:')[1]
    except:
        post_tag = u''
    # The first comment request tells us the total number of comments, from
    # which we work out how many pages to fetch.
    reply_firstPage = 1
    reply_api = 'http://comment5.news.sina.com.cn/page/info?channel=blog&newsid=%s&page_size=%d&oe=utf-8&list=asc&thread=1&page=%d' % (atr_id, page_size, reply_firstPage)
    print reply_api
    reply_file = requests.get(reply_api)
    reply_json = reply_file.json()
    reply_showNum = reply_json['result']['count']['show']
    reply_page = reply_showNum / page_size
    print reply_showNum, reply_page

    ########## Overall post information: title, tags, time, body ##########
    filename = file_path + '//' + atr_time + atr_name + '.txt'
    f = codecs.open(filename, 'w', 'utf-8')
    f.write(u'Post title: ' + post_title + '\r\n')
    f.write(u'Post tags: ' + post_tag + '\r\n')
    f.write(u'Post time: ' + atr_time + '\r\n')
    f.write(u'Post URL: ' + atr_url + '\r\n')
    f.write(u'Post body: ' + content + '\r\n')
    f.write(u'#################### separator ####################\r\n\r\n')

    ########## The post's comments ##########
    # reply_page is floor(total/page_size), so the +2 makes range() cover
    # the final partial page as well.
    for i in range(1, reply_page + 2):
        reply_url = 'http://comment5.news.sina.com.cn/page/info?channel=blog&newsid=%s&page_size=%d&oe=utf-8&list=asc&thread=1&page=%d' % (atr_id, page_size, i)
        print reply_url
        reply_file = requests.get(reply_url, headers=headers)
        time.sleep(3)  # be gentle with the comment API
        reply_json = reply_file.json()
        reply_list = reply_json['result']['cmntlist']
        for j in range(0, len(reply_list)):
            reply_nick = reply_list[j]['nick']
            reply_content = reply_list[j]['content']
            reply_time = reply_list[j]['time']
            reply_ip = reply_list[j]['ip']
            reply_area = reply_list[j]['area']
            # Number floors from page_size, not from the current page's
            # length, so numbering stays correct on the final partial page.
            floor = (i - 1) * page_size + (j + 1)
            f.write(u'    ' + u'Floor %d: ' % floor + reply_nick + '\r\n')
            f.write(u'    ' + u'Location/IP: ' + reply_area + reply_ip + '\r\n')
            f.write(u'    ' + u'Comment: ' + reply_content + '\r\n')
            f.write(u'    ' + u'Time: ' + reply_time + '\r\n')
            f.write(u'#################### separator ####################\r\n\r\n')
            print u'Floor %d: ' % floor, reply_nick

    f.close()



if __name__ == '__main__':
    file_path = makedir(u'缠中说禅新浪test')
    get_artList(file_path)
    # Single-page / single-article test calls:
    # get_artInfo('http://blog.sina.com.cn/s/articlelist_1215172700_0_1.html', file_path)
    # get_atrDetail(u'http://blog.sina.com.cn/s/blog_486e105c01000alq.html', u'各位,今天爽了吗?'.replace('?', ''), u'2007-05-16 15-26', u'486e105c01000alq', file_path)
