Scraping all historical articles and replies from the 缠中说禅 Sina blog with Python (Part 3)

Earlier posts in this series:
Part 1, scraping all historical articles and replies from 缠师's earliest Tianya forum threads: https://www.jianshu.com/p/81a5da4fa161
Part 2, scraping all historical articles and replies from the 缠中说禅 Kaidi forum: https://www.jianshu.com/p/2bcfab6c906b

Preparation

  • Target address: 缠中说禅's Sina blog (the base_url in the code below)
  • Note: the first article, 无话可说, has a huge number of comments under it; because 缠师 stopped updating, everyone kept leaving comments on that one article. The volume is so large that it is best scraped separately.
  • Volume: full crawl (all 1,073 articles)
  • Output: doc or txt, sorted by time, with each file named after its article

Scraping approach

Pagination

[Figure 1: 翻页.png, the list-page pagination controls]
  • Grab the details from each list page:
    • Each article's title. When generating the txt file, characters such as * + ? are illegal in filenames and have to be stripped. One gotcha: 缠师 sometimes mixes full-width and half-width punctuation, so the safest approach is to match both the half-width and full-width form of every illegal character (see the sketch after this list).
    • Each article's URL
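
A minimal sketch of both steps. The blog ID and the 22-page count come from the full listing below; the illegal-character set in the regex is my own assumption, a superset of the characters Windows forbids plus their full-width forms:

# -*- coding: utf-8 -*-
import re

# Characters that are illegal in Windows filenames, plus their full-width
# forms (this exact set is an assumption; extend it as needed).
ILLEGAL = re.compile(u'[\\\\/:*?"<>|：＊？“”《》｜]')

def safe_filename(atr_time, atr_name):
    # Prefix the timestamp so an alphabetical sort is a chronological sort.
    return atr_time.replace(':', '-') + ILLEGAL.sub('', atr_name) + '.txt'

# The list pages themselves are plain numbered URLs:
base_url = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_'
list_pages = [base_url + str(i) + '.html' for i in range(1, 23)]  # 22 pages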

Single-page details

  • Clicking "load more" under the comments fires an AJAX request
  • From the first page's AJAX response, use page_size and the total to work out how many pages of comments there are


    [Figure 2: 单页的ajax.png, the comment AJAX request on a single article page]
  • Parse each comment by walking the JSON fields you need (see the sketch after this section)


    [Figure 3: parsing the comment JSON]
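
A condensed sketch of the two steps above, using the same comment endpoint and JSON fields as the full listing below; the ceiling division is how the page count falls out of page_size and the total:

import requests

COMMENT_API = ('http://comment5.news.sina.com.cn/page/info?channel=blog'
               '&newsid=%s&page_size=%d&oe=utf-8&list=asc&thread=1&page=%d')
page_size = 50

def iter_comments(atr_id):
    # First request: read the total number of shown comments, then derive
    # the number of pages with a ceiling division.
    first = requests.get(COMMENT_API % (atr_id, page_size, 1)).json()
    total = first['result']['count']['show']
    pages = (total + page_size - 1) // page_size
    for page in range(1, pages + 1):
        data = requests.get(COMMENT_API % (atr_id, page_size, page)).json()
        for c in data['result']['cmntlist']:
            yield c['nick'], c['time'], c['area'], c['ip'], c['content']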

Summary

  • Sina is fairly easy to deal with; the logic is not complicated
  • Watch out for illegal characters when naming the txt files
  • Remember to use multithreading, remember to use multithreading, remember to use multithreading!!! My code is single-threaded and took a very long time to run; worse, while debugging, hitting an illegal character meant starting over from scratch. A sketch follows this list.
  • When I ran this in January, all 22 list pages could still be opened; now only 20 can, since a lot has been censored. My archive is not complete either; it is on Baidu Cloud for anyone who needs it.
  • If this helped you, please give it a like
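
A hedged sketch of the multithreading suggestion: a small thread pool that crawls several list pages in parallel. get_artInfo and file_path refer to the listing below, and the pool size of 4 is an arbitrary, polite choice:

from multiprocessing.dummy import Pool  # thread pool behind the Pool API

def crawl_list_page(i):
    # Assumes get_artInfo and file_path from the listing below are in scope.
    url = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_%d.html' % i
    get_artInfo(url, file_path)

pool = Pool(4)
pool.map(crawl_list_page, range(1, 23))
pool.close()
pool.join()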

Code

#!/usr/bin/python
# -*- coding: utf-8 -*-

from pyquery import PyQuery as pq
import codecs
import re
import os
import requests
import time

headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.82 Safari/537.36'}
page_size=50


def makedir(dirName):
    # Create the output directory on the G: drive (adjust for your machine).
    targetPath = 'G://' + dirName
    if not os.path.exists(targetPath):
        os.makedirs(targetPath)
    else:
        print('Path already exists!')
    return targetPath

def get_artList(file_path):
    # Walk all 22 list pages of the blog (the original test run only did
    # range(22, 23), i.e. the last page).
    base_url = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_'
    for i in range(1, 23):
        url = base_url + str(i) + '.html'
        print url, file_path
        get_artInfo(url, file_path)

def get_artInfo(url, file_path):
    # Parse one list page: extract each article's URL, title, time and id.
    atr_html = requests.get(url, headers=headers)
    atr_html.encoding = 'utf-8'
    atr_html = atr_html.text
    page_doc = pq(atr_html)
    for atr in page_doc('div.SG_connBody > div.article_blk > div.articleList').items('div[class="articleCell SG_j_linedot1"]'):
        atr_url = atr('p>span>a').attr('href')
        # Strip filename-illegal characters in both half- and full-width forms.
        atr_name = atr('p>span>a').text().replace('/', '-').replace('\\', '-').replace(':', '-').replace('*', '').replace('?', '').replace(u'"', '').replace('<', '(').replace('>', ')').replace(u'？', '').replace(u'：', '')
        atr_time = atr('p[class="atc_info"]').text().replace(':', '-')
        atr_info = atr('p[class="atc_info"]>span[class="atc_data"]')
        atr_id = atr_info.attr('id').replace('count_', '')
        if atr_name == u'无话可说':
            # The first article has far too many comments; scrape it separately.
            print u'Skipping the first article, 无话可说'
        else:
            print atr_url, atr_name, atr_time, atr_id
            get_atrDetail(atr_url, atr_name, atr_time, atr_id, file_path)

def get_atrDetail(atr_url, atr_name, atr_time, atr_id, file_path):
    # Fetch one article page and save its body plus all comments to a txt file.
    page_file = requests.get(atr_url, headers=headers)
    page_file.encoding = 'utf-8'
    page_file = page_file.text
    doc = pq(page_file)
    content = doc('#sina_keyword_ad_area2').text()
    post_title = doc('h2').text()
    print post_title
    try:
        post_tag = u'Tags: ' + doc('#sina_keyword_ad_area > table >tr>td').text().replace('\n', '').split(u'标签:')[1]
    except:
        post_tag = u''
    # The first comment request tells us the total number of comments, from
    # which we work out how many pages to fetch.
    reply_firstPage = 1
    reply_api = 'http://comment5.news.sina.com.cn/page/info?channel=blog&newsid=%s&page_size=%d&oe=utf-8&list=asc&thread=1&page=%d' % (atr_id, page_size, reply_firstPage)
    print reply_api
    reply_file = requests.get(reply_api)
    reply_json = reply_file.json()
    reply_showNum = reply_json['result']['count']['show']
    reply_page = reply_showNum / page_size
    print reply_showNum, reply_page

    ########## Overall post information: title, tags, time, body ##########
    filename = file_path + '//' + atr_time + atr_name + '.txt'
    f = codecs.open(filename, 'w', 'utf-8')
    f.write(u'Post title: ' + post_title + '\r\n')
    f.write(u'Post tags: ' + post_tag + '\r\n')
    f.write(u'Post time: ' + atr_time + '\r\n')
    f.write(u'Post URL: ' + atr_url + '\r\n')
    f.write(u'Post body: ' + content + '\r\n')
    f.write(u'#################### separator ####################\r\n\r\n')

    ########## The post's comments ##########
    # reply_page is floor(total/page_size), so the +2 makes range() cover
    # the final partial page as well.
    for i in range(1, reply_page + 2):
        reply_url = 'http://comment5.news.sina.com.cn/page/info?channel=blog&newsid=%s&page_size=%d&oe=utf-8&list=asc&thread=1&page=%d' % (atr_id, page_size, i)
        print reply_url
        reply_file = requests.get(reply_url, headers=headers)
        time.sleep(3)  # be gentle with the comment API
        reply_json = reply_file.json()
        reply_list = reply_json['result']['cmntlist']
        for j in range(0, len(reply_list)):
            reply_nick = reply_list[j]['nick']
            reply_content = reply_list[j]['content']
            reply_time = reply_list[j]['time']
            reply_ip = reply_list[j]['ip']
            reply_area = reply_list[j]['area']
            # Number floors from page_size, not from the current page's
            # length, so numbering stays correct on the final partial page.
            floor = (i - 1) * page_size + (j + 1)
            f.write(u'    ' + u'Floor %d: ' % floor + reply_nick + '\r\n')
            f.write(u'    ' + u'Location/IP: ' + reply_area + reply_ip + '\r\n')
            f.write(u'    ' + u'Comment: ' + reply_content + '\r\n')
            f.write(u'    ' + u'Time: ' + reply_time + '\r\n')
            f.write(u'#################### separator ####################\r\n\r\n')
            print u'Floor %d: ' % floor, reply_nick

    f.close()



if __name__ == '__main__':
    file_path = makedir(u'缠中说禅新浪test')
    get_artList(file_path)
    # Single-page / single-article test calls:
    # get_artInfo('http://blog.sina.com.cn/s/articlelist_1215172700_0_1.html', file_path)
    # get_atrDetail(u'http://blog.sina.com.cn/s/blog_486e105c01000alq.html', u'各位,今天爽了吗?'.replace('?', ''), u'2007-05-16 15-26', u'486e105c01000alq', file_path)
