While browsing the Tianya forum I sometimes come across threads worth saving, but a good thread easily runs to several hundred pages, padded with spam and flame wars. So I wrote a small Python program that downloads only the posts written by a thread's original poster (OP), and it works quite well.
Its one shortcoming is that images are not downloaded yet; I'll improve that when I have time (one possible approach is sketched at the end of this post).
Here is the code; the comments are all inline:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Download a Tianya thread, "OP only" (只看楼主)
from bs4 import BeautifulSoup
import codecs
import requests
# Main download function
def download_post(home_page, link_prefix):
    # Open the output file
    filename = 'D:/temp/post.txt'
    f = codecs.open(filename, 'w', 'utf-8')

    ########## Fetch the first page ##########
    # Pretend to be a browser; some sites reject the default requests User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.82 Safari/537.36'}
    home_page_file = requests.get(home_page, headers=headers)
    # Force UTF-8 instead of trusting requests' guess at the encoding
    home_page_file.encoding = 'utf-8'
    home_page_file = home_page_file.text
    soup = BeautifulSoup(home_page_file, 'html.parser')
    ########## Thread metadata: title, author, time ##########
    # Thread title
    post_title = soup.select('h1.atl-title > span > span')[0].string.strip()
    post_info = soup.select('div.atl-info > span')
    # Thread author (the OP)
    post_author = post_info[0].select('a')[0].string.strip()
    # Posting time; the span starts with a three-character "时间:" label, so slice it off
    post_time = post_info[1].string[3:].strip()
    f.write(u'Title:  ' + post_title + '\r\n')
    f.write(u'Author: ' + post_author + '\r\n')
    f.write(u'Posted: ' + post_time + '\r\n\r\n')
    f.write(u'#################### separator ####################\r\n\r\n')
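    # At this point post.txt begins with a header of the form:
    #   Title:  <thread title>
    #   Author: <OP's user name>
    #   Posted: <posting time>
    #   #################### separator ####################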
    ########## Posts on the first page ##########
    post_items = soup.select('div.atl-item')
    for item in post_items:
        # The opening post; unlike replies, it carries no js_username attribute
        if not item.get('js_username'):
            post_content = unicode(item.select('div.bbs-content')[0])
            post_content = process_post_content(post_content)
            if post_content:
                f.write(u' ' + post_content + '\r\n\r\n')
                f.write(u'#################### separator ####################\r\n\r\n')
        # A reply: keep it only if the OP wrote it ("OP only")
        elif item.get('js_username') == post_author:
            post_time = item.select('div.atl-info > span')[1].string.strip()
            f.write(u' ' + post_time + '\r\n\r\n')
            post_content = unicode(item.select('div.bbs-content')[0])
            post_content = process_post_content(post_content)
            if post_content:
                f.write(u' ' + post_content + '\r\n\r\n')
                f.write(u'#################### separator ####################\r\n\r\n')
    ########## Pagination ##########
    # Link to the next page; it is absent on the last page, which ends the loop
    next_page = soup.select('div.atl-pages > form > a.js-keyboard-next')
    while next_page:
        next_page_href = next_page[0].get('href')

        ########## Fetch the next page ##########
        # The href is site-relative, hence the link_prefix
        home_page_file = requests.get(
            link_prefix + next_page_href, headers=headers)
        home_page_file.encoding = 'utf-8'
        home_page_file = home_page_file.text
        soup = BeautifulSoup(home_page_file, 'html.parser')

        ########## Posts on this page ##########
        post_items = soup.select('div.atl-item')
        for item in post_items:
            # OP only (the opening post appears only on the first page)
            if item.get('js_username') == post_author:
                post_time = item.select('div.atl-info > span')[1].string.strip()
                f.write(u' ' + post_time + '\r\n\r\n')
                post_content = unicode(item.select('div.bbs-content')[0])
                post_content = process_post_content(post_content)
                if post_content:
                    f.write(u' ' + post_content + '\r\n\r\n')
                    f.write(u'#################### separator ####################\r\n\r\n')

        # Look for a next-page link again
        next_page = soup.select('div.atl-pages > form > a.js-keyboard-next')

    # Close the output file
    f.close()
# Clean up one post body. unicode(tag) above yields the raw HTML of the
# div.bbs-content element, so strip the wrapping div (the opening post's
# div may carry an extra class) and turn <br/> tags into line breaks
def process_post_content(post_content):
    post_content = post_content.replace('<div class="bbs-content">', '')
    post_content = post_content.replace('<div class="bbs-content clearfix">', '')
    post_content = post_content.replace('</div>', '')
    post_content = post_content.replace('<br/>', '\r\n')
    post_content = post_content.strip()
    return post_content
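# For example (illustrative input; the real markup may differ slightly):
#   process_post_content(u'<div class="bbs-content">\n Hi<br/>there\n</div>')
#   returns u'Hi\r\nthere'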
# First page of the Tianya thread; change this value before each run
home_page = 'http://bbs.tianya.cn/post-worldlook-703166-1.shtml'
# Prefix to prepend to the site-relative links met while paging
link_prefix = 'http://bbs.tianya.cn'
download_post(home_page, link_prefix)
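A note on running it: the script targets Python 2 (it relies on the built-in unicode()); under Python 3, unicode() becomes str() and codecs.open(filename, 'w', 'utf-8') can be replaced by open(filename, 'w', encoding='utf-8').

As for the missing image support, here is the direction I have in mind, as a minimal, untested sketch. It assumes each picture in a post body is an <img> tag whose URL sits in the src attribute, or in an original attribute when Tianya lazy-loads it; the helper name save_images and the save directory are placeholders of mine, not part of the script above.

import os

# Download every image found in one post body ("item" is one of the
# div.atl-item tags from the loops above); returns the local file paths
def save_images(item, headers, save_dir='D:/temp/images'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saved = []
    for img in item.select('div.bbs-content img'):
        # 'original' is an assumption about where the lazy-loaded URL
        # lives; check it against the actual page source
        url = img.get('original') or img.get('src')
        if not url:
            continue
        if url.startswith('//'):
            url = 'http:' + url
        local_name = os.path.join(save_dir, url.split('/')[-1])
        resp = requests.get(url, headers=headers)
        with open(local_name, 'wb') as img_file:
            img_file.write(resp.content)
        saved.append(local_name)
    return saved

Each saved path could then be substituted for the corresponding <img> tag in the post text instead of leaving the raw tag behind.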