之前也因为感兴趣, 写过一个抓取桌面天空里面喜欢的动漫壁纸的爬虫代码。这两天突然看到有人写了这么一篇文章: 爬取京东本周热卖商品基本信息存入MySQL
觉得蛮有趣的, 正好临近找工作的季节, 就想着能不能写个爬虫, 把咱们学校bbs上面相关的板块里面的发帖信息给记录下来。
http://bbs.ustc.edu.cn/cgi/bbsbfind?type=1&board=Job&title=&title2=&title3=&userid=&dt=7&og=on&boardordigest=0&labelabc=0
我们发现 URL 中真正需要变化的只有 board=Job 这一部分, 将 Job 改成 Intern 就可以得到实习板块的相关信息。
然后我们查看网页的源码:
由于我们需要得到作者、日期、帖子的链接地址、标题这四个信息, 很容易分析得到我们所需要使用的正则表达式:
string = "author.*?><a.*?>(.*?)</a>.*?datetime.*?>(.*?)<.*?title.*?<a.*?=(.*?)>(.*?)</a>"
其中第三个捕获组得到的是帖子的相对地址 (拼接在 http://bbs.ustc.edu.cn/cgi/ 之后即可访问), 同样可以分析得到提取帖子内容的正则表达式为:
pattern_str = "WWWPOST(.*?)<br/>--"
至此, 基本分析完毕
UnicodeDecodeError: 'XXX' codec can't decode bytes in position … 错误信息的解决办法
UnicodeEncodeError: 'ascii' codec can't encode characters in position …
解决方法
有两种办法可以解决
1.换成python 3.x
2.在代码前面加上
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# -*- coding:utf-8 -*-
''' Created on 2016-5-2 获取 BBS 相关板块的信息 @author: ThinkPad User '''
import sys
# Python 2 only workaround for the UnicodeDecodeError/UnicodeEncodeError
# described above: reload(sys) makes sys.setdefaultencoding callable again
# (presumably because the interpreter removes it at startup -- confirm
# against the Python 2 docs).
reload(sys)
# Use UTF-8 for implicit str <-> unicode conversions, so that calling
# .encode() on the UTF-8 byte strings used below does not fail under the
# default 'ascii' codec.
sys.setdefaultencoding('utf8')
import urllib
import urllib2
import re
class BBSSpider:
    """Interactive console browser for boards of bbs.ustc.edu.cn.

    Written for Python 2 (it relies on ``urllib2`` and ``raw_input``).
    Fetched pages are decoded from ``self.charaterset`` and then handled as
    UTF-8 byte strings; listing lines, prompts and post bodies are encoded
    back to ``self.charaterset`` before being printed.

    Attributes:
        baseURL: URL most recently passed to getHtml.
        enable: Loop flag of getDetails; cleared when the user enters "Q"
            there and set again by getBoard.
        charaterset: Encoding used to decode pages and to encode console
            output (attribute name kept as-is for compatibility).
    """

    # Search page listing a board's recent posts without replies; the board
    # name is inserted between the two parts.
    _SEARCH_URL_HEAD = "http://bbs.ustc.edu.cn/cgi/bbsbfind?type=1&board="
    _SEARCH_URL_TAIL = "&title=&title2=&title3=&userid=&dt=7&og=on&boardordigest=0&labelabc=0"
    # Post links captured from the listing are relative to this prefix.
    _POST_URL_HEAD = "http://bbs.ustc.edu.cn/cgi/"

    # Groups: (author, post time, relative link of the post, title).
    _ITEM_RE = re.compile(
        r"author.*?><a.*?>(.*?)</a>.*?datetime.*?>(.*?)<.*?title.*?<a.*?=(.*?)>(.*?)</a>",
        re.S)
    # Group: body of a post, i.e. the text between "WWWPOST" and "<br/>--".
    _POST_RE = re.compile("WWWPOST(.*?)<br/>--", re.S)

    def __init__(self):
        self.baseURL = ""
        self.enable = True
        self.charaterset = "gb2312"

    def getHtml(self, url):
        """Download ``url`` and return its content as a UTF-8 byte string.

        The raw bytes are decoded with ``self.charaterset`` (undecodable
        bytes are dropped) and re-encoded as UTF-8.

        Returns:
            The page content, or None when ``urllib2.URLError`` is raised;
            in that case the reason, if available, is printed.
        """
        self.baseURL = url
        try:
            response = urllib2.urlopen(urllib2.Request(self.baseURL))
            try:
                raw = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                message = "连接bbs 失败, 原因" + str(e.reason)
                print(message.encode(self.charaterset))
            return None
        return raw.decode(self.charaterset, 'ignore').encode("utf-8")

    def removeNoise(self, content):
        """Strip HTML noise from a fragment of page text.

        ``&nbsp;`` becomes a space, ``&amp;`` becomes ``&``, ``<br/>``
        becomes a newline, runs of newlines collapse to a single one, and
        surrounding whitespace is removed.
        """
        replacements = (("&nbsp;", " "), ("&amp;", "&"), ("<br/>", "\n"))
        for noise, plain in replacements:
            content = content.replace(noise, plain).strip()
        # Drop the blank lines left behind by consecutive line breaks.
        return re.sub(r"\n+", "\n", content).strip()

    def getItem(self, board):
        """Print the recent posts of ``board`` and return their links.

        Each post is printed as one line holding its id (position in the
        listing), author, post time and cleaned title.

        Returns:
            A list of relative post links indexed by the printed id, or
            None when the listing page could not be loaded.
        """
        url = self._SEARCH_URL_HEAD + str(board) + self._SEARCH_URL_TAIL
        content = self.getHtml(url)
        if not content:
            print("加载页面失败")
            return None
        stories = []
        for count, item in enumerate(self._ITEM_RE.findall(content)):
            author, posted, link, title = item
            stories.append(link)
            line = "id:%3d\t发帖人:%20s\t发帖时间:%20s\t发帖标题:%40s" % (
                count, author, posted, self.removeNoise(title))
            print(line.encode(self.charaterset))
        return stories

    def getDetails(self, board):
        """List the posts of ``board`` and show the ones the user picks.

        Keeps asking for a post id until the user enters "Q". Returns
        early when the listing is empty or unavailable, or when a post
        page fails to load.
        """
        stories = self.getItem(board)
        if not stories:
            return
        prompt = "\n\n================【请输入需要查看的帖子的id, 按 Q 退出】==============".encode(self.charaterset)
        while self.enable:
            choice = self._readIndex(prompt, len(stories))
            if choice is None:
                self.enable = False
                break
            content = self.getHtml(self._POST_URL_HEAD + stories[choice])
            if not content:
                print("获取网页信息失败")
                return
            for body in self._POST_RE.findall(content):
                print(self.removeNoise(body).encode(self.charaterset))

    def getBoard(self):
        """Show the menu of common boards and browse the ones the user picks.

        Keeps asking for a board id until the user enters "Q".
        """
        boards = ['Job', 'Intern', 'SecondHand', 'PieBridge', 'Free', 'PMPI', 'Badminton', 'Swimming']
        for count, name in enumerate(boards):
            print("id:%d board:%15s" % (count, name))
        prompt = "\n\n===================【请输入需要查看的板块的id, 按 Q 退出】================".encode(self.charaterset)
        while True:
            # getDetails clears this flag when the user leaves a board.
            self.enable = True
            choice = self._readIndex(prompt, len(boards))
            if choice is None:
                break
            self.getDetails(boards[choice])

    def _readIndex(self, prompt, total):
        """Ask until the user enters an id in ``range(total)`` or "Q".

        Returns:
            The chosen index, or None when the user entered "Q".
        """
        while True:
            answer = raw_input(prompt)
            if answer == "Q":
                return None
            try:
                index = int(answer)
            except ValueError:
                # Not a number: ask again.
                continue
            if 0 <= index < total:
                return index
# Script entry point: start the interactive board menu.
if __name__ == "__main__":
    bbs = BBSSpider()
    bbs.getBoard()
    # To skip the menu and browse a single board: bbs.getDetails("Job")