I've been learning Python recently, so I put together a simple crawler project to consolidate what I've learned.
My approach (sketched in code right after this list):
1. Fetch every listing page that contains the thread topics;
2. Extract the thread URLs from those pages and save them into a queue, one URL per thread;
3. Take the URLs off the queue one by one and fetch each thread's content.
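In code, the pipeline looks roughly like the sketch below. It reuses the functions defined later in this post; note that the actual main() further down iterates over a plain list rather than a deque.
from collections import deque

def crawl(base_url):
    # 1. every listing page that contains thread topics
    list_url = getMainPageUrl(base_url)
    # 2. extract the thread URLs and put them in a queue
    queue = deque(getSingleUrl(list_url))
    # 3. take URLs off the queue one by one and fetch each thread's content
    while queue:
        getTitlePageContent(queue.popleft())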
The implementation:
First, fetch the full content of a page.
I originally used urllib but later switched to requests, which I find much more convenient.
def getPageAllContent(base_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    try:
        request = requests.get(base_url, headers=headers)  # pass headers so the User-Agent is actually sent
        # request = urllib.request.Request(base_url, headers=headers)
        # response = urllib.request.urlopen(request).read()  # fetch the page content
        # print(response)
        return request
    except Exception as e:
        if hasattr(e, "reason"):
            print(e.reason)
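A quick sanity check of this helper, using the 北宋 forum URL that appears at the bottom of the post:
resp = getPageAllContent('https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=0')
print(resp.status_code)   # expect 200
print(resp.text[:200])    # a peek at the returned HTML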
Each listing page of a Tieba forum shows 50 threads, so a loop can build the URL of every listing page.
for i in range(0, pageNum):
    list_url.append('https://tieba.baidu.com/f?kw=' + tiebaName[0] + '&ie=utf-8&pn=' + str(i*50))
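With pageNum = 3, for example, the loop would produce:
https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=0
https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=50
https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=100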
The total number of listing pages is the topic count divided by 50, and the topic count itself is extracted with a regular expression.
def getTitlePageNumber(base_url):
    content = getPageAllContent(base_url)
    titleNum = re.findall(r'共有主题数([0-9]+)个', content.text)
    # print(titleNum[0])
    pageNum = math.floor(int(titleNum[0]) / 50)
    # print(pageNum)
    return int(pageNum)
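To see what the regex is doing, here is a standalone check against an assumed fragment of the listing-page text (the real Tieba markup may wrap the number in extra tags, in which case the pattern would need adjusting):
import re
import math

sample = '...共有主题数3120个,贴子数...'   # assumed snippet of the page text
titleNum = re.findall(r'共有主题数([0-9]+)个', sample)
print(titleNum[0])                          # '3120'
print(math.floor(int(titleNum[0]) / 50))    # 62 listing pages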
The thread URLs are extracted with XPath. If a URL contains Chinese characters, it has to be percent-encoded first.
def getSingleUrl(list_url):
    final_url = []
    for url in list_url:
        print(url)
        urlcode = quote(url, safe='/:?=&')
        content = getPageAllContent(urlcode)
        # print(content)
        txt = content.text
        html_obj = html.fromstring(txt)
        urls = html_obj.xpath('//*[@id="thread_list"]/li/div/div[2]/div/div/a[@rel]/@href')
        for _url in urls:
            # print(_url)
            final_url.append("https://tieba.baidu.com" + _url)
    # print(final_url)
    return final_url
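The safe='/:?=&' argument keeps the URL structure intact and only percent-encodes the Chinese characters:
from urllib.parse import quote

url = 'https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=0'
print(quote(url, safe='/:?=&'))
# -> https://tieba.baidu.com/f?kw=%E5%8C%97%E5%AE%8B&ie=utf-8&pn=0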
Fetching the content of every floor (reply) from a thread URL:
def getTitlePageContent(single_url):
    page = getContentPageNumber(single_url)
    # print(page)
    for i in range(1, int(page) + 1):
        page_url = "%s?pn=%s" % (single_url, i)
        content = getPageAllContent(page_url)
        txt = content.text
        content_obj = html.fromstring(txt)
        if i == 1:
            tieTitle = content_obj.xpath("//*[@id='j_core_title_wrap']/h3/@title")
            saveTitle(i, str(tieTitle))
        tr_list = content_obj.xpath("//*[@id='j_p_postlist']//div[@data-field]/div")
        # print(type(tr_list))
        for j, tr in enumerate(tr_list):
            # print(type(tr))
            # Assumption: the rest of this loop body is missing from the post; presumably it
            # extracts each floor's text from tr and saves it, roughly like this:
            floor_text = tr.xpath(".//div[contains(@class,'d_post_content')]//text()")
            saveContent(i, j + 1, "".join(floor_text).strip())
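The saveTitle call above (and the saveContent helper assumed in the loop) are not defined anywhere in the post. A minimal sketch of what such helpers might look like, assuming the simplest possible behaviour of appending everything to one local text file:
def saveTitle(page_no, title):
    # Assumed helper: append the thread title to a local file.
    with open('tieba_content.txt', 'a', encoding='utf-8') as f:
        f.write('===== %s =====\n' % title)

def saveContent(page_no, floor_no, text):
    # Assumed helper: append one floor's text to the same file.
    with open('tieba_content.txt', 'a', encoding='utf-8') as f:
        f.write('#%s.%s %s\n' % (page_no, floor_no, text))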
Here is the full code:
#-*- coding:utf-8 -*-
# import urllib.request
import requests
import math
import re
from lxml import html
from urllib.parse import quote
# Build the URLs of all listing pages to crawl
def getMainPageUrl(base_url):
    list_url = []
    pageNum = getTitlePageNumber(base_url)
    tiebaName = re.findall(r'f?kw=(.+)&ie', base_url)
    for i in range(0, pageNum):
        list_url.append('https://tieba.baidu.com/f?kw=' + tiebaName[0] + '&ie=utf-8&pn=' + str(i*50))
    # print(list_url[0])
    return list_url
# Get the URL of each individual thread
def getSingleUrl(list_url):
    final_url = []
    for url in list_url:
        print(url)
        urlcode = quote(url, safe='/:?=&')
        content = getPageAllContent(urlcode)
        # print(content)
        txt = content.text
        html_obj = html.fromstring(txt)
        urls = html_obj.xpath('//*[@id="thread_list"]/li/div/div[2]/div/div/a[@rel]/@href')
        for _url in urls:
            # print(_url)
            final_url.append("https://tieba.baidu.com" + _url)
    # print(final_url)
    return final_url
# Fetch the entire content of a page
def getPageAllContent(base_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    try:
        request = requests.get(base_url, headers=headers)  # pass headers so the User-Agent is actually sent
        # request = urllib.request.Request(base_url, headers=headers)
        # response = urllib.request.urlopen(request).read()  # fetch the page content
        # print(response)
        return request
    except Exception as e:
        if hasattr(e, "reason"):
            print(e.reason)
# Fetch the content of every page inside a thread
def getTitlePageContent(single_url):
    page = getContentPageNumber(single_url)
    # print(page)
    for i in range(1, int(page) + 1):
        page_url = "%s?pn=%s" % (single_url, i)
        content = getPageAllContent(page_url)
        txt = content.text
        content_obj = html.fromstring(txt)
        if i == 1:
            tieTitle = content_obj.xpath("//*[@id='j_core_title_wrap']/h3/@title")
            saveTitle(i, str(tieTitle))
        tr_list = content_obj.xpath("//*[@id='j_p_postlist']//div[@data-field]/div")
        # print(type(tr_list))
        for j, tr in enumerate(tr_list):
            # print(type(tr))
            # Assumption: the original loop body is missing from the post; presumably it
            # extracts each floor's text and saves it, roughly like this:
            floor_text = tr.xpath(".//div[contains(@class,'d_post_content')]//text()")
            saveContent(i, j + 1, "".join(floor_text).strip())
# Get the number of listing pages in the forum
def getTitlePageNumber(base_url):
    content = getPageAllContent(base_url)
    titleNum = re.findall(r'共有主题数([0-9]+)个', content.text)
    # print(titleNum[0])
    pageNum = math.floor(int(titleNum[0]) / 50)
    # print(pageNum)
    return int(pageNum)
# Get the number of pages inside a thread
def getContentPageNumber(url):
    content = getPageAllContent(url)
    pageNum = re.findall(r'回复贴,共([0-9]+)页', content.text)
    # print(pageNum[0])
    return pageNum[0]
def main(base_url):
    # getPageAllContent(base_url)
    # getTitlePageNumber(base_url)
    # getContentPageNumber(base_url)
    # getMainPageUrl(base_url)
    # list_url = getMainPageUrl(base_url)
    # getSingleUrl(list_url)
    # getTitlePageContent(base_url)
    # Get the URL of every listing page of the target forum
    list_url = getMainPageUrl(base_url)
    # Extract the individual thread URLs from each listing page
    single_url = getSingleUrl(list_url)
    # Visit each thread URL in turn and fetch its content
    for j, surl in enumerate(single_url):
        getTitlePageContent(surl)
        print('正在写入第' + str(j+1) + '条...')
base_url = 'https://tieba.baidu.com/f?kw=北宋&ie=utf-8&pn=0'
# base_url = 'https://tieba.baidu.com/p/3864746283'
if __name__ == '__main__':
    main(base_url)
In actual runs the program still has a few small bugs, and the crawling efficiency also needs further optimization.
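One straightforward improvement on the efficiency and robustness front (not part of the original code) would be to reuse a single requests.Session, set a timeout, and pause briefly between requests. A minimal sketch of how getPageAllContent could be adapted:
import time
import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})

def getPageAllContent(base_url):
    try:
        resp = session.get(base_url, timeout=10)  # reuse the connection, fail fast on hangs
        time.sleep(0.5)                           # be polite between requests
        return resp
    except requests.RequestException as e:
        print(e)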