本文内容围绕获取虎扑步行街论坛上所有帖子的数据开展的,爬取的内容包括帖子名称、帖子链接、作者、作者链接、创建时间、回复数、浏览数、最后回复用户和最后回复时间,将爬取的数据分别保存在MongoDB和MySQL里
网页地址: https://bbs.hupu.com/bxj
在运行代码前记得启动MySQL、MongoDB,还需要在MySQL中创建所使用到的库与表
在MySQL中可能将要使用到的命令:
创建mypython这个库
create database mypython;
使用mypython
use mypython;
创建数据表hupu
create table hupu(
name varchar(100) null,
url varchar(50) null,
author varchar(100) null,
author_href varchar(50) null,
forum_time varchar(30) null,
reply_counts varchar(10) null,
browse_counts varchar(10) null,
endreplyname varchar(100) null,
endreplytime varchar(30) null);
具体python代码如下:
import requests
import time
import pymysql
import pymongo
from lxml import etree
#创建连接 默认IP(本机)为localhost,端口号为27017
client = pymongo.MongoClient('localhost', 27017)
# 连接数据库,并创建pythonwork库
mydb = client['pythonwork']
#连接表,并创建hupu集合
dataline = mydb['hupu']
#连接mysql数据库
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='mypython',port=3306,charset='utf8')
#光标对象
cursor=conn.cursor()
#请求头
headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'}
#获取各页全部文章的链接
def get_href(url):
html=requests.get(url,headers=headers)
selector=etree.HTML(html.text)
hrefs=selector.xpath('//ul/li/div/div[@class="post-title"]/a/@href')
for href in hrefs:
get_forum("https://bbs.hupu.com"+href)
def get_forum(url):
html=requests.get(url,headers=headers)
result=etree.HTML(html.text)
try:
name=result.xpath('//*[@id="container"]/div/div[3]/div[2]/div[1]/div/div/div[2]/div/h1/text()')[0] #获取帖子名
author=result.xpath('//*[@id="container"]/div/div[3]/div[2]/div[1]/div/div/div[4]/div/div[1]/div/div[1]/div/a/text()')[0] #获取作者名
author_href=result.xpath('//*[@id="container"]/div/div[3]/div[2]/div[1]/div/div/div[4]/div/div[1]/div/div[1]/div/a/@href')[0] #获取作者链接
forum_time=result.xpath('//*[@id="container"]/div/div[3]/div[2]/div[1]/div/div/div[4]/div/div[1]/div/div[1]/div/span[3]/text()')[0] #获取创建时间
reply_counts=result.xpath('//*[@id="container"]/div/div[3]/div[2]/div[1]/div/div/div[2]/div/span[1]/text()')[0] #获取回复数
browse_counts=result.xpath('//*[@id="container"]/div/div[3]/div[2]/div[1]/div/div/div[2]/div/span[3]/text()')[0] #获取浏览数
#判断回复数是否大于20,若大于20则需要进入最后的回复页面进行爬取
if int(reply_counts)>20:
#获取回复一共有几页
endpage=result.xpath('//ul[@class="hupu-rc-pagination"]/li/a/text()')[-1]
#构造回复的最后一页链接
endurl=url[:-5]+'-'+endpage+'.html'
#爬取最后一页回复信息
endhtml=requests.get(endurl,headers=headers)
endresult=etree.HTML(endhtml.text)
#获取最后回复用户名与时间
endreplyname=endresult.xpath('//div[@class="bbs-post-wrapper-content"]/div/div[last()]/div/div[2]/div[1]/div/div[1]/a/text()')[0]
endreplytime=endresult.xpath('//div[@class="bbs-post-wrapper-content"]/div/div[last()]/div/div[2]/div[1]/div/div[1]/span/text()')[0]
#如果回复总数小于或等于20,那么就可直接在当前页面爬取最后回复者的信息
else:
endreplyname=result.xpath('//div[@class="bbs-post-wrapper-content"]/div/div[last()]/div/div[2]/div[1]/div/div[1]/a/text()')[0]
endreplytime=result.xpath('//div[@class="bbs-post-wrapper-content"]/div/div[last()]/div/div[2]/div[1]/div/div[1]/span/text()')[0]
#将爬取的数据存入字典中
info={
'name':name,
'url':url, #帖子链接
'author':author,
'author_href':author_href,
'forum_time':forum_time,
'reply_counts':reply_counts,
'browse_counts':browse_counts,
'endreplyname':endreplyname,
'endreplytime':endreplytime
}
print('恭喜:该条帖子爬取成功!')
#存入mongodb
dataline.insert_one(info)
#存入mysql
cursor.execute("insert into hupu(name,url,author,author_href,forum_time,reply_counts,browse_counts,endreplyname,endreplytime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
(name,url,author,author_href,forum_time,reply_counts,browse_counts,endreplyname,endreplytime))
except:
pass
if __name__ == '__main__':
urls = ['https://bbs.hupu.com/bxj-{}'.format(str(i)) for i in range(1,11)]
for url in urls:
get_href(url)
time.sleep(1)
#统一提交
conn.commit()
#关闭链接
conn.close()