This post again uses Hupu (虎扑) as the example, this time saving the scraped data into a MySQL database.
First, import the required libraries (the MySQLdb module is provided by the mysqlclient package):
import requests
from bs4 import BeautifulSoup
import time
import random
import MySQLdb
Next, define a function that scrapes one page of post listings:
def get_information(page=0):
    # Page URLs are 1-based: bxj-postdate-1, bxj-postdate-2, ...
    url = 'https://bbs.hupu.com/bxj-postdate-' + str(page + 1)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Referer": "https://bbs.hupu.com/bxj"
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    # The post listing is a <ul class="for-list"> containing one <li> per post
    out = soup.find("ul", attrs={"class": "for-list"})
    datas = out.find_all('li')
    datas_list = []
    for data in datas:
        # Skip any <li> missing an expected element (ads, pinned rows, etc.)
        # instead of aborting the whole page on the first bad row
        try:
            title = data.find('a', attrs={"class": "truetit"}).text.split()[0]
            artical_link = "https://bbs.hupu.com" + data.find('a', attrs={"class": "truetit"}).attrs['href']
            author = data.find('a', class_="aulink").text
            author_link = data.find('a', class_="aulink").attrs['href']
            create_time = data.find('a', style="color:#808080;cursor: initial; ").text
            reply_num = data.find('span', class_='ansour box').text.split("/")[0]
            lastest_reply = data.find('span', class_='endauthor').text
            lastest_reply_time = data.find('div', class_='endreply box').a.text
        except (AttributeError, IndexError, KeyError):
            continue
        datas_list.append({"title": title, "artical_link": artical_link,
                           "author": author, "author_link": author_link,
                           "create_time": create_time, "reply_num": reply_num,
                           "lastest_reply": lastest_reply,
                           "lastest_reply_time": lastest_reply_time})
    return datas_list
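The INSERT statements in the next step assume a table named hupu_datas already exists in the scraping database. The original post does not show the schema, so the column types below are assumptions; a minimal one-off setup sketch:

# One-off setup: create the target table (column sizes are assumptions)
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', password='12345678',
                       db='scraping', charset="utf8")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS hupu_datas (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        artical_link VARCHAR(255),
        author VARCHAR(100),
        author_link VARCHAR(255),
        create_time VARCHAR(50),
        reply_num VARCHAR(20),
        lastest_reply VARCHAR(100),
        lastest_reply_time VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()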
Finally, run the scraper and save the results to MySQL:
if __name__ == "__main__":
    # Open the connection (adjust host/user/password/db to your setup)
    conn = MySQLdb.connect(host='localhost', user='root', password='12345678',
                           db='scraping', charset="utf8")
    cur = conn.cursor()  # create a cursor
    for page in range(10):
        datas = get_information(page)
        for data in datas:
            cur.execute(
                "INSERT INTO hupu_datas (title, artical_link, author, author_link, "
                "create_time, reply_num, lastest_reply, lastest_reply_time) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
                (data['title'], data['artical_link'], data['author'],
                 data['author_link'], data['create_time'], data['reply_num'],
                 data['lastest_reply'], data['lastest_reply_time']))
        print("Scraping page %s" % (page + 1))
        # Pause 1-3 seconds between pages to avoid hammering the server
        time.sleep(1 + random.random() * 2)
    conn.commit()  # commit the inserts before closing
    cur.close()    # close the cursor
    conn.close()   # close the connection
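Once the run finishes, a quick sanity check is to read a few rows back. A minimal sketch, reusing the same connection settings as above:

# Verify the inserts by reading back a few rows
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', password='12345678',
                       db='scraping', charset="utf8")
cur = conn.cursor()
cur.execute("SELECT title, author, reply_num FROM hupu_datas LIMIT 5")
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()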