Straight to the code; the comments are fairly detailed.
from bs4 import BeautifulSoup
import requests
import re
#import threading
#import want2url
import pandas as pd
from tqdm import tqdm
url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T"\
class douban_crawler():
send_headers = {
"Host": "book.douban.com",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
"Connection": "close"
}
def __init__(self, url, pages):
"""
:param url: 爬虫的最初界面,决定了要爬的书籍的类别信息
:param pages: 要爬取的页数,豆瓣每页20本书的信息,决定了要爬取的数据量
"""
self.url = url
self.pages = [20*i for i in range(pages)]
self.book_class = ""
self.book_names = []
self.book_nations = []
self.book_writers = []
self.book_scores = []
self.book_comments = []
self.book_sites = []
self.book_pages = []
def generate_urls(self):
idx_urls = []
        # Regular expression that captures everything up to and including the "?" in the url
        page_key = re.compile(r"(\S*\?)")
        # Use the regex to pull out the fixed part of the url, which is later concatenated with
        # the paging variable to build the full list of urls to fetch.
        # Note that findall() returns a list, so the value usually has to be taken out of the
        # list before it can be used.
        page_main = page_key.findall(self.url)[0]
        # "Assemble" all the urls: Douban puts 20 books on each page and uses the "start"
        # query parameter in the url to control which page is shown.
for i in self.pages:
g_url = page_main+"start="+str(i)+"&type=T"
idx_urls.append(g_url)
return idx_urls
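    # For example, with the url defined above and pages=3, generate_urls() returns:
    #   https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T
    #   https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T
    #   https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T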
def open_url(self, url=None):
        # If no url is given, open the initial page (the one passed in when the object was created)
        if url is None:
            # Send a GET request to the site
            resp = requests.get(self.url, headers=self.send_headers)
            # Take the text part of the response
            resp_text = resp.text
            # Parse the HTML text with BeautifulSoup and return the parsed page
            soup = BeautifulSoup(resp_text, "html.parser")
return soup
else:
resp = requests.get(url, headers=self.send_headers)
resp_text = resp.text
soup = BeautifulSoup(resp_text, "html.parser")
return soup
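    # Note: open_url is a small convenience helper; the crawl methods below issue their own
    # requests.get calls rather than going through it.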
def get_main_info(self, url):
"""
获取url列表页面能获取主要信息,不打开各个书的独立页面,
主要信息包括:书的所属类别,作者国家,书名,每本书的索引url,书的作者,书的评分,书的简介,书的页数
:return: 各个主要信息的存储列表
"""
#分别为,书类别,国家,作者和简介的正则表示式
book_class_key = re.compile(": (\D*)")
book_nation_key = re.compile("\[(\D*?)\]")
book_writer_key1 = re.compile("^(\D*?)/")
book_writer_key2 = re.compile("](\D*)$")
book_comment_key = re.compile(r"(\S*)
")
#创建存储主要信息的列表:因为书名是固定的,一个大页面是一个类别,所以只需要返回一次,不需要列表存储
book_names = []
book_pages = []
book_nations = []
book_writers = []
book_comments = []
book_scores = []
        # To keep things decoupled, this function only handles a single listing page; the
        # caller iterates over all the urls and invokes it once per page.
        #urls = self.generate_urls()
        resp = requests.get(url, headers=self.send_headers)  # Same as before: send a GET request to the url
        resp_text = resp.text  # Grab the returned text
        soup = BeautifulSoup(resp_text, "html.parser")  # Parse the HTML text with BeautifulSoup
        # Get the book category
book_class = soup.find("h1").get_text(strip=True)
book_class = book_class_key.findall(book_class)
        # Get the book titles
        for a in soup.find_all("a"):
            try:
                # The "title" attribute holds the book title
                res = a.get("title")
                # The "href" attribute holds the corresponding inner page
                res_url = a.get("href")
                # Only <a> tags with a title attribute correspond to a book's own page
                if res is not None:
book_names.append(res)
book_pages.append(res_url)
except:
pass
"""
获取书的作者和作者国籍,因为非中国籍的形式为[国家]作者,而中国籍作者在作者名前没有[]
所以我们用两个正则表达式分别检索,但是少数作者即使不为中国籍,也没有加[],此类我把这类数据当作脏数据
为了尽可能的修正这种数据带来的影响,设置判定条件为,没有[]且作者名小于五个字,为中国作者
"""
for nation in soup.find_all("div", attrs={"class": "pub"}):
nn = nation.get_text().strip()
# print(nn)
book_writer = book_writer_key1.findall(nn)[0]
if ']' in book_writer:
book_writers.append(book_writer_key2.findall(book_writer)[0].strip())
else:
book_writers.append(book_writer)
try:
bn = book_nation_key.findall(nn)
                if bn == [] and len(book_writer) < 5:  # heuristic for identifying Chinese authors
book_nations.append("中")
elif bn != []:
# print(bn)
book_nations.append(bn[0])
else:
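                    # No [] and a name of five or more characters: fall back to "日"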
book_nations.append("日")
except:
book_nations.append("中")
        # Get the book descriptions
for comment in soup.find_all("div", attrs={"class": "info"}):
if comment.find_all("p") == []:
book_comments.append("无简介")
else:
book_comments.append(comment.find_all("p")[0].get_text())
        # Get the book ratings
for score in soup.find_all("span", attrs={"class": "rating_nums"}):
book_scores.append(score.get_text())
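        # book_class comes from findall() and holds a single category string; repeating it 20
        # times lines it up with the 20 books on a listing page.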
return book_names, book_pages, book_class*20, book_writers, book_nations, book_comments, book_scores
def get_page_numbers(self, urls):
"""
从每个图书的独立页面中获取数据,目前只获取了页数数据
:param urls: 从get_main_info中生成的图书独立页面url列表
:return: 对应图书的页数数据
"""
book_pagesnumber = []
print("****开始获取页数信息****")
for url in tqdm(urls):
rrr = requests.get(url, headers=self.send_headers)
rtext = rrr.text
in_soup = BeautifulSoup(rtext, 'html.parser')
# print(in_soup.text)
page_num = re.compile(r"页数: (\d*)").findall(in_soup.text)
            # Some books are missing the page-count field; in that case record the count as 0
if page_num == []:
book_pagesnumber.append(0)
else:
book_pagesnumber.extend(page_num)
return book_pagesnumber
def begin_crawl(self):
"""
类的“主函数”只需要执行这个函数就可以完成爬虫功能
:return: 所有的信息列表
"""
sum_book_names = []
sum_book_urls = []
sum_book_class = []
sum_book_writers = []
sum_book_nations = []
sum_book_comments = []
sum_book_scores = []
sum_book_pages = []
        urls = self.generate_urls()  # Build the urls of all the listing pages to crawl
        print("**** Start crawling ****")
for url in tqdm(urls):
book_names, book_urls, book_class, book_writers, book_nations, book_comments, book_scores = self.get_main_info(url)
book_pages = self.get_page_numbers(book_urls)
sum_book_names.extend(book_names)
sum_book_urls.extend(book_urls)
sum_book_class.extend(book_class)
sum_book_writers.extend(book_writers)
sum_book_nations.extend(book_nations)
sum_book_comments.extend(book_comments)
sum_book_scores.extend(book_scores)
sum_book_pages.extend(book_pages)
return sum_book_names, sum_book_urls, sum_book_class, sum_book_writers, sum_book_nations, sum_book_comments, sum_book_scores, sum_book_pages
def write2csv(self):
"""
将爬取结果写入csv文件中
:return: 无返回值
"""
name, url, book_class, writer, nation, comment, score, pages = self.begin_crawl()
info_df = pd.DataFrame(columns=["name", "url", "class", "writer", "nation", "comment", "score", "pages"])
info_df["name"] = name
info_df["url"] = url
info_df["class"] = book_class
info_df["writer"] = writer
info_df["nation"] = nation
info_df["comment"] = comment
info_df["score"] = score
info_df["pages"] = pages
        info_df.to_csv(f"{book_class[0]}.csv", header=False, encoding="utf_8_sig")
if __name__ == '__main__':
db_crawler = douban_crawler(url, 5)
db_crawler.write2csv()
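Since write2csv stores the rows without a header line, here is a minimal sketch of reading the result back with pandas; the file name assumes the default 小说 tag used above, and the column names mirror the order in which write2csv saves them (the first, unnamed column is the DataFrame index that to_csv also writes out).
import pandas as pd
# Column names follow the order used in write2csv; "idx" covers the written index column
cols = ["idx", "name", "url", "class", "writer", "nation", "comment", "score", "pages"]
df = pd.read_csv("小说.csv", header=None, names=cols, encoding="utf_8_sig")
print(df.head())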
Recommended reading:
Usage of the requests library
Usage of the BeautifulSoup library
About the tqdm library
Getting started with pandas