题目:利用Selenium和lxml来获取到豆瓣阅读( 出版 - 豆瓣阅读)中的书籍信息,一共爬取20页即可,主要包括书名、作者、简介、书本信息、原价、折扣价等,并将其以csv格式进行保存文件。
思路:利用Selenium获取页面信息,观察每一页网址的变化进行链接修改以获取下一页内容,利用面向对象思维进行。
答案:
import csv
import re
import time

import lxml.html
from selenium import webdriver
class DouBanSpider(object):
    """Scrape book listings from Douban Read's "publishing" category.

    For each of the first ``PAGE_COUNT`` category pages, collects per book:
    title, author, introduction, extra info, original price and sale price,
    and appends the rows to a CSV file.
    """

    # Number of listing pages to crawl (the exercise asks for 20).
    PAGE_COUNT = 20
    # Output file path (name kept identical to the original script).
    CSV_PATH = "./豆瓣信息.csv"

    def __init__(self):
        # URL template; ``{}`` is replaced by the 1-based page number.
        self.base_url = "https://read.douban.com/category/1?page={}"

    def get_url_list(self):
        """Return the URLs of the first PAGE_COUNT category pages.

        BUG FIX: the original used ``range(1, 20)`` and therefore produced
        only 19 pages, although the task requires 20.
        """
        return [self.base_url.format(page)
                for page in range(1, self.PAGE_COUNT + 1)]

    def parse_html(self, url):
        """Load *url* in a Chrome WebDriver and return the page source.

        A fresh driver is started per page; ``try/finally`` guarantees it is
        quit even if the navigation raises (the original leaked the browser
        process on error).
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Give the JS-rendered listing time to appear.
            # NOTE(review): a fixed sleep is fragile; an explicit wait on the
            # works-list element would be more robust — confirm if needed.
            time.sleep(3)
            return driver.page_source
        finally:
            driver.quit()

    def catch_book_infos(self, temp_content):
        """Parse one page of HTML into a list of book-info rows.

        Each row is ``[title, author, intro, extra_info, original_price,
        sale_price]`` (fields that are missing in the page are skipped,
        matching the original behavior).
        """
        metree = lxml.html.etree
        book_parser = metree.HTML(temp_content, metree.HTMLParser())
        li_list = book_parser.xpath(
            "//div[@class='section-works']/ul[@class='works-list']/li")
        book_infos_list = []
        for li_element in li_list:
            book_infos = []
            # Title.
            book_name = li_element.xpath(".//span[@class='title-text']//text()")
            if book_name:
                book_infos.append(book_name[0])
            # Author.
            author = li_element.xpath(
                ".//div[@class='author']/a[1]/span/span/text()")
            if author:
                book_infos.append(author[0])
            # Introduction: strip whitespace and normalize full-width commas
            # so the text does not interfere with downstream handling.
            introduce = li_element.xpath(".//div[@class='intro']/span//text()")
            if introduce:
                intro_infos = re.sub(r"\s", "", introduce[0])
                book_infos.append(re.sub(",", ",", intro_infos))
            # Extra book info (publisher, word count, ...), joined into one field.
            book_sticky = li_element.xpath(".//div[@class='extra-info']//text()")
            if book_sticky:
                book_infos.append(",".join(book_sticky))
            # Prices. BUG FIX: the original indexed ``sale_list[1]``
            # unconditionally in the short branch, raising IndexError whenever
            # fewer than two price nodes were present.
            sale_list = li_element.xpath(".//span[@class='sale']/span//text()")
            if len(sale_list) > 2:
                before_sale, now_sale = sale_list[1], sale_list[2]
            elif len(sale_list) == 2:
                before_sale, now_sale = sale_list[1], 0
            else:
                before_sale, now_sale = 0, 0
            book_infos.append(str(before_sale))
            book_infos.append(str(now_sale))
            book_infos_list.append(book_infos)
        return book_infos_list

    def save_book_infos_by_csv(self, datas):
        """Append the given rows to the CSV file.

        BUG FIX: the original joined fields with a bare ``","`` while the
        extra-info field is itself comma-joined, which corrupted the columns.
        ``csv.writer`` quotes such fields correctly; ``newline=""`` prevents
        blank lines on Windows.
        """
        with open(self.CSV_PATH, "a", encoding="utf-8", newline="") as writer:
            csv.writer(writer).writerows(datas)

    def run(self):
        """Crawl all pages and save the results, printing progress."""
        # Write the header row, truncating any previous output file.
        with open(self.CSV_PATH, "w", encoding="utf-8", newline="") as writer:
            csv.writer(writer).writerow(
                ["书名", "作者", "简介", "书本信息", "原价", "折扣价"])
        for index, url in enumerate(self.get_url_list(), start=1):
            html_content = self.parse_html(url)
            book_data = self.catch_book_infos(html_content)
            self.save_book_infos_by_csv(book_data)
            print("正在爬取并保存第%d页豆瓣书籍信息......" % index)
        print("全部豆瓣书籍网页数据保存成功!!!!!")
def main():
    """Entry point: build the spider and start the crawl."""
    spider = DouBanSpider()
    spider.run()


if __name__ == '__main__':
    main()
运行结果: