from selenium import webdriver
import requests
import os
import time
from lxml import etree
# Output directory for files saved by this scraper; created up front at
# import time so later writes cannot fail on a missing folder.
root_dir = 'douban/image'
# makedirs (unlike mkdir) also creates the missing parent "douban", which
# would otherwise raise FileNotFoundError on a fresh checkout. exist_ok=True
# replaces the racy exists()-then-mkdir() check.
os.makedirs(root_dir, exist_ok=True)
# Browser: a single PhantomJS session created at import time and shared by
# every spider() call.
# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and removed
# in Selenium 4 — confirm the pinned selenium version, or plan a move to a
# headless Chrome/Firefox driver.
driver = webdriver.PhantomJS()
def spider(page):
    """Load one page of the Douban "python" book search and parse it.

    ``page`` is the page index; it is multiplied by 15 to produce the
    ``start`` offset placed in the query string. The rendered page source is
    handed to ``content_parser``; nothing is returned.
    """
    offset = page * 15
    search_url = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start=%s' % offset

    # Navigate the shared browser session to the results page.
    driver.get(search_url)
    # Fixed pause before reading the page source — presumably to give the
    # page's scripts time to render the result list.
    time.sleep(3)
    # Debugging aid: driver.save_screenshot(root_dir + '/%s.png' % page)
    # captures what the browser rendered for this page.

    rendered_html = driver.page_source
    content_parser(rendered_html)
# Parsing
def _first_or_empty(matches, as_text=False):
    """Return the first XPath match, or the (empty) match list itself.

    With ``as_text=True`` the first match is treated as an element and its
    ``.text`` is returned (which may be None). When nothing matched, the
    empty list is returned unchanged so a missing field prints as ``[]``.
    """
    if not matches:
        return matches
    head = matches[0]
    return head.text if as_text else head


def content_parser(content):
    """Extract every book entry from a search-result page and print it.

    ``content`` is the HTML source of a rendered results page. For each
    ``div.item-root`` one line is printed with, in order: title, the
    author/publisher/price line, cover image URL, rating, detail-page link
    and the number-of-ratings text. Nothing is returned.
    """
    document = etree.HTML(content)

    # One "item-root" div per book in the result list.
    for item in document.xpath('//div[@class="item-root"]'):
        # Cover image URL.
        cover = _first_or_empty(item.xpath('./a/img/@src'))
        # Book title (text of the first link inside the detail box).
        title = _first_or_empty(
            item.xpath('.//div[@class="detail"]//a'), as_text=True)
        # Link to the book's page (first href found in the item).
        link = _first_or_empty(item.xpath('.//a/@href'))
        # Rating score.
        # NOTE(review): the "sc-bwzfXH hxNRHc" class names look generated and
        # may change on the site — confirm these selectors still match.
        score = _first_or_empty(
            item.xpath('.//div[@class="rating sc-bwzfXH hxNRHc"]/span[@class="rating_nums"]'),
            as_text=True)
        # Number of people who rated the book.
        votes = _first_or_empty(
            item.xpath('.//div[@class="rating sc-bwzfXH hxNRHc"]/span[@class="pl"]'),
            as_text=True)
        # Author + publisher + price, as a single line of text.
        meta = _first_or_empty(
            item.xpath('.//div[@class="meta abstract"]'), as_text=True)

        print(title, meta, cover, score, link, votes)
if __name__ == '__main__':
    # Crawl result pages 0 through 10 (start offsets 0, 15, ..., 150).
    try:
        for i in range(11):
            spider(i)
    finally:
        # Always shut the browser down — even when a page fails — so the
        # webdriver process is not left running after the script exits.
        driver.quit()