# 用selenium爬取豆瓣关于Python前10页的书籍

from selenium import webdriver
import requests
import os
import time
from lxml import etree

# Output directory referenced by the optional screenshot code in spider().
root_dir = 'douban/image'
# makedirs (unlike mkdir) also creates the missing parent 'douban', and
# exist_ok=True makes repeated runs a no-op instead of an error.
os.makedirs(root_dir, exist_ok=True)

# Browser: one module-level PhantomJS WebDriver session, reused by spider()
# for every page it loads.
# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and removed
# in Selenium 4 — confirm the installed selenium version still provides it.
driver = webdriver.PhantomJS()

def spider(page):
    """Load one Douban book-search result page for "python" and parse it.

    Args:
        page: Page index; it is multiplied by 15 to build the ``start``
            query parameter of the search URL.

    Returns:
        None. The rendered page source is handed to ``content_parser``.
    """
    offset = page * 15
    url = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start=%s' % (offset)

    # Navigate the shared browser session to the result page.
    driver.get(url)

    # Fixed 3-second pause before reading the DOM — presumably to let the
    # page finish rendering; verify whether an explicit wait is preferable.
    time.sleep(3)

    # Debug aid: uncomment to save a screenshot of the rendered page.
    # file_name = root_dir + '/%s.png'%(page)
    # driver.save_screenshot(file_name)

    # Debug aid: uncomment to dump the raw page source.
    # print(driver.page_source)

    html = driver.page_source
    content_parser(html)

# 解析
def content_parser(content):
    """Extract the book entries from a search-result page and print them.

    Each ``<div class="item-root">`` element is treated as one book. For every
    book, one line is printed with, in order: title, the "meta abstract" text
    (author + publisher + price, per the original comments), cover image URL,
    rating score, detail-page link, and the rating-count text.

    Args:
        content: HTML source of the page, as accepted by ``etree.HTML``.

    Returns:
        None. Results are written to stdout only.
    """

    def pick_first(hits, as_text=False):
        # An empty XPath result is passed through unchanged, so a missing
        # field is printed as [] exactly as before.
        if not hits:
            return hits
        head = hits[0]
        return head.text if as_text else head

    document = etree.HTML(content)

    # Walk every book entry on the page.
    for item in document.xpath('//div[@class="item-root"]'):
        # Cover image URL
        cover = pick_first(item.xpath('./a/img/@src'))

        # Title text
        title = pick_first(item.xpath('.//div[@class="detail"]//a'), as_text=True)

        # Link: first href found anywhere inside the entry
        link = pick_first(item.xpath('.//a/@href'))

        # Rating score
        score = pick_first(
            item.xpath('.//div[@class="rating sc-bwzfXH hxNRHc"]/span[@class="rating_nums"]'),
            as_text=True,
        )

        # Number of ratings
        votes = pick_first(
            item.xpath('.//div[@class="rating sc-bwzfXH hxNRHc"]/span[@class="pl"]'),
            as_text=True,
        )

        # Author + publisher + price
        meta = pick_first(item.xpath('.//div[@class="meta abstract"]'), as_text=True)

        print(title, meta, cover, score, link, votes)

if __name__ == '__main__':
    # Crawl the first 10 result pages (indices 0-9). The previous range(11)
    # fetched an extra, eleventh page.
    try:
        for i in range(10):
            spider(i)
    finally:
        # Always shut the browser process down, even if a page fails,
        # so no orphaned WebDriver/browser instance is left running.
        driver.quit()

 

# 你可能感兴趣的:(python)