Python crawler: scraping book data from Douban Books search results

import os
import time

import requests                       # only used by the commented-out requests fallback in main()
import xlrd                           # xlrd / xlwt / xlutils are only needed by the
import xlwt                           # commented-out Excel helpers kept below for reference
from lxml import etree
from selenium import webdriver
from xlutils.copy import copy

from excel_wirte_utils.wirte_to_excel import write_to_excel, write_to_excel_append
def get_xpath_by_selenium(url):
    # 1. Create a driver (note: PhantomJS is unmaintained and its support was
    #    removed in Selenium 4; see the headless-Chrome sketch below)
    driver = webdriver.PhantomJS()
    # 2. Request the url
    driver.get(url)
    # 3. Wait briefly for the JavaScript-rendered page to finish loading
    time.sleep(1)
    # 4. Grab the rendered page source, then release the browser process
    html_str = driver.page_source
    driver.quit()
    return html_str
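
# A minimal sketch of the same helper on headless Chrome, for Selenium releases
# that no longer ship PhantomJS support. The function name and the assumption
# that chromedriver is installed and on PATH are mine, not part of the original.
def get_xpath_by_headless_chrome(url):
    from selenium.webdriver.chrome.options import Options
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(1)  # same crude render wait as above
        return driver.page_source
    finally:
        driver.quit()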
def get_text(text):
    if text:
        return text[0]
    return ''
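# For example: get_text(['Fluent Python']) returns 'Fluent Python',
# while get_text([]) returns '' instead of raising an IndexError.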
# Reference implementations of the two imported Excel helpers, kept for context:
# def write_to_excel_append(infos, filename):
#     # Open the existing workbook
#     work_book = xlrd.open_workbook(filename)
#     # Get the names of all sheets in the workbook
#     sheets = work_book.sheet_names()
#     # Get the first sheet
#     work_sheet = work_book.sheet_by_name(sheets[0])
#     # Number of rows already written
#     old_rows = work_sheet.nrows
#     # Convert the xlrd object into an xlwt object so it can be written to
#     new_work_book = copy(work_book)
#     # Get the first sheet for writing
#     new_sheet = new_work_book.get_sheet(0)
#     keys = [key for key in infos[0].keys()]
#     i = old_rows
#     for item in infos:
#         for j in range(len(keys)):
#             new_sheet.write(i, j, item[keys[j]])
#         i += 1
#     new_work_book.save(filename)
#     print('Append succeeded!')
#
#
# def write_to_excel(infos, filename):
#     work_book = xlwt.Workbook(encoding='utf-8')
#     sheet = work_book.add_sheet('python_book')
#     # Build the header row from the first item's keys
#     head = []
#     for i in infos[0].keys():
#         head.append(i)
#     # Write the header
#     for i in range(len(head)):
#         sheet.write(0, i, head[i])
#     # Write the data rows
#     i = 1
#     for item in infos:
#         for j in range(len(head)):
#             sheet.write(i, j, item[head[j]])
#         i += 1
#     # Save
#     work_book.save(filename)
#     print('Write succeeded!')
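
# A minimal openpyxl sketch that covers both helpers in one function. This is
# an alternative of my own, not the imported excel_wirte_utils code: it assumes
# an .xlsx target (openpyxl cannot write the legacy .xls format) and appends to
# the existing workbook directly, with no xlrd/xlutils copy step.
def write_rows_xlsx(infos, filename, sheet_name='python_book'):
    import openpyxl  # lazy import; only this sketch needs it
    keys = list(infos[0].keys())
    if os.path.exists(filename):
        work_book = openpyxl.load_workbook(filename)
        sheet = work_book.active
    else:
        work_book = openpyxl.Workbook()
        sheet = work_book.active
        sheet.title = sheet_name
        sheet.append(keys)  # header row only on the first write
    for item in infos:
        sheet.append([item[k] for k in keys])
    work_book.save(filename)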
def parse_page(div_list):
    books = []
    for div in div_list:
        # Title
        name = get_text(div.xpath('.//div[@class="title"]/a/text()'))
        # Rating
        scores = get_text(div.xpath('.//span[@class="rating_nums"]/text()'))
        # Author, publisher, date and price share one slash-separated meta string
        info = get_text(div.xpath('.//div[@class="meta abstract"]/text()'))
        infos = info.split('/')
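        # For example, a meta string such as
        #   '[美] Wes McKinney / 机械工业出版社 / 2018-8 / 119.00元'
        # splits into ['[美] Wes McKinney ', ' 机械工业出版社 ', ' 2018-8 ', ' 119.00元'];
        # the negative indices below tolerate extra middle fields such as translators.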

        # Skip entries whose meta string is empty or starts with a slash
        if infos[0] != '':
            # Author
            author = infos[0]
            # Publisher
            publish = infos[-3]
            # Publication date
            date = infos[-2]
            # Price
            price = infos[-1]
            # Detail-page link
            detail_url = get_text(div.xpath('.//div[@class="title"]/a/@href'))
            item = {}
            item['name'] = name
            item['scores'] = scores
            item['author'] = author
            item['publish'] = publish
            item['price'] = price
            item['date'] = date
            item['detail_url'] = detail_url
            print(item)
            books.append(item)
    # Nothing parsed on this page, so nothing to write
    if not books:
        return
    if not os.path.exists('./python图书.xls'):
        write_to_excel(books, 'python图书.xls', 'python_book')
    else:
        write_to_excel_append(books, 'python图书.xls')
def main():
    base_url = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start=%s'
    # The headers (with the Cookie) are only needed by the commented-out
    # requests fallback below; the selenium path does not use them
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Cookie': 'bid="z2fBE2XC4o8"; ap_v=0,6.0; _pk_ses.100001.3ac3=*; __yadk_uid=khcI2bHdOlrTGo58twthlqvscik5uesV; __utma=30149280.671637084.1576913620.1576913620.1576913620.1; __utmc=30149280; __utmz=30149280.1576913620.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; _pk_id.100001.3ac3=e81d126bd8f23962.1576913619.1.1576913760.1576913619.; __utmb=30149280.5.10.1576913620',
    }
    i = 0
    while True:
        # requests alternative, kept for reference (presumably abandoned because
        # these search results are rendered client-side by JavaScript):
        # response = requests.get(base_url % (i * 15), headers=headers)
        html_str = get_xpath_by_selenium(base_url % (i * 15))  # 15 results per page
        tree = etree.HTML(html_str)
        div_list = tree.xpath('//div[@id="root"]/div/div[2]/div/div/div')
        # An empty div list means we have paged past the last result
        if not div_list:
            break
        parse_page(div_list)
        i += 1
if __name__ == '__main__':
    main()
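Run the script directly: it pages through the search results 15 at a time until a page returns no result divs, printing each parsed book and appending every batch to python图书.xls.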
