十二. 爬虫实战(Xpath)- 起点中文网的作品信息

爬取网址:https://www.qidian.com/all?page=1 (选取前100页)
爬取信息:小说名,作者ID,小说类型,完成情况,摘要,字数

import requests
from lxml import etree
import time
import xlwt

def _first_text(node, path, default=''):
    """Return the first XPath result for ``path`` under ``node``, or ``default``."""
    # xpath() returns a list; it is empty when the page lacks the element.
    matches = node.xpath(path)
    return matches[0] if matches else default


def get_info(url):
    """Fetch one listing page and append one row per book to ``info_lists``.

    Each ``<li>`` carrying a ``data-rid`` attribute is treated as one book.
    For every book a row ``[name, author, category, status, summary, words]``
    is appended to the module-level ``info_lists`` list; nothing is returned.
    A field whose element is absent is stored as an empty string rather than
    aborting the crawl with ``IndexError``.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.RequestException: If the request fails, exceeds the timeout,
            or the server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the crawl forever;
    # 10 seconds is an arbitrary but generous bound for a single page.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero rows.
    response.raise_for_status()

    html = etree.HTML(response.text)
    for book in html.xpath('//li[@data-rid]'):
        name = _first_text(book, 'div/h4/a/text()')
        author = _first_text(book, 'div/p/a[1]/text()')
        # The second and third links are joined with a dot to form the category.
        category = (_first_text(book, 'div/p/a[2]/text()')
                    + '.'
                    + _first_text(book, 'div/p/a[3]/text()'))
        complete = _first_text(book, 'div/p/span/text()')
        # The original author notes that 'div/p[2]/text()' is an equivalent
        # selector for the summary paragraph.
        summary = _first_text(book, 'div/p[@class="intro"]/text()').strip()
        # str.strip with an argument removes any of the given characters from
        # both ends, leaving only the numeric part of the word count.
        word = _first_text(book, 'div[2]/p[3]/span/text()').strip('万字')
        info_lists.append([name, author, category, complete, summary, word])

if __name__ == '__main__':
    # Shared accumulator: get_info() appends one row per scraped book here.
    info_lists = []

    # Listing pages 1 through 100.
    url_template = 'https://www.qidian.com/all?page={}'
    urls = [url_template.format(page) for page in range(1, 101)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # pause one second between page requests

    head = ['小说名','作者ID','小说类型','完成情况','摘要','字数(万字)']
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('qidian')

    # Row 0 holds the column headers.
    for col, title in enumerate(head):
        sheet.write(0, col, title)

    # Scraped rows follow, starting at row 1.
    for row, record in enumerate(info_lists, start=1):
        for col, value in enumerate(record):
            sheet.write(row, col, value)

    workbook.save('F://qidian.xls')

你可能感兴趣的:(十二. 爬虫实战(Xpath)- 起点中文网的作品信息)