# 第二课作业 (Lesson 2 homework) — blog-export title, kept as a comment so the file parses

import requests

import time

from lxml import etree

import xlwt

# HTTP request headers for the scraper.
# Bug fix: the key was misspelled 'User-Aegnt', so the server never saw a
# User-Agent header at all.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

# Accumulates one [id, content, laugh, comment] row per scraped post,
# filled by get_info() and written to Excel in the __main__ block.
all_info_lists = []

def get_info(url, headers=headers):
    """Scrape one qiushibaike text-listing page into ``all_info_lists``.

    Appends one ``[id, content, laugh, comment]`` row per post to the
    module-level ``all_info_lists`` (side effect; returns nothing).

    Args:
        url: Listing-page URL to fetch.
        headers: HTTP headers sent with the request. Defaults to the
            module-level ``headers`` dict.
    """
    # Bug fix: the headers parameter was accepted but never forwarded,
    # so every request went out without a User-Agent.
    res = requests.get(url, headers=headers)
    html = etree.HTML(res.text)
    infos = html.xpath('//div[@class="col1"]/div')
    for info in infos:
        # The author heading appears under a[2] for regular users and
        # under span[2] for anonymous posts — fall back accordingly.
        # (Renamed from `id`, which shadowed the builtin; the dead `pass`
        # before the fallback lookup is removed.)
        try:
            post_id = info.xpath('div[1]/a[2]/h2/text()')[0]
        except IndexError:
            post_id = info.xpath('div[1]/span[2]/h2/text()')[0]
        # string(.) flattens nested markup inside the content span.
        content_node = info.xpath('a[1]/div/span[1]')[0]
        content = content_node.xpath('string(.)').strip()
        laugh = info.xpath('div[2]/span[1]/i/text()')[0]
        comment = info.xpath('div[2]/span[2]/a/i/text()')[0]
        all_info_lists.append([post_id, content, laugh, comment])

if __name__ == '__main__':
    # Build the workbook and write the header row (row 0).
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('sheet1')
    header = ['id', 'content', 'laugh', 'comment']
    for col, title in enumerate(header):
        sheet.write(0, col, title)

    # Scrape listing pages 1-13, pausing between requests to be polite
    # to the server.
    urls = ['https://www.qiushibaike.com/text/page/{}/'.format(i) for i in range(1, 14)]
    for url in urls:
        get_info(url)
        time.sleep(2)

    # Write the collected rows below the header (data starts at row 1).
    # `row_list` replaces the original name `list`, which shadowed the
    # builtin; enumerate() replaces the manual i/j counters.
    for row, row_list in enumerate(all_info_lists, start=1):
        for col, value in enumerate(row_list):
            sheet.write(row, col, value)

    # NOTE(review): hard-coded to one user's desktop — consider making
    # the output path configurable.
    book.save('C:/Users/madin/Desktop/test.xls')

# 你可能感兴趣的:(第二课作业) — blog-export footer ("you may also like"), kept as a comment so the file parses