Notes Review

# -*- coding:utf-8 -*-

import requests
from lxml import etree
import json

class QiushiSpider:

    def __init__(self):
        self.url_start = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    def get_url_list(self):  # build the list of page URLs
        url_list = [self.url_start.format(i) for i in range(1, 14)]
        return url_list

    def parse_url(self, url):
        print("Crawling:", url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):  # extract data from one page
        html = etree.HTML(html_str)
        # 1. group: each li element under #content is one post
        li_list = html.xpath("//*[@id='content']//li")
        print(li_list)  # debug output: inspect the grouped elements
        content_list = []

        for li in li_list:
            item = {}
            # Query relative to the current li (".//"), then take the first
            # match if one exists; otherwise store None.
            name = li.xpath(".//*[@class='recmd-name']/text()")
            item["name"] = name[0] if name else None
            title = li.xpath(".//*[@class='recmd-content']/text()")
            item["title"] = title[0] if title else None
            content_url = li.xpath(".//*[@class='recmd-content']/@href")
            item["content_url"] = content_url[0] if content_url else None
            haoxiao = li.xpath(".//*[@class='recmd-num']/span[1]/text()")  # "funny" vote count
            item["haoxiao"] = haoxiao[0] if haoxiao else None
            pinglun = li.xpath(".//*[@class='recmd-num']/span[4]/text()")  # comment count
            item["pinglun"] = pinglun[0] if pinglun else None
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):  # save extracted data as JSON lines
        with open("qiushibaike.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                # json.dumps returns a string; json.dump writes to a file
                # object and returns None, so dumps is needed here.
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("Saved successfully")


    def run(self):  # main control flow
        # 1. build the url list from the page-number pattern
        url_list = self.get_url_list()
        # 2. send a request for each page
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract data (must run inside the loop, otherwise only
            # the last page would be parsed and saved)
            content_list = self.get_content_list(html_str)
            # 4. save data
            self.save_content_list(content_list)

if __name__ == '__main__':
    qiushibaike = QiushiSpider()
    qiushibaike.run()
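
To sanity-check the output, here is a minimal sketch (assuming the qiushibaike.txt file written by save_content_list above, one JSON object per line) that reads the saved records back and prints a couple of fields:

import json

with open("qiushibaike.txt", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)  # parse one record per line
        print(item["name"], item["title"])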
