爬贴吧——(1)先爬一下贴吧的帖子列表

import json

import requests
from lxml import etree

class TiebaSpider(object):
    """Fetch the first thread-list page of one Baidu Tieba forum and save it.

    The spider requests the mobile listing page for ``tieba_name``, extracts
    the link text and ``href`` of the anchors found in the matching ``div``
    elements, and appends the result to ``<tieba_name>.txt``.
    """

    def __init__(self, tieba_name, timeout=10):
        """Set up the request parameters.

        Args:
            tieba_name: Name of the forum; used both as the ``kw`` query
                value and as the stem of the output file name.
            timeout: Seconds passed to ``requests.get`` so a stalled
                connection cannot block the spider indefinitely.
        """
        self.tieba_name = tieba_name
        self.timeout = timeout
        self.url = (
            "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---/m?kw="
            + tieba_name
            + "&lp=7202"
        )
        # A mobile browser User-Agent is sent with every request.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/63.0.3239.132 Mobile Safari/537.36"
            )
        }

    def parse_url(self, url):
        """Send a GET request to ``url`` and return the body decoded as UTF-8.

        Raises:
            requests.exceptions.RequestException: On connection problems or
                when the timeout expires.
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract title/href data from the listing page markup.

        Args:
            html_str: HTML text of the listing page.

        Returns:
            A list with one dict per matched ``div``. ``"title"`` holds the
            list of text nodes of the div's direct ``<a>`` children and
            ``"href"`` the list of their ``href`` values; either is ``None``
            when the div has no such nodes.
        """
        html = etree.HTML(html_str)
        if html is None:
            # The recovering HTML parser may produce no tree at all for
            # input without parseable markup; treat that as "no threads".
            return []

        href_list = []
        # NOTE: contains() is a substring test, so this selects every div
        # whose class attribute contains the character "i".
        for div in html.xpath("//div[contains(@class,'i')]"):
            # Evaluate each XPath once and map an empty result to None.
            titles = div.xpath("./a/text()")
            hrefs = div.xpath("./a/@href")
            href_list.append({"title": titles or None, "href": hrefs or None})
        return href_list

    def save_content(self, href_list):
        """Append ``href_list`` as one JSON line to ``<tieba_name>.txt``.

        The list is serialized with ``json.dumps`` because ``file.write``
        only accepts ``str``; non-ASCII characters are written verbatim and
        the file is always opened as UTF-8.

        Args:
            href_list: The list of dicts produced by ``get_content_list``.
        """
        file_name = self.tieba_name + ".txt"
        with open(file_name, "a", encoding="utf-8") as f:
            f.write(json.dumps(href_list, ensure_ascii=False))
            f.write("\n")

    def run(self):
        """Run the spider once: fetch the page, parse it, save the result."""
        # 1. The start URL was built in __init__.
        # 2. Send the request and get the page content.
        html_str = self.parse_url(self.url)
        # 3. Parse the content.
        href_list = self.get_content_list(html_str)
        # 4. Save the content.
        self.save_content(href_list)

if __name__ == "__main__":
    # Script entry point: crawl the thread list of one forum and save it.
    spider = TiebaSpider("做头发")
    spider.run()

更新中…

你可能感兴趣的:(学习)