爬取盗墓笔记

前面做过的一个小项目，适合练手。

import requests
import json
from bs4 import BeautifulSoup


def get_html():
    """Download the seputu.com index page and return its HTML text.

    Returns:
        str: the decoded response body.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the request takes longer than 10 seconds.
    """
    url = 'http://seputu.com/'
    # A browser-like User-Agent avoids trivial bot blocking.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0"}
    # timeout= prevents the script from hanging forever on a dead host;
    # raise_for_status() surfaces HTTP errors instead of silently
    # returning an error page for parsing.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text


def parse_html(html_str):
    """Parse the index page and dump volume/chapter links to a JSON file.

    Each ``.mulu`` section on the page holds one volume: an ``<h2>``
    title plus ``.box`` divs containing the chapter ``<a>`` links.
    The result is written to ``18-盗墓笔记.json`` as a list of
    ``{'title': ..., 'content': [{'href', 'box_title'}, ...]}`` records.

    Args:
        html_str (str): raw HTML of the index page.
    """
    soup = BeautifulSoup(html_str, features='lxml')
    content = []
    for mulu in soup.find_all(class_="mulu"):
        h2 = mulu.find('h2')
        if not h2:
            continue
        # Volume title for this section.
        h2_title = h2.string

        chapters = []
        # BUG FIX: search boxes inside THIS .mulu section only.
        # The original iterated soup.find_all(class_='box'), so every
        # volume received every chapter link on the whole page.
        for box in mulu.find_all(class_='box'):
            for a in box.find_all('a'):
                chapters.append({'href': a.get('href'),
                                 'box_title': a.string})

        content.append({'title': h2_title, 'content': chapters})

    # ensure_ascii=False keeps the Chinese titles human-readable
    # in the output file instead of \uXXXX escapes.
    with open('18-盗墓笔记.json', 'w', encoding='utf8') as fp:
        json.dump(content, fp=fp, indent=4, ensure_ascii=False)



def main():
    """Entry point: fetch the index page and write the chapter JSON."""
    html_str = get_html()
    parse_html(html_str)


# Backward-compatible alias: the original entry point was misspelled "mian".
mian = main


if __name__ == '__main__':
    main()

你可能感兴趣的:(Python爬虫)