爬取某付费网站文档保存为html文件

import requests
import re
import os
import parsel


# Page skeleton that each scraped article is wrapped in before it is saved.
# The charset declaration matches the UTF-8 encoding used when the file is
# written, so browsers render non-ASCII text correctly.
# The only replacement field is {article}; it receives the article's HTML.
html_str = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Document</title>
</head>
<body>
{article}
</body>
</html>
"""
# Output directory prefix. It keeps a trailing path separator because later
# code builds file paths by plain string concatenation (prefix + title).
# os.path.join('html', '') yields 'html\\' on Windows and 'html/' elsewhere,
# instead of hard-coding the Windows separator.
html_filename = os.path.join('html', '')
# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs(html_filename, exist_ok=True)
url = 'https://www.chinawenwang.com/zlist-66-1.html'  # URL of the article list page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# A timeout keeps a stalled connection from hanging the script indefinitely.
response = requests.get(url=url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of silently finding no links.
response.raise_for_status()

# Extract the article URLs from the list page.
# NOTE(review): the original regular expression was lost (its HTML-like
# content was stripped from this file). The pattern below is a best-guess
# stand-in that captures absolute links ending in ".html"; confirm it against
# the real list-page markup before relying on it.
href = re.findall(r'<a[^>]+href="(https?://[^"]+?\.html)"', response.text)

for link in href:
    response_1 = requests.get(url=link, headers=headers, timeout=30)
    selector = parsel.Selector(response_1.text)
    title = selector.css('.content-page-header-div h1::text').get()
    content = selector.css('.content-page-main-content-div').get()
    if title is None or content is None:
        # .get() returns None when the selector matches nothing, e.g. the
        # link is not an article page; there is nothing to save in that case.
        print('skipped:', link)
        continue
    # Replace characters that are not allowed in file names (notably on
    # Windows) so that open() does not fail on titles such as "a/b?".
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    article = html_str.format(article=content)
    with open(html_filename + safe_title + '.html', mode='w', encoding='utf-8') as f:
        f.write(article)
    print(title)

结果展示:

爬取某付费网站文档保存为html文件_第1张图片

你可能感兴趣的:(python,爬虫)