Python抓取网页内容并输出PDF文件

环境:Python 3.5.1

import requests
import os
import time
import random
import re
import pdfkit
from bs4 import BeautifulSoup
from lxml import html

def get_text(url, timeout=10):
    """Download ``url`` and return the response body as UTF-8 text.

    This helper is called very frequently by the rest of the script, so a
    single stalled request must not be allowed to block the whole run.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 10; pass ``None`` to wait indefinitely (the old behaviour).

    Returns:
        The response body as a string, decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the server does not answer within ``timeout`` seconds.
    """
    # Send a desktop-browser User-Agent instead of the library default.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
    # requests never times out unless told to; without this an unresponsive
    # server would hang the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding rather than relying on the charset requests
    # infers from the response headers.
    response.encoding = 'utf-8'
    return response.text

def clean_str(string):
    """Return ``string`` with every whitespace character removed.

    Intended for tidying text before it is used as a file name, so that
    file creation does not fail. Only whitespace (spaces, tabs, newlines
    and the like) is stripped; all other characters are left untouched.
    """
    whitespace = re.compile(r'\n|\s')
    cleaned = whitespace.sub('', string)
    return cleaned

def analysis_text(url):
    # 核心方法,获取需要的html信息,以方便重新组合形成新的html文件,注意是带上标签的
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
    text = get_text(url)
    soup = BeautifulSoup(text, 'html.parser')

    try:
        title = soup.find_all('h1')[0]
        main_text = soup.find_all(id="arc-body")[0]
        file_name = re.findall('

(.*?)

', text)[0] file_name = clean_str(file_name) imgs = re.findall('

LINK1
LINK2

你可能感兴趣的:(Python抓取网页内容并输出PDF文件)