Python: crawling the URLs and the CSS/JS file addresses loaded by a single web page

Straight to the code:

This is the Python scraping script I use myself, with sensitive details redacted:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:Andy
@file:xxx.py
@time:05:50 PM
@desc:scrape article data into the blog
"""
import os
import re
import time
import requests
from bs4 import BeautifulSoup, SoupStrainer
from requests.exceptions import RequestException
from hashlib import md5
from urllib.parse import urlparse
import urllib

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}


def get_content():
    url = 'http://ask.xxxx.com/question/xxxx'  # url
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response, 'lxml')
    # div = soup.select('#aw-mod-body ueditor-p-reset')
    # capture each link's href and its anchor text
    pattern = re.compile(r'<a.*?href="(.*?)".*?>(.*?)</a>', re.S)
    p = soup.find_all('a')
    for item in p:
        # print(str(item))
        result = re.findall(pattern, str(item))
        if result:
            # print(result)
            for i in result:
                url, name = i
                # print(i)
                yield {
                    'url': url,
                    'name': name
                }
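
# Each item yielded by get_content() is a dict of the form
# {'url': 'http://ask.xxxx.com/question/1234', 'name': 'link text'}
# (hypothetical values, shown only to illustrate the shape of the data).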


def mkdir(path):
    # strip leading and trailing whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    # check whether the path already exists
    isExists = os.path.exists(path)
    if not isExists:
        # create the directory (and any missing parents) if it does not exist
        os.makedirs(path)
        print(path + ' created successfully')
        return True
    else:
        # the directory already exists; nothing to create
        print(path + ' already exists')
        return False

def getUrl(html):
    # collect the CSS and JS file addresses referenced by the page;
    # the patterns assume ordinary <link href="..."> and <script src="..."> tags
    patterncss = re.compile(r'<link[^>]*?href="(.*?)"', re.S)
    patternjs = re.compile(r'<script[^>]*?src="(.*?)"', re.S)
    css_urls = patterncss.findall(html)
    js_urls = patternjs.findall(html)
    return css_urls, js_urls
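
A minimal example of wiring the pieces together might look like this (the save_asset helper, the assets folder name, and the placeholder page URL are only for illustration, not part of the original script):

def save_asset(file_url, save_dir):
    # download a single CSS/JS file and save it under save_dir,
    # naming it after the last part of its URL path
    file_name = os.path.basename(urlparse(file_url).path)
    if not file_name:
        file_name = md5(file_url.encode('utf-8')).hexdigest()
    resp = requests.get(file_url, headers=headers)
    with open(os.path.join(save_dir, file_name), 'wb') as f:
        f.write(resp.content)


if __name__ == '__main__':
    page_url = 'http://ask.xxxx.com/question/xxxx'  # placeholder, same as above
    html = requests.get(page_url, headers=headers).text
    css_urls, js_urls = getUrl(html)
    save_dir = 'assets'
    mkdir(save_dir)
    for file_url in css_urls + js_urls:
        # only absolute http(s) addresses; relative paths would need to be joined first
        if file_url.startswith('http'):
            save_asset(file_url, save_dir)
    # print the article links found on the page
    for item in get_content():
        print(item['name'], item['url'])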

Source: http://www.shanhubei.com/archives/2491.html
