Python: crawling a website and downloading its resources

#!/usr/bin/python3
# -*- coding: utf-8 -*-


import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML
import os
from urllib import parse


class p_web(object):
    def __init__(self, web_url, save_path, headers, is_cover=True, include_link=None):
        self.img = ['jpg', 'png', 'gif']  # image extensions
        self.video = ['mp4']  # video extensions
        self.audio = ['mp3']  # audio extensions
        self.ignore_link = ['javascript', '#']  # substrings that mark a link to ignore
        self.link = set()  # all discovered links
        self.visit_link = set()  # links already visited
        self.download_link = set()  # links already downloaded

        self.web_url = web_url  # start URL
        self.save_path = save_path  # local save directory
        self.headers = headers
        self.is_cover = is_cover  # True: overwrite existing files; False: keep them
        self.include_link = include_link or []  # substrings a link must contain, e.g. '4767', 'mm131'


    def parse_url(self, web_url):
        '''
        Parse a URL into its components.
        @param web_url string  site URL
        @return urllib.parse.ParseResult
        '''
        return parse.urlparse(web_url)

    def create_folder(self, path):
        '''
        Create a directory (and any missing parents) if it does not exist.
        @param path string  directory path
        @return none
        '''
        os.makedirs(path, exist_ok=True)

    def find_end(self, m_str, c_str):
        '''
        Return the index of the last occurrence of a substring.
        @param m_str string  main string
        @param c_str string  substring to find
        @return int  index of the last occurrence, or -1 if absent
        '''
        return m_str.rfind(c_str)
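    # e.g. find_end('/xinggan/4767.html', '/') -> 8, so path[0:8] == '/xinggan'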

    def get_link_down(self, web_url, headers):
        '''
        Collect every link on a page and download the page's resources.
        @param web_url string  site URL
        @param headers dict  request headers
        @return none
        '''
        if web_url in self.visit_link:
            return
        print('Visiting:', web_url)
        self.visit_link.add(web_url)
        self.link.add(web_url)
        self.download_file(web_url, headers)

        parse_url = self.parse_url(web_url)  # parse the URL
        scheme = parse_url.scheme
        netloc = parse_url.netloc
        path = parse_url.path

        response = requests.get(url=web_url, headers=headers)  # fetch the HTML
        soup = BeautifulSoup(response.content, "html.parser", from_encoding="iso-8859-1")  # parse the HTML (from_encoding forces iso-8859-1 decoding)

        src = soup.find_all(lambda tag: tag.has_attr('src'))  # every tag with a src attribute
        href = soup.find_all(lambda tag: tag.has_attr('href'))  # every tag with an href attribute

        if src or href:
            root_dir = scheme + '://' + netloc  # site root
            now_dir = scheme + '://' + netloc + path[0:self.find_end(path, '/')] + '/'  # directory of the current page
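            # e.g. for http://www.mm131.com/xinggan/4767.html:
            #   root_dir = 'http://www.mm131.com', now_dir = 'http://www.mm131.com/xinggan/'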

        # Filter the links and join relative ones into absolute URLs
        for i in src:
            for j in self.ignore_link:  # drop ignored links
                if i['src'] == '' or i['src'] == '/' or i['src'].find(j) > -1:
                    break
            else:  # for-else: runs only when the loop above did not break (link not ignored)
                include_flag = True
                if self.include_link:
                    for k in self.include_link:  # keep only links containing one of these substrings
                        if i['src'].find(k) > -1:
                            include_flag = False
                            break
                else:
                    include_flag = False

                if include_flag:
                    print('src link %s does not contain any of %s' % (i['src'], self.include_link))
                    continue

                if i['src'].find('//') > -1 and i['src'].find('http') == -1:
                    i['src'] = 'http:' + i['src']  # protocol-relative URL
                if i['src'].find(netloc) == -1:
                    if i['src'].find('//') == -1:
                        if i['src'][0] == '/':
                            i['src'] = root_dir + i['src']  # site-root-relative path
                        else:
                            i['src'] = now_dir + i['src']  # page-relative path
                if i['src'].find(r'\"') > -1:
                    i['src'] = i['src'].replace(r'\"', '')
                elif i['src'].find(r"\'") > -1:
                    i['src'] = i['src'].replace(r"\'", '')

                self.link.add(i['src'])
                headers['Referer'] = web_url
                self.download_file(i['src'], headers)

        for i in href:  # same filtering and joining rules as the src loop above
            for j in self.ignore_link:  # drop ignored links
                if i['href'] == '' or i['href'] == '/' or i['href'].find(j) > -1:
                    break
            else:
                include_flag = True
                if self.include_link:
                    for k in self.include_link:  # keep only links containing one of these substrings
                        if i['href'].find(k) > -1:
                            include_flag = False
                            break
                else:
                    include_flag = False

                if include_flag:
                    print('href link %s does not contain any of %s' % (i['href'], self.include_link))
                    continue

                if i['href'].find('//') > -1 and i['href'].find('http') == -1:
                    i['href'] = 'http:' + i['href']
                if i['href'].find(netloc) == -1:
                    if i['href'].find('//') == -1:
                        if i['href'][0] == '/':
                            i['href'] = root_dir + i['href']
                        else:
                            i['href'] = now_dir + i['href']
                if i['href'].find(r'\"') > -1:
                    i['href'] = i['href'].replace(r'\"', '')
                elif i['href'].find(r"\'") > -1:
                    i['href'] = i['href'].replace(r"\'", '')

                self.link.add(i['href'])
                headers['Referer'] = web_url
                self.download_file(i['href'], headers)

    def download_file(self, web_url, headers):
        '''
        Download a single resource (html, css, js, img, video) to disk.
        @param web_url string  resource URL
        @param headers dict  request headers
        @return none
        '''
        if web_url in self.download_link:
            return
        self.download_link.add(web_url)

        # Build the local directory for the file from the URL
        parse_url = self.parse_url(web_url)
        print(web_url)
        print(parse_url)

        netloc = parse_url.netloc
        path = parse_url.path

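        # A path without '.' has no file extension: treat it as a page URL and
        # normalize it to .../index.html so it can be saved as a file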
        if path.find('.') == -1:
            if len(path):
                if path[-1] == '/':
                    path += 'index.html'
                else:
                    path += '/index.html'
            else:
                path = '/index.html'

        if netloc.find(':') > -1:
            netloc = netloc.replace(':', '_')  # ':' from host:port is not valid in a Windows folder name
        self.create_folder(self.save_path + netloc + path[0:self.find_end(path, '/')])

        file_path = self.save_path + netloc + path  # local file path
        # If the file already exists, only overwrite it when is_cover is True
        if not self.is_cover and os.path.isfile(file_path):
            return

        response = requests.get(url=web_url, headers=headers)  # fetch the resource

        # Write the file to disk
        with open(file_path, "wb") as f:
            f.write(response.content)
            print('Downloaded: %s' % web_url)
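        # NOTE: response.content buffers the whole resource in memory; for large
        # media files, requests.get(url, stream=True) with iter_content is safer.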

    def main(self):
        '''Entry point: crawl from the start URL until no new links appear.'''
        web_url = self.web_url
        headers = self.headers

        # Seed the crawl with the start page
        self.get_link_down(web_url, headers)

        # Visit every discovered-but-unvisited link until none remain
        while True:
            for i in self.link.symmetric_difference(self.visit_link):
                headers['Referer'] = i
                self.get_link_down(i, headers)

            if len(self.link) == len(self.visit_link):
                break

        print(self.link)


if __name__ == '__main__':
    '''
    When changing the target URL, remember to update Referer and include_link to match.
    '''
    # web_url = 'http://127.0.0.1/test/index.html'
    web_url = 'http://www.mm131.com/xinggan/4767.html' # ['4767']
    # web_url = 'http://www.mm131.com/xiaohua/901.html' # ['901']
    # web_url = 'http://www.mm131.com/chemo/1502.html' # ['1502']
    # web_url = 'https://www.mzitu.com/173845' # ['173845', '2019']
    # web_url = 'https://www.mzitu.com/170468' # ['170468', '2019']
    # web_url = 'http://www.fuqianla.net/'
    # web_url = 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1550545933794_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=哈哈'

    save_path = 'C:/Users/Administrator/Desktop/tmp/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400',
        'Referer': 'https://www.mzitu.com/'
    }

    is_cover = True  # True: overwrite existing files; False: keep them
    include_link = ['4767']  # substrings a link must contain

    p_web(web_url, save_path, headers, is_cover, include_link).main()
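
For reference, here is a minimal sketch of how urllib.parse.urlparse splits a URL into the pieces download_file uses to build the local save path (the port in this sample URL is made up, just to show the ':' replacement):

from urllib import parse

u = parse.urlparse('http://www.mm131.com:8080/xinggan/4767.html')
print(u.scheme)   # 'http'
print(u.netloc)   # 'www.mm131.com:8080' -> saved under folder 'www.mm131.com_8080'
print(u.path)     # '/xinggan/4767.html'
# Final local file: save_path + 'www.mm131.com_8080' + '/xinggan/4767.html'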
