#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML
import os
from urllib import parse
class p_web(object):
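    '''
    Simple recursive site mirror: starting from web_url, follow src/href links
    (optionally filtered by include_link keywords) and save every fetched resource
    under save_path, preserving the host/path directory layout.
    '''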
    def __init__(self, web_url, save_path, headers, is_cover=True, include_link=None):
self.img = ['jpg', 'png', 'gif']
self.video = ['mp4']
        self.audio = ['mp3']  # audio extensions
        self.ignore_link = ['javascript', '#']  # link patterns to ignore
        self.link = set()  # all discovered links
        self.visit_link = set()  # links already visited
        self.download_link = set()  # links already downloaded
        self.web_url = web_url  # start URL
        self.save_path = save_path  # local save directory
        self.headers = headers
        self.is_cover = is_cover  # overwrite existing files: True = overwrite, False = keep
        self.include_link = include_link if include_link is not None else []  # keyword whitelist, e.g. ['4767'] for mm131
    '''
    Parse a URL into its components
    @param web_url string  site URL
    @return urllib.parse.ParseResult
    '''
def parse_url(self, web_url):
return parse.urlparse(web_url)
    '''
    Create a directory (including parents) if it does not exist
    @param path string  directory path
    @return none
    '''
def create_folder(self, path):
if not os.path.isdir(path):
os.makedirs(path)
    '''
    Find the position of the last occurrence of a substring
    @param m_str string  main string
    @param c_str string  substring
    @return int  index of the last occurrence, or -1 if not found
    '''
def find_end(self, m_str, c_str):
return m_str.rfind(c_str)
    '''
    Collect all links on a page and download them
    @param web_url string  page URL
    @param headers dict  request headers
    @return none
    '''
def get_link_down(self, web_url, headers):
if web_url in self.visit_link:
return
        print('Visiting link:', web_url)
self.visit_link.add(web_url)
self.link.add(web_url)
self.download_file(web_url, headers)
        parse_url = self.parse_url(web_url)  # parse the URL
scheme = parse_url.scheme
netloc = parse_url.netloc
path = parse_url.path
        response = requests.get(url=web_url, headers=headers)  # fetch the HTML
        soup = BeautifulSoup(response.content, "html.parser", from_encoding="iso-8859-1")  # parse the HTML
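        # Note: from_encoding="iso-8859-1" forces Latin-1 decoding, which is safe for
        # extracting ASCII URLs but may garble non-ASCII text in the page itself.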
        src = soup.find_all(lambda tag: tag.has_attr('src'))  # all tags with a src attribute
        href = soup.find_all(lambda tag: tag.has_attr('href'))  # all tags with an href attribute
if src or href:
            root_dir = scheme + '://' + netloc  # site root directory
            now_dir = scheme + '://' + netloc + path[0:self.find_end(path, '/')] + '/'  # directory of the current page
            # filter and normalize links
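            # The inner for/else below relies on Python's for-else: the else branch runs only
            # when the loop over ignore_link finished without a break, i.e. the link was not ignored.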
for i in src:
                for j in self.ignore_link:  # skip ignored link patterns
if i['src'] == '' or i['src'] == '/' or i['src'].find(j) > -1:
break
else:
include_flag = True
if self.include_link:
                        for k in self.include_link:  # keyword whitelist
if i['src'].find(k) > -1:
include_flag = False
break
else:
include_flag = False
if include_flag:
                        print('src link %s does not contain any of %s, skipping' % (i['src'], self.include_link))
continue
if i['src'].find('//') > -1 and i['src'].find('http') == -1:
i['src'] = 'http:' + i['src']
if i['src'].find(netloc) == -1:
if i['src'].find('//') == -1:
if i['src'][0] == '/':
i['src'] = root_dir + i['src']
else:
i['src'] = now_dir + i['src']
if i['src'].find(r'\"') > -1:
i['src'] = i['src'].replace(r'\"', '')
                    elif i['src'].find(r"\'") > -1:
                        i['src'] = i['src'].replace(r"\'", '')
self.link.add(i['src'])
headers['Referer'] = web_url
self.download_file(i['src'], headers)
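            # The href loop below mirrors the src loop above. Every normalized link is added
            # to self.link, so main() will also visit it (and parse it for further links)
            # on a later pass.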
for i in href:
                for j in self.ignore_link:  # skip ignored link patterns
if i['href'] == '' or i['href'] == '/' or i['href'].find(j) > -1:
break
else:
include_flag = True
if self.include_link:
                        for k in self.include_link:  # keyword whitelist
if i['href'].find(k) > -1:
include_flag = False
break
else:
include_flag = False
if include_flag:
                        print('href link %s does not contain any of %s, skipping' % (i['href'], self.include_link))
continue
if i['href'].find('//') > -1 and i['href'].find('http') == -1:
i['href'] = 'http:' + i['href']
if i['href'].find(netloc) == -1:
if i['href'].find('//') == -1:
if i['href'][0] == '/':
i['href'] = root_dir + i['href']
else:
i['href'] = now_dir + i['href']
if i['href'].find(r'\"') > -1:
i['href'] = i['href'].replace(r'\"', '')
                    elif i['href'].find(r"\'") > -1:
                        i['href'] = i['href'].replace(r"\'", '')
self.link.add(i['href'])
headers['Referer'] = web_url
self.download_file(i['href'], headers)
    '''
    Download a single resource (html, css, js, img, video, ...) to save_path
    @param web_url string  resource URL
    @param headers dict  request headers
    @return none
    '''
def download_file(self, web_url, headers):
if web_url in self.download_link:
return
self.download_link.add(web_url)
        # create the target directory
parse_url = self.parse_url(web_url)
        print(web_url)  # debug: requested URL
        print(parse_url)  # debug: parsed components
netloc = parse_url.netloc
path = parse_url.path
if path.find('.') == -1:
if len(path):
if path[-1] == '/':
path += 'index.html'
else:
path += '/index.html'
else:
path = '/index.html'
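        # e.g. http://example.com/a/ maps to <save_path>/example.com/a/index.html;
        # a host with a port (e.g. 127.0.0.1:8000) becomes the folder 127.0.0.1_8000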
if netloc.find(':') > -1:
netloc = netloc.replace(':', '_')
self.create_folder(self.save_path + netloc + path[0:self.find_end(path, '/')])
        file_path = self.save_path + netloc + path  # local file path
        # if the file already exists, overwrite only when is_cover is True
if not self.is_cover and os.path.isfile(file_path):
return
        etx = parse_url.path.split('.')[-1]  # file extension (currently unused)
        response = requests.get(url=web_url, headers=headers)  # fetch the resource
        # write the file to disk
with open(file_path, "wb") as f:
f.write(response.content)
        print('Downloaded file: %s' % web_url)
    '''Main entry point: crawl from web_url until no new links are found'''
def main(self):
# try:
web_url = self.web_url
headers = self.headers
        # collect and download the links on the start page
self.get_link_down(web_url, headers)
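        # Keep crawling until every discovered link has been visited. Since visit_link is
        # always a subset of link, the symmetric difference below is simply the set of
        # links that have been discovered but not yet visited.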
while True:
for i in self.link.symmetric_difference(self.visit_link):
headers['Referer'] = i
self.get_link_down(i, headers)
if len(self.link) == len(self.visit_link):
break
print(self.link)
# except Exception as e:
# print(e)
if __name__ == '__main__':
    '''
    When changing the target URL, remember to update Referer and include_link as well
    '''
# web_url = 'http://127.0.0.1/test/index.html'
web_url = 'http://www.mm131.com/xinggan/4767.html' # ['4767']
# web_url = 'http://www.mm131.com/xiaohua/901.html' # ['901']
# web_url = 'http://www.mm131.com/chemo/1502.html' # ['1502']
# web_url = 'https://www.mzitu.com/173845' # ['173845', '2019']
# web_url = 'https://www.mzitu.com/170468' # ['170468', '2019']
# web_url = 'http://www.fuqianla.net/'
    # web_url = 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1550545933794_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=哈哈'
save_path = 'C:/Users/Administrator/Desktop/tmp/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400',
'Referer': 'https://www.mzitu.com/'
}
    is_cover = True  # overwrite existing files: True = overwrite, False = keep
    include_link = ['4767']  # keyword whitelist for links
p_web(web_url, save_path, headers, is_cover, include_link).main()
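    # A minimal local smoke test (a sketch, assuming a page is served at http://127.0.0.1/test/index.html):
    # p_web('http://127.0.0.1/test/index.html', save_path, headers, is_cover, []).main()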