import requests
import lxml.html
import re
import time
import os
import random
# Pool of User-Agent strings; each request picks one at random to make simple anti-scraping checks harder
user_agent_list = [
# Windows / Firefox 58
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0",
# Linux / Firefox 58
"Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0",
# Mac OS X / Safari 11.0.2
"Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_2) AppleWebKit/603.1.13 (KHTML, like Gecko) Version/11.0.2 Safari/603.1.13",
# Windows / IE 11
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
# Windows / Edge 16
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/16.16299.15.0",
# Windows / Chrome 63
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
# Android Phone / Chrome 63
"Mozilla/5.0 (Linux; Android 7.0; SM-G935P Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36",
# Android Tablet / Chrome 63
"Mozilla/5.0 (Linux; Android 4.4.4; Lenovo TAB 2 A10-70L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Safari/537.36",
# iPhone / Safari 11.1.1
# "Mozilla/5.0 (iPhone; CPU iPhone OS 11_1_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/11.1.1 Mobile/14E304 Safari/602.1",
# iPad / Safari 11.1.1
"Mozilla/5.0 (iPad; CPU OS 11_1_1 like Mac OS X) AppleWebKit/603.3.3 (KHTML, like Gecko) Version/11.1.1 Mobile/14G5037b Safari/602.1"]
requests_header = {
"Host": "",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Accept": "",
"Accept-Language": "zh-CN,en-US;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Referer": "",
"Connectionv": "keep-alive",
"Pragma": "no-cache",
"Cache-Control": "no-cache"
}
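# A minimal alternative sketch (not used by the code below): instead of mutating
# the module-level requests_header dict from several functions, the shared
# defaults could live on a requests.Session. `make_session` is a hypothetical
# helper, shown only for illustration.
def make_session():
    session = requests.Session()  # a Session also reuses the TCP connection across requests
    session.headers.update(requests_header)  # install the shared default headers
    return session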
PICTURE_PATH = "f:/meitulu"
def download_page_html(url):
phtml = None
page = None
try:
requests_header["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
requests_header["Host"] = "www.meitulu.com"
requests_header["Referer"] = url
        # Pick a random User-Agent for this request
requests_header["User-Agent"] = random.choice(user_agent_list)
        # Fetch the requested page with a 15-second timeout
        page = requests.get(url=url, headers=requests_header, timeout=15)
if page.encoding == "ISO-8859-1":
            page.encoding = "utf-8"  # requests guessed wrong, so force UTF-8 to keep Chinese text from being garbled
        phtml = page.text  # extract the HTML text from the response
except requests.exceptions.RequestException as e:
print("requests error:", e)
phtml = None
finally:
        if page is not None:
page.close()
return phtml
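# Example usage (the tag URL is the one defined as REQUEST_URL0 below); returns
# the page's HTML text, or None if the request failed:
#   html = download_page_html("https://www.meitulu.com/t/siwayouhuo/")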
def download_picture(url, page, subdir):  # `page` is the album page URL, sent as the Referer
    try:
        picdir = "{0}/{1}".format(PICTURE_PATH, subdir)  # build the directory the image is saved to
        print(picdir)
        if not os.path.exists(picdir):
            os.makedirs(picdir)  # create the directory tree if it does not exist yet
        pic_name = url.split("/")[-1]  # use the part after the last "/" in the image URL as the file name
        pic_full_name = "{0}/{1}".format(picdir, pic_name)
requests_header["Accept"] = "image/webp,*/*"
requests_header["Host"] = "mtl.ttsqgs.com"
requests_header["Referer"] = page
        response = requests.get(url, headers=requests_header, timeout=15)  # the response body is the raw image bytes
        imgdata = response.content  # binary image data, written below with "wb" (write binary)
        if len(imgdata) > (5 * 1024):  # only save images larger than 5 KB
            with open(pic_full_name, 'wb') as f:
                f.write(imgdata)  # write the image data; the with statement closes f automatically
            print("save picture to :", pic_full_name)
        else:
            print("picture size too small")
        response.close()
    except Exception as e:
        print("download picture {0} error: {1}".format(url, e))
# Work out how many list pages need to be crawled
def get_page_list_num(tree):
page_all_num = 0
page_list_num = 0
try:
        # Use an XPath selector to pick the element holding the album total out of the page
        page_all_num = tree.xpath('//div[@id="pages"]/a/text()')[0]
        print(page_all_num)
        page_all_num = str(page_all_num)
        page_all_num = re.sub(r"\D", "", page_all_num)  # strip every non-digit character
        page_all_num = int(page_all_num)  # convert to an integer
        print("max_page_number:", page_all_num)
    except Exception:
        print("get page number error")
        page_all_num = 0
finally:
        # Round up: each list page holds 60 albums (15*4), so divide the album total by 60
        page_list_num = page_all_num // (15 * 4)
        if page_all_num % (15 * 4) != 0:
            page_list_num += 1
return page_list_num, page_all_num
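# The manual round-up above is equivalent to math.ceil; a sketch (math would
# need to be imported at the top of the file):
#   page_list_num = math.ceil(page_all_num / 60)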
def get_page_album_list(tree):  # get the list of album ids on the page
    page_album_list = tree.xpath('//ul[@class="img"]/li/a/@href')
    for i in range(len(page_album_list)):
        page_album_list[i] = page_album_list[i].split("/")[-1]  # keep the part after the last "/", e.g. "17748.html"
        page_album_list[i] = re.sub(r"\D", "", page_album_list[i])  # strip non-digits, leaving the numeric id
    return page_album_list
def get_page_title_list(tree):  # get the album titles on the page
    page_title_list = tree.xpath('//ul[@class="img"]/li/a/img/@alt')
    return page_title_list
def get_page_jpgnum_list(tree):  # get the per-album picture counts on the page
    page_jpgnum_list = tree.xpath('//ul[@class="img"]/li/p[1]/text()')
    for i in range(len(page_jpgnum_list)):
        page_jpgnum_list[i] = re.sub(r"\D", "", page_jpgnum_list[i])  # strip non-digits, leaving the count
        page_jpgnum_list[i] = int(page_jpgnum_list[i])
    return page_jpgnum_list
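# For instance, if the <p> element's text were "图片: 40 张" (exact wording
# assumed, not confirmed by this script), stripping non-digits yields 40.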
REQUEST_URL0 = "https://www.meitulu.com/t/siwayouhuo/"
REQUEST_URL1 = "https://www.meitulu.com/t/siwayouhuo/{0}.html"
REQUEST_ALBUM_URL = "https://www.meitulu.com/item/{0}.html"
REQUEST_JPEG_URL = "https://mtl.ttsqgs.com/images/img/{0}/{1}.jpg"
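# For example, with the hypothetical album id 17748 these templates expand to:
#   REQUEST_ALBUM_URL.format(17748)   -> "https://www.meitulu.com/item/17748.html"
#   REQUEST_JPEG_URL.format(17748, 3) -> "https://mtl.ttsqgs.com/images/img/17748/3.jpg"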
if __name__ == "__main__":
requests_url = REQUEST_URL0
index = 0
page_list_num = 0
page_all_num = 0
print("requests_url :", requests_url)
    page_html_list = download_page_html(requests_url)  # download the first list page
    if page_html_list is None:
        exit()
    tree = lxml.html.fromstring(page_html_list)
    page_list_num, page_all_num = get_page_list_num(tree)  # get the page count
print(page_list_num, page_all_num)
for idx in range(page_list_num):
        if idx == 0:
requests_url = REQUEST_URL0
else:
requests_url = REQUEST_URL1.format(idx+1)
print(requests_url)
        page_html_list = download_page_html(requests_url)  # download the current list page
        if page_html_list is None:
            continue
tree = lxml.html.fromstring(page_html_list)
page_album_list = get_page_album_list(tree)
print(idx, len(page_album_list))
page_title_list = get_page_title_list(tree)
print(idx, len(page_title_list))
page_jpgnum_list = get_page_jpgnum_list(tree)
print(idx, len(page_jpgnum_list))
        if (len(page_album_list) != len(page_title_list)) or \
(len(page_album_list) == 0) or (len(page_title_list) == 0) or \
(len(page_jpgnum_list) == 0):
continue
for lst in range(len(page_album_list)):
for img in range(page_jpgnum_list[lst]):
jpeg_url = REQUEST_JPEG_URL.format(page_album_list[lst], img+1)
page_url = REQUEST_ALBUM_URL.format(page_album_list[lst])
jpg_title = page_title_list[lst]
print("Download [{0}] on [{1}], title[{2}]".format(jpeg_url, page_album_list[lst], jpg_title))
download_picture(jpeg_url, page_url, jpg_title)
                web_sleep = random.randint(1, 5)  # sleep a random 1-5 seconds so the request rate looks less bot-like
                time.sleep(web_sleep)