(Flowchart missing)
import requests, random, time
from lxml import etree
from urllib import request

first = time.time()


def parse_html(text):
    html = etree.HTML(text)
    # select every non-gif image inside the list container
    pics = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
    for img in pics:
        # the real image URL sits in the lazy-loading attribute, not in src
        img_url = img.get('data-original')
        # use the alt text as the file name, dropping characters that break paths
        name = [i for i in img.get('alt') if i not in '/|']
        new_name = ''.join(name)
        try:
            request.urlretrieve(img_url, r'D:\pycharm\projects\test\pic/{}.jpg'.format(new_name))
        except:
            raise
        else:
            print(f'Downloading {new_name}')
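# --- Minimal demonstration of the XPath above (illustration only, never called) ---
# demo_xpath() is not part of the original script; the snippet it parses is invented,
# but it shows why the code reads data-original/alt and why class='gif' images are skipped.
def demo_xpath():
    snippet = '''
    <div class="page-content text-center">
        <img class="gif" src="placeholder.gif" alt="skipped"/>
        <img class="lazy" data-original="http://example.com/a.jpg" alt="funny cat"/>
    </div>'''
    demo_html = etree.HTML(snippet)
    for img in demo_html.xpath("//div[@class='page-content text-center']//img[@class!='gif']"):
        print(img.get('data-original'), img.get('alt'))   # -> http://example.com/a.jpg funny cat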
def parse_page(url):
    # rotate the User-Agent so requests do not all look like the same client
    headers_choice = [
        {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50safari 5.1 – Windows'},
        {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)TT'},
        {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'},
        {'User-Agent': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'},
        {'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'},
        {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)'},
        {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
        {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)Firefox 4.0.1 – MAC'},
    ]
    # rotate plain HTTP/HTTPS proxies as well
    proxy_choice = [{'http': '54.214.52.181:80'}, {'http': '34.93.243.67:80'},
                    {'https': '119.27.170.46:8888'}, {'https': '58.220.95.80:9401'}]
    headers = random.choice(headers_choice)
    proxy = random.choice(proxy_choice)
    try:
        text = requests.get(url, headers=headers, proxies=proxy).content.decode('utf-8')
    except requests.exceptions.ProxyError:
        # the chosen proxy is unreachable; report it and skip this page
        print('Connection error')
    except UnicodeEncodeError:
        # some User-Agent strings above contain a non-latin-1 dash that cannot be
        # encoded into an HTTP header; retry with a fixed, clean header set
        print('Encoding changed')
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Cookie': '_agep=1601992588; _agfp=4b5864586b360b9b3511ad1fed0a73c1; _agtk=ad0fc9700101cd027f014ae151fe390a; Hm_lvt_2fc12699c699441729d4b335ce117f40=1603343105,1603766984,1603775871,1603790208; Hm_lpvt_2fc12699c699441729d4b335ce117f40=1603795061',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
        response = requests.get(url, headers=headers)
        # the correct encoding, as detected by chardet (apparent_encoding)
        response.encoding = response.apparent_encoding
        text = response.text
        parse_html(text)
    else:
        parse_html(text)
def main(num):
    for page in range(1, num + 1):
        url = f'https://www.doutula.com/photo/list/?page={page}'
        parse_page(url)


if __name__ == '__main__':
    main(10)
    end = time.time()
    print('end-first:', end - first)
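The single-threaded version above fetches and downloads everything sequentially, so most of its running time is spent waiting on the network. One other rough edge: the alt text is used directly as the file name, and only '/' and '|' are stripped, although Windows also rejects \ : * ? " < > in file names. A small helper along these lines would cover the full set (sanitize_filename is a hypothetical name, not part of the original script):

import re

def sanitize_filename(name):
    # drop every character Windows refuses in a file name, not just '/' and '|'
    return re.sub(r'[\\/:*?"<>|]', '', name)

# e.g. sanitize_filename('what?are/you:doing') -> 'whatareyoudoing'

The multithreaded rewrite below tackles the waiting instead: Producer threads pull list-page URLs from page_queue and push (img_url, name) pairs into img_queue, while Consumer threads pull from img_queue and download.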
import threading, requests, random, time
from queue import Queue
from lxml import etree
from urllib import request
class Producer(threading.Thread):
    """Pulls list-page URLs from page_queue, parses them, and feeds (img_url, name) pairs into img_queue."""

    def __init__(self, page_queue, img_queue):
        super().__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_html(self, text):
        html = etree.HTML(text)
        # select every non-gif image inside the list container
        pics = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in pics:
            # the real image URL sits in the lazy-loading attribute, not in src
            img_url = img.get('data-original')
            # use the alt text as the file name, dropping characters that break paths
            name = [i for i in img.get('alt') if i not in '/|']
            new_name = ''.join(name)
            # hand the download job over to the consumers
            self.img_queue.put((img_url, new_name))
    def parse_page(self, url):
        # rotate the User-Agent so requests do not all look like the same client
        headers_choice = [
            {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50safari 5.1 – Windows'},
            {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)TT'},
            {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'},
            {'User-Agent': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'},
            {'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'},
            {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)'},
            {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
            {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)Firefox 4.0.1 – MAC'},
        ]
        # rotate plain HTTP/HTTPS proxies as well
        proxy_choice = [{'http': '54.214.52.181:80'}, {'http': '34.93.243.67:80'},
                        {'https': '119.27.170.46:8888'}, {'https': '58.220.95.80:9401'}]
        headers = random.choice(headers_choice)
        proxy = random.choice(proxy_choice)
        try:
            text = requests.get(url, headers=headers, proxies=proxy).content.decode('utf-8')
        except requests.exceptions.ProxyError:
            # the chosen proxy is unreachable; report it and skip this page
            print('Connection error')
        except UnicodeEncodeError:
            # some User-Agent strings above contain a non-latin-1 dash that cannot be
            # encoded into an HTTP header; retry with a fixed, clean header set
            print('Encoding changed')
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Cookie': '_agep=1601992588; _agfp=4b5864586b360b9b3511ad1fed0a73c1; _agtk=ad0fc9700101cd027f014ae151fe390a; Hm_lvt_2fc12699c699441729d4b335ce117f40=1603343105,1603766984,1603775871,1603790208; Hm_lpvt_2fc12699c699441729d4b335ce117f40=1603795061',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
            response = requests.get(url, headers=headers)
            # the correct encoding, as detected by chardet (apparent_encoding)
            response.encoding = response.apparent_encoding
            text = response.text
            self.parse_html(text)
        else:
            self.parse_html(text)
class Consumer(threading.Thread):
    """Pulls (img_url, name) pairs from img_queue and downloads them to disk."""

    def __init__(self, page_queue, img_queue):
        super().__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            # note: this check and the get() below are not atomic, so at the very end
            # a consumer can still block on get() if another thread snatches the last item
            if self.page_queue.empty() and self.img_queue.empty():
                break
            img_url, new_name = self.img_queue.get()
            try:
                request.urlretrieve(img_url, r'D:\pycharm\projects\test\pic/{}.jpg'.format(new_name))
            except:
                raise
            else:
                print(f'Downloading {new_name}')
def main(num):
    page_queue = Queue(100)
    img_queue = Queue(500)
    for page in range(1, num + 1):
        url = f'https://www.doutula.com/photo/list/?page={page}'
        page_queue.put(url)
    threads = []
    for i in range(5):
        t = Producer(page_queue, img_queue)
        t.start()
        threads.append(t)
    for i in range(5):
        t = Consumer(page_queue, img_queue)
        t.start()
        threads.append(t)
    # join every worker, not just the last consumer, so the timing below is meaningful
    for t in threads:
        t.join()
if __name__ == '__main__':
    first = time.time()
    main(10)
    end = time.time()
    print('end-first:', end - first)
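In Consumer.run above, the emptiness check and the blocking img_queue.get() are two separate steps, so at the very end of a run one consumer can take the last item right after another consumer has passed the check, leaving the second one blocked on get() forever. A common alternative is a sentinel ("poison pill") shutdown: join the producers first, then push one sentinel per consumer. The sketch below assumes the Producer/Consumer classes defined above; SentinelConsumer and main_with_sentinels are hypothetical names introduced only for this illustration.

class SentinelConsumer(Consumer):
    def run(self):
        while True:
            img_url, new_name = self.img_queue.get()
            if img_url is None:            # sentinel: no more images will ever arrive
                break
            request.urlretrieve(img_url, r'D:\pycharm\projects\test\pic/{}.jpg'.format(new_name))
            print(f'Downloading {new_name}')


def main_with_sentinels(num, n_producers=5, n_consumers=5):
    page_queue = Queue(100)
    img_queue = Queue(500)
    for page in range(1, num + 1):
        page_queue.put(f'https://www.doutula.com/photo/list/?page={page}')
    producers = [Producer(page_queue, img_queue) for _ in range(n_producers)]
    consumers = [SentinelConsumer(page_queue, img_queue) for _ in range(n_consumers)]
    for t in producers + consumers:
        t.start()
    for t in producers:
        t.join()                           # every page has been fetched and parsed
    for _ in range(n_consumers):
        img_queue.put((None, None))        # one sentinel per consumer
    for t in consumers:
        t.join()

Because the sentinels are only enqueued after the producers have finished, FIFO ordering guarantees every real download is handed out before any consumer sees a sentinel.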