# Download every image from a paginated article: scrape the image URLs from each
# page, then fetch them with five worker threads (needs requests, lxml, cssselect).
import os
import shutil
import sys
import threading

import lxml.html
import requests

# Image URLs collected from every page; split between the download threads later.
list_href = []
class Download(object):
    # Worker that downloads its share of the image URLs into its output directory.
    current_num = 0

    def __init__(self, output, hf_list):
        self.output = output        # target directory for the images
        self.hf_list = hf_list      # image URLs assigned to this worker
        self._value_lock = threading.Lock()

    def downJpgList(self):
        for jpg_url in self.hf_list:
            print(jpg_url)
            res = requests.get(jpg_url)
            # Save under this worker's output directory, named after the URL's basename.
            with open(os.path.join(self.output, os.path.basename(jpg_url)), 'wb') as imageFile:
                for chunk in res.iter_content(100000):
                    imageFile.write(chunk)
def get_url_download(url_page, current_num, total_num):
    # Scrape the remaining pages; page n of the article lives at <base>_<n>.<ext>
    # (e.g. article.html -> article_2.html), so pages 2..total_num are fetched.
    global list_href
    url_page_arr = url_page.rsplit('.', maxsplit=1)
    url_page_new = url_page_arr[0] + '_%s.' + url_page_arr[1]
    while current_num < total_num:
        current_num += 1
        s_content = requests.get(url_page_new % current_num)
        tree_html = lxml.html.fromstring(s_content.text)
        href = [img.get('src') for img in tree_html.cssselect('.articleBody a img')]
        list_href.extend(href)
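# Scrape the article's first (unsuffixed) page and collect its image URLs under .articleBody.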
def get_url_first_download(url_page):
global list_href
s_content = requests.get(url_page)
tree_html = lxml.html.fromstring(s_content.text)
href = [img.get('src') for img in tree_html.cssselect('.articleBody a img')]
list_href.extend(href)
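# Build a random 12-character name from ASCII letters and digits, used as the default output folder.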
def rand_generate():
import random
lst = [chr(i) for i in range(97, 123)]
lst.extend(chr(i) for i in range(65, 91))
lst.extend(i for i in range(0, 10))
lst = list(map(str, lst))
return ''.join(random.sample(lst, 12))
def view_bar(num=1, total=100, bar_word=':'):
    # Minimal console progress bar: "\r<percent>%:" followed by one bar_word per completed item.
    rate = float(num) / float(total)
    rate_num = int(rate * 100)
    os.write(1, bytes('\r{}%:'.format(rate_num), 'gbk'))
    for i in range(0, num):
        os.write(1, bytes(bar_word, 'gbk'))
    sys.stdout.flush()
if __name__ == '__main__':
    import getopt

    # Output directory name: random by default, overridden by -o.
    outputfile = rand_generate()
    # Long option names are passed to getopt without the leading dashes.
    opts, args = getopt.getopt(sys.argv[1:], "hn:o:", ["name=", "output="])
    for opt, arg in opts:
        if opt == '-h':
            print('usage: test.py -n <name> -o <output_dir> url_page')
sys.exit()
elif opt in ("-n", "--name"):
model_name = arg
elif opt in ("-o", "--output"):
outputfile = arg
url_page = args[0]
print('outputfile ', outputfile, 'url_page ', url_page)
    # Raw string keeps the backslashes in the Windows base path literal.
    output = os.path.join(r'I:\chuan\down', outputfile)
    # Start from an empty output directory.
    if os.path.exists(output):
        shutil.rmtree(output)
    os.mkdir(output)
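    # Fetch the landing page; force UTF-8 so lxml parses the decoded text correctly.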
ss = requests.get(url_page)
ss.encoding = 'utf-8'
tree = lxml.html.fromstring(ss.text)
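    # The pager text is "current/total" (e.g. "1/5"); split it to get the page counts.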
num = tree.cssselect('.pages > span')[0].text_content()
nums = str(num).split("/")
current_num = int(nums[0])
total_num = int(nums[1])
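    # Collect image URLs from the first page, then from the remaining pages.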
get_url_first_download(url_page)
get_url_download(url_page, current_num, total_num)
    # Split the collected URLs round-robin across five worker threads.
    downloadThreads = []
    for i in range(5):
        download = Download(output, list_href[i::5])
        downloadThread = threading.Thread(target=download.downJpgList)
        downloadThreads.append(downloadThread)
        downloadThread.start()
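    # Wait for every download thread to finish before reporting completion.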
for downloadThread in downloadThreads:
downloadThread.join()
print('ok')