import requests
from bs4 import BeautifulSoup
import os
import time
from tkinter import *
import threading
import _thread
import urllib
def downloadFile(name, urla_):
    """Download *urla_* to the local path *name*, printing progress every ~2s.

    :param name: destination file path; if it already exists the download
                 is skipped entirely
    :param urla_: download URL (surrounding whitespace is stripped)
    :return: None
    """
    url = urla_.strip()
    print(url)
    headers = {'Proxy-Connection': 'keep-alive'}
    if os.path.exists(name):
        print('file exists')
        return
    # Bug fix: the headers dict was built but never passed to requests.get.
    # stream=True keeps large files out of memory.
    r = requests.get(url, stream=True, headers=headers)
    try:
        length = float(r.headers['content-length'])
        count = 0
        count_tmp = 0
        time1 = time.time()
        # 'with' guarantees the file handle is closed even if a chunk
        # write raises mid-download (the original leaked it in that case).
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if chunk:
                    f.write(chunk)
                    count += len(chunk)
                    if time.time() - time1 > 2:
                        p = count / length * 100
                        # MB/s over the ~2 second window since the last report
                        speed = (count - count_tmp) / 1024 / 1024 / 2
                        count_tmp = count
                        print(name + ': ' + formatFloat(p) + '%' + ' Speed: ' + formatFloat(speed) + 'M/S')
                        time1 = time.time()
    except Exception:
        # Best-effort batch download: report the failure and carry on so
        # one bad URL does not abort the whole crawl.
        print('该文件下载失败,需手动下载:' + url)
def formatFloat(num):
    """Render *num* with exactly two decimal places, e.g. 3.14159 -> '3.14'."""
    return f'{num:.2f}'
def createFolder(folder_name):
    """Create the directory *folder_name* unless it already exists.

    :param folder_name: directory path to create
    :return: True when the directory already existed, None after creating it
             (return shape kept for backward compatibility — callers ignore it)
    """
    if os.path.exists(folder_name):
        print('该文件夹存在')
        return True
    print('开始创建文件夹:' + folder_name)
    # makedirs also creates missing parent directories, unlike os.mkdir,
    # so nested per-product folders cannot fail on an absent parent.
    os.makedirs(folder_name)
def downloadPhotoByUrl(fo_, url):
    """Parse one product page and save its description text plus all images
    into a sub-folder of *fo_* named after the (sanitized) product title.

    :param fo_: parent folder (must already exist)
    :param url: product detail page URL
    :return: None
    """
    res = requests.get(url)
    res.encoding = 'utf-8'
    bs_ = BeautifulSoup(res.text, 'lxml')
    div_title = bs_.find('div', class_="Block Moveable Panel PrimaryProductDetails")
    h2 = div_title.find('h2')
    # Sanitize the product title into a filesystem-safe folder name:
    # quotes -> '_', '/' -> '1', spaces -> '2', capped at 20 characters.
    folder_name = h2.text.replace('"', "_").replace('/', '1').replace(' ', '2')
    if len(folder_name) > 20:
        folder_name = folder_name[0:20]
    fod_ = fo_ + '/' + folder_name
    createFolder(fod_)
    div_des = bs_.find('div', class_='ProductDescriptionContainer')
    print(div_des.text)
    txt_name = fod_ + '/' + '描述文本.txt'
    # NOTE(review): the original hand-stripped HTML tags (<br>, <p>, ...) from
    # str(div_des), but those string literals were corrupted in this source
    # copy; get_text() extracts the same visible text reliably.
    with open(txt_name, 'w+', encoding="utf-8") as f:
        f.write(div_des.get_text('\n'))
    div_ = bs_.find('div', class_='ProductTinyImageList')
    a_list = div_.find_all('a')
    for a in a_list:
        # rel[5] holds a JSON-ish fragment containing the large-image URL;
        # strip the stray quote and brace characters around it.
        _url = a.get('rel')[5]
        download_url = _url.replace('"', '').replace('}', '')
        print(download_url)
        pos_first = download_url.rfind('/')
        pos_last = download_url.rfind('?')
        photo_name = download_url[pos_first + 1:pos_last].replace('/', '1')
        kod_name = fod_ + '/' + photo_name
        if os.path.exists(kod_name):
            print(kod_name)  # already downloaded: skip
        else:
            urllib.request.urlretrieve(download_url, kod_name)


def _downloadListingPage(url_, folder):
    """Fetch one listing page and download every product it links to into *folder*."""
    res = requests.get(url_)
    res.encoding = 'utf-8'
    bs_ = BeautifulSoup(res.text, 'lxml')
    product_list = bs_.find('ul', class_='ProductList')
    for dd in product_list.find_all('div', class_='ProductImage QuickView'):
        link_url = dd.find('a').get('href')
        downloadPhotoByUrl(folder, link_url)


def analysisUrl(url_):
    """Derive a top-level folder name from the listing URL, create it, and
    download every product on the page.

    Handles both plain category URLs (http://site/category/) and paginated
    ones (...?sort=bestselling&page=2).

    :param url_: category/listing page URL
    :return: None
    """
    if url_.count("page=") > 0:
        # Paginated URL: drop the '?sort=bestselling&page=N' suffix before
        # extracting the category name.
        x_pos = url_.find('?sort=bestselling&page=')
        base = url_[0:x_pos]
    else:
        base = url_
    # Last path segment (minus trailing slash) names the top-level folder.
    xx_ = base[:-1]
    bfolder_name = xx_[xx_.rfind('/') + 1:].replace("/", '1')
    if len(bfolder_name) > 10:
        bfolder_name = bfolder_name[0:10]
    print(bfolder_name)
    createFolder(bfolder_name)
    # The original duplicated the fetch/parse/download loop in both the
    # long-name and short-name branches; it is factored out here.
    _downloadListingPage(url_, bfolder_name)


def MainKind(url):
    """Main entry point: crawl *url* and announce completion."""
    analysisUrl(url)
    print('所有下载完成!!!!!!!!!!')


def DemoBtn():
    """'Start download' button handler: crawl the URL typed into the entry
    box on a background thread so the GUI stays responsive."""
    print('demobtn')
    value = txt_city.get()
    _thread.start_new_thread(MainKind, (value,))


if __name__ == '__main__':
    # Minimal Tk window: one URL entry box and a start button.
    root = Tk()
    root.title("爬虫下载工具")
    root.geometry('500x400')  # width x height
    root.resizable(width=True, height=True)
    txt_city = Entry(root)  # URL input box, read by DemoBtn
    txt_city.place(x=10, y=150, width=450, height=30)
    btn_ScreenShot = Button(root, text="开始下载", command=DemoBtn)
    btn_ScreenShot.place(width=90, height=30, x=20, y=300)
    root.mainloop()

# 2.下载某网站的商品信息爬取
# (section heading from the original paste — an earlier copy of this script follows)
import requests
from bs4 import BeautifulSoup
import os
import time
from tkinter import *
import threading
import _thread
import urllib
def downloadFile(name, urla_):
    """Download *urla_* to the local path *name*, printing progress every ~2s.

    :param name: destination file path; if it already exists the download
                 is skipped entirely
    :param urla_: download URL (surrounding whitespace is stripped)
    :return: None
    """
    url = urla_.strip()
    print(url)
    headers = {'Proxy-Connection': 'keep-alive'}
    if os.path.exists(name):
        print('file exists')
        return
    # Bug fix: the headers dict was built but never passed to requests.get.
    # stream=True keeps large files out of memory.
    r = requests.get(url, stream=True, headers=headers)
    try:
        length = float(r.headers['content-length'])
        count = 0
        count_tmp = 0
        time1 = time.time()
        # 'with' guarantees the file handle is closed even if a chunk
        # write raises mid-download (the original leaked it in that case).
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if chunk:
                    f.write(chunk)
                    count += len(chunk)
                    if time.time() - time1 > 2:
                        p = count / length * 100
                        # MB/s over the ~2 second window since the last report
                        speed = (count - count_tmp) / 1024 / 1024 / 2
                        count_tmp = count
                        print(name + ': ' + formatFloat(p) + '%' + ' Speed: ' + formatFloat(speed) + 'M/S')
                        time1 = time.time()
    except Exception:
        # Best-effort batch download: report the failure and carry on so
        # one bad URL does not abort the whole crawl.
        print('该文件下载失败,需手动下载:' + url)
def formatFloat(num):
    """Render *num* with exactly two decimal places, e.g. 3.14159 -> '3.14'."""
    return f'{num:.2f}'
def createFolder(folder_name):
    """Create the directory *folder_name* unless it already exists.

    :param folder_name: directory path to create
    :return: True when the directory already existed, None after creating it
             (return shape kept for backward compatibility — callers ignore it)
    """
    if os.path.exists(folder_name):
        print('该文件夹存在')
        return True
    print('开始创建文件夹:' + folder_name)
    # makedirs also creates missing parent directories, unlike os.mkdir,
    # so nested per-product folders cannot fail on an absent parent.
    os.makedirs(folder_name)
def downloadPhotoByUrl(fo_, url):
    """Parse one product page and save its description text plus all images
    into a sub-folder of *fo_* named after the product title.

    :param fo_: parent folder (must already exist)
    :param url: product detail page URL
    :return: None
    """
    print('进行下载')
    res = requests.get(url)
    res.encoding = 'utf-8'
    bs_ = BeautifulSoup(res.text, 'lxml')
    div_title = bs_.find('div', class_="Block Moveable Panel PrimaryProductDetails")
    h2 = div_title.find('h2')
    # Product title becomes the folder name; '/' is illegal in a path segment.
    folder_name = h2.text
    xx_ = folder_name.replace('/', '_')
    print("其中的folder:" + xx_)
    fod_ = fo_ + '/' + xx_
    createFolder(fod_)
    div_des = bs_.find('div', class_='ProductDescriptionContainer')
    print(div_des)
    xx_p = div_des.find_all('p')
    txt_name = fod_ + '/' + '描述文本.txt'
    # utf-8 avoids a UnicodeEncodeError on non-ASCII descriptions when the
    # platform default encoding is not UTF-8; 'with' guarantees the close.
    with open(txt_name, 'w+', encoding='utf-8') as f:
        for i, xx in enumerate(xx_p):
            if i == 0:
                # First <p> is the title line.
                f.writelines(xx.text)
                continue
            # NOTE(review): the original stripped <br>/<p> tags by hand with
            # str.replace, but those string literals were corrupted in this
            # source; get_text() extracts the same visible text reliably.
            xc = xx.get_text('\n')
            print(xc)
            f.write(xc)
    div_ = bs_.find('div', class_='ProductTinyImageList')
    a_list = div_.find_all('a')
    for a in a_list:
        # rel[5] holds a JSON-ish fragment containing the large-image URL;
        # strip the stray quote and brace characters around it.
        _url = a.get('rel')[5]
        download_url = _url.replace('"', '').replace('}', '')
        print(download_url)
        pos_first = download_url.rfind('/')
        pos_last = download_url.rfind('?')
        photo_name = download_url[pos_first + 1:pos_last]
        print(photo_name)
        kod_name = fod_ + '/' + photo_name
        urllib.request.urlretrieve(download_url, kod_name)
def analysisUrl(url_):
    """Derive the top-level folder name from the listing URL, create it,
    then download every product linked from the listing page.

    :param url_: category/listing page URL, e.g. http://site/category/
    :return: None
    """
    # rstrip handles URLs both with and without a trailing slash; the
    # original url_[:-1] chopped the last character of the category name
    # whenever the slash was absent.
    xx_ = url_.rstrip('/')
    xx_pos = xx_.rfind('/')
    # Top-level folder is named after the last URL path segment.
    bfolder_name = xx_[xx_pos + 1:]
    createFolder(bfolder_name)
    res = requests.get(url_)
    res.encoding = 'utf-8'
    bs_ = BeautifulSoup(res.text, 'lxml')
    product_list = bs_.find('ul', class_='ProductList')
    dob_ = product_list.find_all('div', class_='ProductImage QuickView')
    for dd in dob_:
        a_link = dd.find('a')
        print(a_link.get('href'))
        link_url = a_link.get('href')
        downloadPhotoByUrl(bfolder_name, link_url)
def MainKind(url):
    """Top-level crawl entry point.

    :param url: listing page URL to crawl
    :return: None
    """
    analysisUrl(url)
    print('所有下载完成!!!!!!!!!!')
def DemoBtn():
    """'Start download' button handler: read the URL from the entry box and
    crawl it on a background thread so the GUI stays responsive."""
    print('demobtn')
    entered_url = txt_city.get()
    _thread.start_new_thread(MainKind, (entered_url,))
if __name__ == '__main__':
    # Minimal Tk window: one URL entry box and a start button.
    root = Tk()
    root.title("爬虫下载工具")
    root.geometry('500x400')  # width x height
    root.resizable(width=True, height=True)
    txt_city = Entry(root)  # URL input box, read by DemoBtn
    txt_city.place(x=10, y=150, width=450, height=30)
    btn_ScreenShot = Button(root, text="开始下载", command=DemoBtn)
    btn_ScreenShot.place(width=90, height=30, x=20, y=300)
    root.mainloop()