A recent batch of web-crawler scripts

    1. Scraping past exam papers from a graduate-entrance-exam site (for learning purposes only, not for any commercial use)
import requests
from bs4 import BeautifulSoup
import os
import time
from tkinter import *
import threading
import _thread
import urllib

def downloadFile(name, urla_):
    '''
    :param name: file name to save as
    :param urla_: download URL
    :return:
    '''
    url = urla_.strip()
    print(url)
    headers = {'Proxy-Connection': 'keep-alive'}
    if os.path.exists(name):
        print('file exists')
        return
    r = requests.get(url, stream=True, headers=headers)
    try:
        length = float(r.headers['content-length'])
        f = open(name, 'wb')
        count = 0
        count_tmp = 0
        time1 = time.time()
        for chunk in r.iter_content(chunk_size=512):
            if chunk:
                f.write(chunk)
                count += len(chunk)
                if time.time() - time1 > 2:
                    p = count / length * 100
                    speed = (count - count_tmp) / 1024 / 1024 / 2
                    count_tmp = count
                    print(name + ': ' + formatFloat(p) + '%' + ' Speed: ' + formatFloat(speed) + ' MB/s')
                    time1 = time.time()
        f.close()
    except Exception:
        print('Download failed for this file, download it manually: ' + url)


def formatFloat(num):
    return '{:.2f}'.format(num)

def createFolder(folder_name):
    '''
    Create a folder
    :param folder_name: folder name
    :return:
    '''
    if os.path.exists(folder_name):
        print('folder already exists')
        return True
    print('creating folder: ' + folder_name)
    os.mkdir(folder_name)


def downloadPhotoByUrl(fo_,url):
    '''
    Parse the product page URL, then download its images
    :param fo_: parent folder
    :param url: product page URL
    :return:
    '''
    # print('downloading')
    res = requests.get(url)
    res.encoding = 'utf-8'
    bs_ = BeautifulSoup(res.text,'lxml')
    div_title = bs_.find('div',class_="Block Moveable Panel PrimaryProductDetails")
    h2 = div_title.find('h2')
    # name of the folder to be created
    folder_name_ = h2.text
    folder_name = folder_name_.replace('''"''',"_")
    xx_ = folder_name.replace('/','1')
    yy_ = xx_.replace(' ','2')
    if len(yy_) > 20:
        yy_ = yy_[0:20]
    #print("其中的folder:" +xx_)
    fod_ = fo_ + '/'+yy_
    createFolder(fod_)
    #print(bs_.contents)
    div_des = bs_.find('div',class_= 'ProductDescriptionContainer')
    print(div_des.text)
    #print(div_des.text)
    xx_p = div_des.find_all('p')
    i = 0
    txt_name = fod_ + '/' + '描述文本.txt'
    f = open(txt_name, 'w+', encoding="utf-8")  # write as UTF-8
    #f.write(div_des.text)
    str_ = str(div_des)
    # NOTE: the HTML-tag string literals in the next block were eaten by the blog
    # formatting; '<br/>', '<br>', '<p>', '</p>', '<div>', '</div>', '<strong>'
    # below are reconstructed guesses, not the original values.
    lv_ = str_.replace("<br/>", "\n")
    div_str = lv_.replace("<br>", "")
    ls_pp = div_str.replace("<p>", "\n")
    pp_ = ls_pp.replace("</p>", "")
    pos_wq = ls_pp.find(">")   # skip past the opening tag of the description div
    xxx = pp_[pos_wq + 1:]
    # xx11_pos = xxx.find('')  # literal lost in formatting; the variable was unused
    xxx1_ss = xxx.find('''">''')
    sxz = xxx[xxx1_ss + 2:]
    xxa = sxz.replace("<div>", "")        # strip leftover wrapper tags
    prss = xxa.replace("</div>", "")
    sxxaa = prss.replace("<strong>", "")
    f.write(sxxaa)
    # write the title and each cleaned paragraph (xc) into the file
    # for xx in xx_p:
    #     if i == 0:
    #         # title
    #         title_ = xx.text
    #         print(title_)
    #         f.writelines(title_)
    #         i = i + 1
    #         continue
    #     str_ = str(xx)
    #     xa = str_.replace('<br/>', "\n")
    #     xb = xa.replace("<p>", "")
    #     xc = xb.replace("</p>", '')
    #     # xc is the final cleaned field
    #     print(xc)
    #     f.write(xc)
    f.close()
    div_ = bs_.find('div', class_='ProductTinyImageList')
    a_list = div_.find_all('a')
    # print(a_list)
    for a in a_list:
        # print(len(a.get('rel')))
        # print(a.get('rel')[5])
        # the link that needs to be downloaded
        _url = a.get('rel')[5]
        down_ = _url.replace('''"''', '')
        download_url = down_.replace("}", '')
        print(download_url)
        pos_first = download_url.rfind('/')
        pos_last = download_url.rfind('?')
        photo_name = download_url[pos_first + 1:pos_last]
        bf_ = photo_name.replace('/', '1')
        kod_name = fod_ + '/' + bf_
        # downloadFile(kod_name, download_url)
        if os.path.exists(kod_name):
            print(kod_name)
        else:
            urllib.request.urlretrieve(download_url, kod_name)


def analysisUrl(url_):
    # e.g. http://www.madhornets.com/diagnostic-scanner-adapter-obd-reader/?sort=bestselling&page=2
    str_ = "page="
    if url_.count(str_) > 0:
        x_pos = url_.find('?sort=bestselling&page=')
        url_se = url_[0:x_pos]
        xx_ = url_se[:-1]
        xx_pos = xx_.rfind('/')
        # top-level folder
        bfolder_name_ = xx_[xx_pos + 1:]
        bfolder_name = bfolder_name_.replace("/", '1')
        # shorten the top-level folder name if it is too long
        if len(bfolder_name) > 10:
            mk_folen = bfolder_name[0:10]
            createFolder(mk_folen)
            res = requests.get(url_)
            res.encoding = 'utf-8'
            bs_ = BeautifulSoup(res.text, 'lxml')
            # grab each product entry
            # print(bs_.contents)
            product_list = bs_.find('ul', class_='ProductList')
            dob_ = product_list.find_all('div', class_='ProductImage QuickView')
            for dd in dob_:
                a_link = dd.find('a')
                # print(a_link.get('href'))
                # product page URL
                link_url = a_link.get('href')
                downloadPhotoByUrl(mk_folen, link_url)
            return
        print(bfolder_name)
        createFolder(bfolder_name)  # create the folder
        res = requests.get(url_)
        res.encoding = 'utf-8'
        bs_ = BeautifulSoup(res.text, 'lxml')
        # grab each product entry
        # print(bs_.contents)
        product_list = bs_.find('ul', class_='ProductList')
        dob_ = product_list.find_all('div', class_='ProductImage QuickView')
        for dd in dob_:
            a_link = dd.find('a')
            # print(a_link.get('href'))
            # product page URL
            link_url = a_link.get('href')
            downloadPhotoByUrl(bfolder_name, link_url)
    else:
        xx_ = url_[:-1]
        xx_pos = xx_.rfind('/')
        # top-level folder
        bfolder_name_ = xx_[xx_pos + 1:]
        bfolder_name = bfolder_name_.replace("/", '1')
        # shorten the top-level folder name if it is too long
        if len(bfolder_name) > 10:
            mk_folen = bfolder_name[0:10]
            createFolder(mk_folen)
            res = requests.get(url_)
            res.encoding = 'utf-8'
            bs_ = BeautifulSoup(res.text, 'lxml')
            # grab each product entry
            # print(bs_.contents)
            product_list = bs_.find('ul', class_='ProductList')
            dob_ = product_list.find_all('div', class_='ProductImage QuickView')
            for dd in dob_:
                a_link = dd.find('a')
                # print(a_link.get('href'))
                # product page URL
                link_url = a_link.get('href')
                downloadPhotoByUrl(mk_folen, link_url)
            return
        print(bfolder_name)
        createFolder(bfolder_name)  # create the folder
        res = requests.get(url_)
        res.encoding = 'utf-8'
        bs_ = BeautifulSoup(res.text, 'lxml')
        # grab each product entry
        # print(bs_.contents)
        product_list = bs_.find('ul', class_='ProductList')
        dob_ = product_list.find_all('div', class_='ProductImage QuickView')
        for dd in dob_:
            a_link = dd.find('a')
            # print(a_link.get('href'))
            # product page URL
            link_url = a_link.get('href')
            downloadPhotoByUrl(bfolder_name, link_url)


def MainKind(url):
    '''
    Main entry point
    :param url:
    :return:
    '''
    analysisUrl(url)
    print('All downloads finished!!!!!!!!!!')


def DemoBtn():
    print('demobtn')
    value = txt_city.get()
    _thread.start_new_thread(MainKind, (value,))


if __name__ == '__main__':
    root = Tk()
    root.title("Crawler download tool")
    root.geometry('500x400')  # window size: width x height
    root.resizable(width=True, height=True)  # allow the window to be resized
    # add the URL input box
    txt_city = Entry(root)
    txt_city.place(x=10, y=150, width=450, height=30)
    btn_ScreenShot = Button(root, text="Start download", command=DemoBtn)
    btn_ScreenShot.place(width=90, height=30, x=20, y=300)
    root.mainloop()
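
The script above turns the product title into a folder name with a chain of replace calls ('"' to '_', '/' to '1', ' ' to '2') plus a 20-character cut-off. Below is a minimal sketch of the same idea collected into one helper; the names sanitize_folder_name and create_folder, the regex, and the example title are my own assumptions and are not part of the script.

import re
import os

def sanitize_folder_name(title, max_len=20):
    # Hypothetical helper: collapse every character that is awkward in a folder
    # name into '_' (the script above instead maps '"'->'_', '/'->'1', ' '->'2').
    safe = re.sub(r'[\\/:*?"<>| ]', '_', title)
    return safe[:max_len]

def create_folder(path):
    # makedirs with exist_ok=True replaces the manual "already exists" check
    # done by createFolder above, and also creates any missing parent folders.
    os.makedirs(path, exist_ok=True)

# Example with a made-up title:
# create_folder('obd-reader/' + sanitize_folder_name('OBD2 "Scanner" 16-pin / Adapter'))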

    2. Scraping product information from an online store

import requests
from bs4 import BeautifulSoup
import os
import time
from tkinter import *
import threading
import _thread
import urllib

def downloadFile(name, urla_):
    '''
    :param name: file name to save as
    :param urla_: download URL
    :return:
    '''
    url = urla_.strip()
    print(url)
    headers = {'Proxy-Connection': 'keep-alive'}
    if os.path.exists(name):
        print('file exists')
        return
    r = requests.get(url, stream=True, headers=headers)
    try:
        length = float(r.headers['content-length'])
        f = open(name, 'wb')
        count = 0
        count_tmp = 0
        time1 = time.time()
        for chunk in r.iter_content(chunk_size=512):
            if chunk:
                f.write(chunk)
                count += len(chunk)
                if time.time() - time1 > 2:
                    p = count / length * 100
                    speed = (count - count_tmp) / 1024 / 1024 / 2
                    count_tmp = count
                    print(name + ': ' + formatFloat(p) + '%' + ' Speed: ' + formatFloat(speed) + ' MB/s')
                    time1 = time.time()
        f.close()
    except Exception:
        print('Download failed for this file, download it manually: ' + url)


def formatFloat(num):
    return '{:.2f}'.format(num)

def createFolder(folder_name):
    '''
    Create a folder
    :param folder_name: folder name
    :return:
    '''
    if os.path.exists(folder_name):
        print('folder already exists')
        return True
    print('creating folder: ' + folder_name)
    os.mkdir(folder_name)


def downloadPhotoByUrl(fo_,url):
    '''
    Parse the product page URL, then download its images
    :param fo_: parent folder
    :param url: product page URL
    :return:
    '''
    print('downloading')
    res = requests.get(url)
    res.encoding = 'utf-8'
    bs_ = BeautifulSoup(res.text,'lxml')
    div_title = bs_.find('div',class_="Block Moveable Panel PrimaryProductDetails")
    h2 = div_title.find('h2')
    # name of the folder to be created
    folder_name = h2.text
    xx_ = folder_name.replace('/','_')
    print("其中的folder:" +xx_)
    fod_ = fo_ + '/'+xx_
    createFolder(fod_)
    #print(bs_.contents)
    div_des = bs_.find('div',class_= 'ProductDescriptionContainer')
    print(div_des)
    #print(div_des.text)
    xx_p = div_des.find_all('p')
    i = 0
    txt_name = fod_ + '/' + '描述文本.txt'
    f = open(txt_name, 'w+', encoding="utf-8")  # UTF-8 so the description text writes cleanly
    # write the title and each cleaned paragraph into the file
    for xx in xx_p:
        if i == 0:
            # title
            title_ = xx.text
            f.writelines(title_)
            i = i + 1
            continue
        str_ = str(xx)
        # NOTE: the HTML-tag literals below were eaten by the blog formatting;
        # '<br/>', '<p>', '</p>' are reconstructed guesses, not the original values.
        xa = str_.replace('<br/>', "\n")
        xb = xa.replace("<p>", "")
        xc = xb.replace("</p>", '')
        # xc is the final cleaned field
        print(xc)
        f.write(xc)
    f.close()
    div_ = bs_.find('div', class_='ProductTinyImageList')
    a_list = div_.find_all('a')
    # print(a_list)
    for a in a_list:
        # print(len(a.get('rel')))
        # print(a.get('rel')[5])
        # the link that needs to be downloaded
        _url = a.get('rel')[5]
        down_ = _url.replace('''"''', '')
        download_url = down_.replace("}", '')
        print(download_url)
        pos_first = download_url.rfind('/')
        pos_last = download_url.rfind('?')
        photo_name = download_url[pos_first + 1:pos_last]
        print(photo_name)
        kod_name = fod_ + '/' + photo_name
        # downloadFile(kod_name, download_url)
        urllib.request.urlretrieve(download_url, kod_name)


def analysisUrl(url_):
    xx_ = url_[:-1]
    xx_pos = xx_.rfind('/')
    # top-level folder
    bfolder_name = xx_[xx_pos + 1:]
    createFolder(bfolder_name)  # create the folder
    res = requests.get(url_)
    res.encoding = 'utf-8'
    bs_ = BeautifulSoup(res.text, 'lxml')
    # grab each product entry
    # print(bs_.contents)
    product_list = bs_.find('ul', class_='ProductList')
    dob_ = product_list.find_all('div', class_='ProductImage QuickView')
    for dd in dob_:
        a_link = dd.find('a')
        print(a_link.get('href'))
        # product page URL
        link_url = a_link.get('href')
        downloadPhotoByUrl(bfolder_name, link_url)


def MainKind(url):
    '''
    Main entry point
    :param url:
    :return:
    '''
    analysisUrl(url)
    print('All downloads finished!!!!!!!!!!')


def DemoBtn():
    print('demobtn')
    value = txt_city.get()
    _thread.start_new_thread(MainKind, (value,))


if __name__ == '__main__':
    root = Tk()
    root.title("Crawler download tool")
    root.geometry('500x400')  # window size: width x height
    root.resizable(width=True, height=True)  # allow the window to be resized
    # add the URL input box
    txt_city = Entry(root)
    txt_city.place(x=10, y=150, width=450, height=30)
    btn_ScreenShot = Button(root, text="Start download", command=DemoBtn)
    btn_ScreenShot.place(width=90, height=30, x=20, y=300)
    root.mainloop()
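
Both scripts start the download from the Tkinter button via _thread.start_new_thread so the window stays responsive while files are fetched. The sketch below shows the same idea with the higher-level threading module, which both scripts already import; DemoBtnThreaded is a hypothetical name, and it assumes the txt_city Entry and MainKind from the script above are in scope.

import threading

def DemoBtnThreaded():
    # Hypothetical variant of DemoBtn: threading.Thread instead of
    # _thread.start_new_thread. daemon=True lets the Tk window close
    # even if a download is still running in the background.
    value = txt_city.get()
    worker = threading.Thread(target=MainKind, args=(value,), daemon=True)
    worker.start()

# Wire it to the button the same way as in the script:
# btn_ScreenShot = Button(root, text="Start download", command=DemoBtnThreaded)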
