爬取名称、md5值、下载链接,批量下载升级包,计算文件MD5值

今日学习

功能介绍
  • 计算指定文件的md5值
  • 爬取绿盟升级包名称和md5值,并生成excel
  • 计算指定文件夹里的所有文件的md5值并与官网上爬取的绿盟升级包名称和md5值做比较
  • 自动下载同一类型(增量更新)升级包到指定文件夹
import requests
from lxml import etree
import os
import hashlib
import getopt
import sys
import xlwt
# Module-level defaults for the page address and the local file/folder path.
# Both start out empty and are overwritten by the command-line option parsing
# at the bottom of the file.
url = ""
path = ""
# Scrape the package names and MD5 values from the web page into a dict.
def spider_filename_md5(url):
    """Fetch *url* and return a ``{name: md5}`` dict scraped from its table.

    The response body is decoded as UTF-8 and parsed with lxml. The text of
    every link found in the table cells is used as a name; the stripped text
    nodes of the first cell of the table's second row are used as MD5 values.
    Names and values are paired positionally, so the shorter list decides how
    many entries the dict gets.

    Side effect: the resulting dict is also passed to ``excel``.
    """
    table = '/html/body/section/div/section/div/div[2]/div[2]/table'
    page_text = str(requests.get(url).content, encoding='utf-8')
    tree = etree.HTML(page_text)

    link_nodes = tree.xpath(table + '/tr/td/a')
    digest_nodes = tree.xpath(table + '/tr[2]/td[1]/text()')

    names = [str(node.text) for node in link_nodes]
    digests = [str(text.strip()) for text in digest_nodes]

    dic1 = dict(zip(names, digests))
    excel(dic1)
    return dic1

def excel(dic):
    """Write *dic* to ``md5.xls`` in the current working directory.

    The workbook has a single sheet named ``sheet1`` with a ``name`` / ``md5``
    header row, followed by one row per entry of *dic* (key in the first
    column, value in the second).
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('sheet1')

    # Header row.
    for column, title in enumerate(("name", "md5")):
        sheet.write(0, column, label=title)

    # Data rows start directly below the header.
    for row, key in enumerate(dic, start=1):
        sheet.write(row, 0, label=key)
        sheet.write(row, 1, label=dic[key])

    book.save('md5.xls')
# Compute the MD5 digest of a file.
def md5(path, Bytes=1024):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is opened in binary mode and fed to the hash ``Bytes`` bytes at
    a time, so the whole file never has to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b''.
        for chunk in iter(lambda: stream.read(Bytes), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Collect the name and MD5 value of every file in a local folder into a dict.
def local_filename_md5(path):
    """Return ``{file name: md5 hex digest}`` for the files directly in *path*.

    Paths are built with ``os.path.join`` so the function works with the
    platform's own separator rather than a hard-coded backslash. Entries that
    are not regular files (for example sub-directories) are skipped, because
    they cannot be opened and hashed.

    Raises whatever ``os.listdir`` raises when *path* cannot be listed.
    """
    dic2 = {}
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if not os.path.isfile(full_path):
            continue
        dic2[entry] = md5(full_path)
    return dic2
# Compare the MD5 values of local files with the reference values.
def duibi(dic1,dic2):
    """Report local files whose MD5 differs from, or is absent in, *dic1*.

    *dic1* holds the reference ``{name: md5}`` pairs and *dic2* the local
    ones. For every name in *dic2* a line is printed when the name is unknown
    to *dic1* or when the two MD5 values differ; matching files print nothing.
    Returns ``None``.
    """
    for filename, local_digest in dic2.items():
        if filename not in dic1:
            print('目标网址无此本地文件:'+filename)
            continue
        if local_digest != dic1[filename]:
            print(filename+':md5值不正确')

# Automatically download the (incremental) upgrade packages from the site.
def down_file(url):
    """Download every upgrade package linked from the page at *url*.

    The page is decoded as UTF-8 and parsed with lxml. Download links are the
    ``href`` values found in the first cell of the table's first row, prefixed
    with ``http://update.nsfocus.com``; they are paired positionally with the
    link texts of the table, which serve as file names. Files are saved in a
    folder named after the first ``<h2>`` heading next to the table, created
    under the current working directory.

    Differences from a naive "create folder or else write" loop:

    * the folder is created up front, so the first package is saved too;
    * a package that already exists locally is not downloaded again;
    * a response with an HTTP error status is reported and not saved;
    * paths use ``os.path.join`` instead of a hard-coded backslash.

    Returns ``None``. Raises ``IndexError`` when links are found but the page
    has no matching ``<h2>`` heading.
    """
    container = '/html/body/section/div/section/div/div[2]/div[2]'
    response = requests.get(url)
    html = str(response.content, encoding='utf-8')
    s = etree.HTML(html)

    down_path = s.xpath(container + '/table/tr[1]/td[1]/a/@href')
    down_path = [str('http://update.nsfocus.com' + i) for i in down_path]
    name = s.xpath(container + '/table/tr/td/a')
    name = [str(i.text) for i in name]
    # zip() stops at the shorter list, so every link kept here has a name.
    dic3 = dict(zip(down_path, name))
    if not dic3:
        # Nothing to download; do not create a folder.
        return

    file_folder_name = s.xpath(container + '/h2')
    file_folder_name = [str(i.text) for i in file_folder_name]
    headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding':'gzip, deflate',
            'Connection':'close',
            'Upgrade-Insecure-Requests':'1',
        }

    target_dir = os.path.join(os.path.abspath('.'), str(file_folder_name[0]))
    os.makedirs(target_dir, exist_ok=True)

    for link, filename in dic3.items():
        target_file = os.path.join(target_dir, str(filename))
        if os.path.exists(target_file):
            # Already downloaded earlier; skip the network round trip.
            continue
        download = requests.get(link, headers=headers)
        if not download.ok:
            # Do not store an HTTP error page as if it were a package.
            print('下载失败:' + link)
            continue
        with open(target_file, 'wb') as f:
            f.write(download.content)

def use():
    """Print the command-line help: a header plus one line per mode."""
    help_lines = (
        "helpinfo:",
        "获取指定文件的md5值。格式:-m -p 文件路径包含文件名字;例子 python md5.py -m -p c:\\a\\p.bat",
        "爬取指定网页升级包名称和md5值,并生成md5.excel。格式:-n -u url ;例子 python md5.py -n -u http://*.*.*.* ",
        "获取指定文件夹内所有文件的md5值,并与官网比较。格式:-l -u url -s 文件夹目录 ",
        "下载指定网页的升级包(增量更新) 格式:-d -u url",
    )
    for line in help_lines:
        print(line)
# Parse the command line. "-p", "-s" and "-u" take a value; the other options
# are plain flags. "-s" is the folder option shown in the help text for the
# compare mode, so it must accept a value just like "-p".
opts,args = getopt.getopt(sys.argv[1:], "hldmnp:s:u:")
if __name__ == '__main__':
    # Selected mode: 1 = hash one file, 2 = scrape the page to md5.xls,
    # 3 = compare a local folder with the page, 4 = download the packages.
    listen = None
    for a,b in opts:
        if a == "-h":
            use()
        elif a == "-m":
            listen = 1
        elif a == "-n":
            listen = 2
        elif a == "-l":
            listen = 3
        elif a in ("-p", "-s"):
            # Both options set the local path: a file for -m, a folder for -l.
            path = b
        elif a == "-u":
            url = b
        elif a == "-d":
            listen = 4
        else:
            assert False, "Unhandled Option"
    if listen == 1:
        print(md5(path))
    elif listen == 2:
        spider_filename_md5(url)
    elif listen == 3:
        duibi(spider_filename_md5(url),local_filename_md5(path))
    elif listen == 4:
        down_file(url)
    # With no mode flag (for example only -h) there is nothing more to do.

你可能感兴趣的:(爬取名称、md5值、下载链接,批量下载升级包,计算文件MD5值)