python多线程下载文件模板代码(自用)

先说一下,这是我下载文件时经常使用的一段代码,拿去改改配置就可以用了。

主要思路是:异步地操作数据库,多线程地下载文件。前提是数据库里的文件名必须由自己设计成唯一的!这一点没有在这份代码里体现。最终的文件路径 = 配置文件里的路径前缀(前缀地址)+ 年 + 月 + 文件名(后缀地址),这就是文件最终的存放地址。另外再说一下,数据库里我没有存文件的最终地址,只存了文件的后缀地址,这样当存盘路径发生变化时,只需要改配置就行了。

配置文件中的前缀地址:
python多线程下载文件模板代码(自用)_第1张图片
数据库中的后缀地址:
在这里插入图片描述
在给大家看一下目录结构:
python多线程下载文件模板代码(自用)_第2张图片

db_config.py:

这个文件中除了数据库的相关配置,也存放了其他的一些配置啊!!!!

# -*- coding: UTF-8 -*-
import pymysql
# Miscellaneous (non-database) settings.
# Pool of User-Agent strings; the downloader sends one of them as the
# "User-Agent" request header. The list holds 18 entries (indices 0-17).
USER_AGENT_LIST=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Prefix directory for downloaded files; the downloader appends the path
# suffix stored in the database to this value to get the final file path.
WENJIANDIZHI="/home/enterprise/staticrec/policy"

# TODO: a log-file setting probably belongs here as well.
# Database connection settings.
DB_TEST_HOST = "127.0.0.1"
DB_TEST_PORT = 3306
# NOTE(review): this value is handed to the pool as the database (`db`)
# argument; the placeholder text literally says "table name" - replace it
# with the real database name.
DB_TEST_DBNAME = "表名"
DB_TEST_USER = "user"
DB_TEST_PASSWORD = "pwd"
# Character set used for the database connection.
# Alternative kept for reference: DB_CHARSET = "utf8mb4"
DB_CHARSET = "utf8"
# mincached: number of idle connections opened at startup (default 0 = no connections are created up front).
DB_MIN_CACHED = 10

# maxcached: maximum number of idle connections kept in the pool (default 0 = unlimited pool size).
DB_MAX_CACHED = 20

# maxshared: maximum number of shared connections (default 0 = every connection is dedicated). Once the limit is reached, connections requested as shareable are shared.
DB_MAX_SHARED = 20

# maxconnections: maximum number of connections the pool may create (default 0 = unlimited).
# NOTE: the constant name is misspelled ("CONNECYIONS") but other modules reference it, so it must stay as is.
DB_MAX_CONNECYIONS = 100

# blocking: behaviour when the pool has reached its maximum (default 0/False = report an error; anything else = block until a connection becomes available).
DB_BLOCKING = True

# maxusage: maximum number of times a single connection may be reused (default 0/False = unlimited). When the limit is reached the connection is closed and reopened automatically.
DB_MAX_USAGE = 0

# setsession: optional list of SQL commands used to prepare each session, e.g. ["set datestyle to german", ...].
DB_SET_SESSION = None

# creator: the DB-API module used to create the connections.
DB_CREATOR = pymysql

#其他的配置文件


db_dbutils_init.py:

from DBUtils.PooledDB import PooledDB
from down import db_config as config

"""
@功能:创建数据库连接池
"""


class MyConnectionPool(object):
    """Wrapper that hands out connections from one shared ``PooledDB`` pool.

    The pool is built lazily from the values in ``config`` the first time a
    connection is requested and is stored on the class, so every instance of
    this wrapper draws from the same pool.

    The instance can be used as a context manager::

        with MyConnectionPool() as db:
            db.cursor.execute(...)

    or connections can be borrowed explicitly through :meth:`getconn`.
    """

    # The shared pool; created on first use and stored on the class.
    __pool = None

    def __enter__(self):
        """Borrow a connection and a cursor and return this wrapper.

        The borrowed objects are available as ``self.conn`` and
        ``self.cursor`` until the ``with`` block exits.
        """
        self.conn = self.__getconn()
        self.cursor = self.conn.cursor()
        # Returning self makes ``with MyConnectionPool() as db`` usable.
        return self

    def __getconn(self):
        """Return a pooled connection, creating the pool on first use."""
        if MyConnectionPool.__pool is None:
            # Assign on the class, not on the instance: an instance attribute
            # would give every wrapper object its own private pool.
            MyConnectionPool.__pool = PooledDB(
                creator=config.DB_CREATOR,
                mincached=config.DB_MIN_CACHED,
                maxcached=config.DB_MAX_CACHED,
                maxshared=config.DB_MAX_SHARED,
                maxconnections=config.DB_MAX_CONNECYIONS,
                blocking=config.DB_BLOCKING,
                maxusage=config.DB_MAX_USAGE,
                setsession=config.DB_SET_SESSION,
                host=config.DB_TEST_HOST,
                port=config.DB_TEST_PORT,
                user=config.DB_TEST_USER,
                passwd=config.DB_TEST_PASSWORD,
                db=config.DB_TEST_DBNAME,
                charset=config.DB_CHARSET
            )
        return MyConnectionPool.__pool.connection()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the cursor and the connection opened by ``__enter__``.

        Exceptions raised inside the ``with`` block are not suppressed.
        """
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor failed.
            self.conn.close()

    def getconn(self):
        """Borrow a connection from the pool.

        :return: a ``(cursor, conn)`` tuple; the caller is responsible for
                 closing both when done.
        """
        conn = self.__getconn()
        cursor = conn.cursor()
        return cursor, conn


# Factory for the connection-pool wrapper.
def get_my_connection():
    """Create and return a new ``MyConnectionPool`` wrapper object."""
    pool_wrapper = MyConnectionPool()
    return pool_wrapper

downFile.py:

from queue import Queue
from concurrent.futures import ThreadPoolExecutor
import requests
import random
import os
from down import db_config as config
import logging
import time
from down.mysqlhelper import MySqLHelper
# Demo of a first-in, first-out queue.
def test_queue():
    """Fill a bounded FIFO queue with 0..4 and print the items in order."""
    fifo = Queue(10)
    # The real program would size the queue from the number of database rows.
    for number in range(5):
        fifo.put(number)
    while True:
        if fifo.empty():
            break
        print(fifo.get())
# Demo of a last-in, first-out queue (a stack).
def test_LifoQueue():
    """Push 1, 2, 3 onto a LIFO queue and print them as they are popped."""
    import queue
    stack = queue.LifoQueue(3)
    for value in (1, 2, 3):
        stack.put(value)
    for _ in range(3):
        print(stack.get())
# Demo of a priority queue.
def test_PriorityQueue():
    """Insert three (priority, label) pairs and print them by priority.

    Priorities are numbers; the smallest number is retrieved first.
    """
    import queue
    heap = queue.PriorityQueue(3)
    entries = [(10, 'a'), (-1, 'b'), (100, 'c')]
    for entry in entries:
        heap.put(entry)
    for _ in entries:
        print(heap.get())
# Prepare the target directory and open the target file for writing.
def muluwenjian(filePath):
    """Create the parent directory of *filePath* and open a fresh binary file.

    Any existing file at *filePath* is removed first, so the returned handle
    always refers to a new, empty file.

    :param filePath: path of the file to create.
    :return: a file object opened in ``'wb'`` mode; the caller must close it.
    """
    # os.path.dirname copes with paths that contain no separator (returns '')
    # and with paths directly under the root, unlike slicing at rfind('/').
    directory = os.path.dirname(filePath)
    if directory:
        # exist_ok avoids a race when several threads create the same folder.
        os.makedirs(directory, exist_ok=True)
    try:
        os.remove(filePath)
    except FileNotFoundError:
        # Nothing to replace.
        pass
    return open(filePath, 'wb')

def _set_crawl_status(record_id, status):
    """Store *status* in ``crawl_status`` for the row with id *record_id*.

    :raises Exception: when the database helper raises; the message carries
                       the statement that failed.
    """
    try:
        # Parameterised statement instead of string formatting.
        db.update(
            'update t_policy_file_crawl set crawl_status=%s where id=%s',
            (status, str(record_id)),
        )
    except Exception as e:
        raise Exception(
            '数据库插入失败--' + str(e)
            + 'update t_policy_file_crawl set crawl_status={} where id="{}"'.format(status, str(record_id))
        )


def download(mes):
    """Download one file and record the outcome in the database.

    The file is saved to ``config.WENJIANDIZHI + mes[2]``. On success the row
    is marked with crawl_status "1"; when the HTTP request itself fails the
    row is marked with crawl_status "2". Every failure is logged and
    swallowed, so this function never raises to the worker thread.

    :param mes: indexable row; ``mes[0]`` is the row id, ``mes[1]`` the URL
                and ``mes[2]`` the path suffix appended to the configured
                prefix directory.
    :return: None
    """
    # Pre-bind the names used by the failure log so the handler below cannot
    # itself fail when the row is malformed.
    url = None
    filePath = None
    try:
        record_id = mes[0]
        url = mes[1]
        filePath = str(config.WENJIANDIZHI + mes[2])
        # random.choice covers every entry and cannot index past the end.
        header = {"User-Agent": random.choice(config.USER_AGENT_LIST)}
        try:
            # (connect, read) timeouts keep a dead server from blocking a
            # worker thread forever.
            res = requests.get(url, headers=header, timeout=(10, 60))
        except Exception as e:
            _set_crawl_status(record_id, "2")
            raise Exception("这里应该是我们的url链接出现了问题\r\n异常原因" + str(e))
        # Random pause after each request to throttle the crawl.
        time.sleep(random.randint(3, 6))

        if res.status_code != 200:
            raise Exception('我们的响应状态码不对。这里报错')

        # The with-block closes the file even if writing fails.
        with muluwenjian(filePath) as fp:
            fp.write(res.content)
        logging.debug('成功了一个:' + filePath)
        _set_crawl_status(record_id, "1")
    except Exception as reason:
        logging.error('下载失败')
        logging.error('url:%s', url)
        logging.error('path:%s', filePath)
        logging.error(reason)
    finally:
        logging.debug('----------------')
def btostr(bss):
    """Return the rows of *bss* as lists with every ``bytes`` cell decoded.

    :param bss: iterable of rows, each row an iterable of cells.
    :return: list of lists; ``bytes`` cells are decoded as UTF-8, all other
             cells are passed through unchanged.
    """
    return [
        [cell.decode('utf-8') if isinstance(cell, bytes) else cell for cell in row]
        for row in bss
    ]
if __name__ == '__main__':
    LOG_FORMAT = "%(asctime)s %(name)s %(levelname)s %(pathname)s %(message)s "  # log record layout
    DATE_FORMAT = '%Y-%m-%d  %H:%M:%S %a '  # timestamp layout (year-month-day order)
    LOG_FILE = r"logs/down.log"
    # basicConfig opens the log file immediately and fails when its directory
    # is missing, so create the directory first.
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    logging.basicConfig(level=logging.DEBUG,
                        format=LOG_FORMAT,
                        datefmt=DATE_FORMAT,
                        filemode='a',  # append to the log file instead of truncating it on every start
                        filename=LOG_FILE  # with a filename set, records go to the file, not the console
                        )

    # `db` is read as a module-level global by the download workers.
    db = MySqLHelper()
    # crawl_status is 0 for rows that have not been downloaded yet and 1 for
    # rows that have; select the pending ones.
    res = db.selectMany('select id,file_url,file_location from t_policy_file_crawl where crawl_status=0')
    # selectMany yields 0 when the query fails and an empty sequence when
    # nothing is pending; neither can be handed to btostr as rows to process.
    rows = btostr(res) if res else []
    if not rows:
        logging.info('no pending downloads found')

    with ThreadPoolExecutor(max_workers=3) as executer:
        # Download on worker threads.
        # NOTE(review): only the first three pending rows are processed per
        # run; pass `rows` instead of `rows[0:3]` to process all of them.
        futures = [executer.submit(download, row) for row in rows[0:3]]

    # Surface unexpected worker crashes instead of discarding them silently.
    for future in futures:
        error = future.exception()
        if error is not None:
            logging.error('download worker crashed: %r', error)


mysqlhelper.py:

from down.db_dbutils_init import get_my_connection

"""执行语句查询有结果返回结果没有返回0;增/删/改返回变更数据条数,没有返回0"""


class MySqLHelper(object):
    """Singleton helper that runs SQL statements on pooled connections.

    Each query method borrows a connection from ``self.db``, runs one
    statement, commits, and always hands the connection back. When the
    statement fails, the error is printed, the transaction is rolled back and
    ``0`` is returned instead of raising. A failure to obtain a connection in
    the first place is not caught and propagates to the caller.
    """

    def __new__(cls, *args, **kwargs):
        """Return the single shared instance, creating it on first use."""
        if not hasattr(cls, 'inst'):
            # object.__new__ accepts no extra arguments, so none are forwarded.
            cls.inst = super(MySqLHelper, cls).__new__(cls)
        return cls.inst

    def __init__(self):
        """Attach the connection source the first time the singleton is built."""
        # __init__ runs on every MySqLHelper() call even though __new__ hands
        # back the same object; do not replace the connection source each time.
        if not hasattr(self, 'db'):
            self.db = get_my_connection()

    @staticmethod
    def _has_params(param):
        """Tell whether *param* carries values to bind (not None / not empty)."""
        if param is None:
            return False
        if isinstance(param, (tuple, list, dict)) and not param:
            return False
        return True

    @staticmethod
    def _rollback_quietly(conn):
        """Roll back *conn*, reporting instead of raising if that fails too."""
        try:
            conn.rollback()
        except Exception as e:
            # The connection may already be unusable; the original error has
            # been reported by the caller, so only report this one as well.
            print("error_msg:", e.args)

    def _run(self, sql, param=None, fetch=None, many=False):
        """Run one statement on a borrowed connection and release it.

        :param sql: SQL statement, optionally with ``%s`` placeholders.
        :param param: values for the placeholders (a sequence of rows when
                      *many* is true).
        :param fetch: ``'one'`` or ``'all'`` to return fetched rows; ``None``
                      to return the affected-row count.
        :param many: use ``executemany`` instead of ``execute``.
        :return: the fetched rows or the row count; ``0`` if the statement
                 failed.
        """
        cursor, conn = self.db.getconn()
        try:
            if many:
                count = cursor.executemany(sql, param)
            elif self._has_params(param):
                count = cursor.execute(sql, param)
            else:
                count = cursor.execute(sql)
            if fetch == 'one':
                outcome = cursor.fetchone()
            elif fetch == 'all':
                outcome = cursor.fetchall()
            else:
                outcome = count if count is not None else 0
            conn.commit()
            return outcome
        except Exception as e:
            print("error_msg:", e.args)
            self._rollback_quietly(conn)
            return 0
        finally:
            # Always give the connection back to the pool.
            self.close(cursor, conn)

    # Low-level execution.
    def execute(self, sql, param=None, autoclose=False):
        """Execute one statement and commit it.

        Errors are reported and rolled back rather than raised, and the
        returned count is then ``0``.

        :param sql: SQL statement, e.g. ``"select * from tab where id=%s"``.
        :param param: values for the ``%s`` placeholders in *sql*.
        :param autoclose: release the cursor and connection before returning.
        :return: ``(cursor, conn, count)`` where *count* is the number of
                 affected rows. Unless *autoclose* is true the caller must
                 release the connection with :meth:`close`.
        """
        cursor, conn = self.db.getconn()
        count = 0
        try:
            if self._has_params(param):
                count = cursor.execute(sql, param)
            else:
                count = cursor.execute(sql)
            conn.commit()
        except Exception as e:
            print("error_msg:", e.args)
            self._rollback_quietly(conn)
            count = 0
        finally:
            if autoclose:
                self.close(cursor, conn)
        return cursor, conn, count

    # Release a connection.
    def close(self, cursor, conn):
        """Close *cursor* and hand *conn* back to the pool."""
        try:
            cursor.close()
        finally:
            conn.close()

    # Select every row.
    def selectall(self, sql, param=None):
        """Return all rows produced by *sql*, or ``0`` if the query failed."""
        return self._run(sql, param, fetch='all')

    # Select a single row.
    def selectone(self, sql, param=None):
        """Return the first row produced by *sql*, or ``0`` if the query failed."""
        return self._run(sql, param, fetch='one')

    # Select several rows.
    def selectMany(self, sql, param=None):
        """Return all rows produced by *sql*, or ``0`` if the query failed."""
        return self._run(sql, param, fetch='all')

    # Insert one row.
    def insertone(self, sql, param):
        """Run an insert and return the affected-row count (``0`` on failure)."""
        return self._run(sql, param)

    # Insert several rows.
    def insertmany(self, sql, param):
        """Run an insert once per parameter row.

        :param sql: insert statement with ``%s`` placeholders.
        :param param: sequence of parameter rows, e.g. ``[(), ()]``.
        :return: affected-row count reported by ``executemany``; ``0`` on
                 failure.
        """
        return self._run(sql, param, many=True)

    # Delete.
    def delete(self, sql, param=None):
        """Run a delete and return the affected-row count (``0`` on failure)."""
        return self._run(sql, param)

    # Update.
    def update(self, sql, param=None):
        """Run an update and return the affected-row count (``0`` on failure)."""
        return self._run(sql, param)

#
# if __name__ == '__main__':
#     db = MySqLHelper()
    # # 查询单条
    # sql1 = 'select * from userinfo where name=%s'
    # args = 'python'
    # ret = db.selectone(sql=sql1, param=args)
    # print(ret)  # (None, b'python', b'123456', b'0')
    # 增加单条
    # sql2 = 'insert into userinfo (name,password) VALUES (%s,%s)'
    # ret = db.insertone(sql2, ('old2','123456'))
    # print(ret)
    # 增加多条
    # sql3 = 'insert into userinfo (name,password) VALUES (%s,%s)'
    # li = li = [
    #     ('123', '456'),
    #     ('789','147')
    # ]
    # ret = db.insertmany(sql3,li)
    # print(ret)
    # 删除
    # sql4 = 'delete from  userinfo WHERE name=%s'
    # args = 'xxxx'
    # ret = db.delete(sql4, args)
    # print(ret)
    # 更新
    # sql5 = r'update userinfo set password=%s WHERE name LIKE %s'
    # args = ('123456789', '%old%')
    # ret = db.update(sql5, args)
    # print(ret)

先说明一下,上面操作数据库的代码是我从别人那里粘贴来的;原来的增删改查不太适合我,所以我在其基础上做了一些修改。原作者已经找不到了,所以这里没有给出原文链接。

你可能感兴趣的:(python多线程下载文件模板代码(自用))