配置文件中的前缀地址:
数据库中的后缀地址:
再给大家看一下目录结构:
这个文件除了数据库的相关配置,也存放了其他的一些配置。
# -*- coding: UTF-8 -*-
import pymysql
# Miscellaneous (non-database) settings
# Request headers: pool of User-Agent strings for outgoing HTTP requests
USER_AGENT_LIST=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Directory prefix under which downloaded files are stored
WENJIANDIZHI="/home/enterprise/staticrec/policy"
# TODO: a log-file setting probably belongs here as well
# Database connection settings
DB_TEST_HOST = "127.0.0.1"
DB_TEST_PORT = 3306
DB_TEST_DBNAME = "表名"
DB_TEST_USER = "user"
DB_TEST_PASSWORD = "pwd"
# Character set used for the database connection
#DB_CHARSET = "utf8mb4"
DB_CHARSET = "utf8"
# mincached: number of idle connections opened at startup (default 0: no connections are created at startup)
DB_MIN_CACHED = 10
# maxcached: maximum number of idle connections kept in the pool (default 0: unlimited pool size)
DB_MAX_CACHED = 20
# maxshared: maximum number of shared connections (default 0: every connection is dedicated); once the limit is reached, connections requested as shareable are shared
DB_MAX_SHARED = 20
# maxconnections: maximum number of connections the pool may create (default 0: no limit)
# NOTE: the constant name is misspelled ("CONNECYIONS") but is kept because the pool setup code reads it under this name
DB_MAX_CONNECYIONS = 100
# blocking: behaviour when the connection limit is reached (default 0/False: report an error; otherwise block until a connection becomes available)
DB_BLOCKING = True
# maxusage: maximum number of times a single connection may be reused (default 0/False: unlimited); when reached, the connection is automatically reset (closed and reopened)
DB_MAX_USAGE = 0
# setsession: optional list of SQL commands used to prepare each session, e.g. ["set datestyle to german", ...]
DB_SET_SESSION = None
# creator: the database module used to create connections
DB_CREATOR = pymysql
#其他的配置文件
from DBUtils.PooledDB import PooledDB
from down import db_config as config
"""
@功能:创建数据库连接池
"""
class MyConnectionPool(object):
    """Lazily created DBUtils connection pool shared by all instances.

    The ``PooledDB`` object is stored on the class, so every
    ``MyConnectionPool`` instance hands out connections from the same pool.
    An instance can also be used as a context manager::

        with MyConnectionPool() as db:
            db.cursor.execute("select 1")
    """

    # Class-level cache for the single PooledDB instance.
    __pool = None

    def __enter__(self):
        """Borrow a connection, open a cursor on it and return ``self``."""
        self.conn = self.__getconn()
        self.cursor = self.conn.cursor()
        # Returning self is required for ``with ... as db`` to bind the object.
        return self

    def __getconn(self):
        """Return a pooled connection, creating the pool on first use."""
        cls = MyConnectionPool
        # Assign on the class, not on ``self``: an instance attribute would
        # shadow the class-level cache and every instance would build its
        # own pool.
        if cls.__pool is None:
            cls.__pool = PooledDB(
                creator=config.DB_CREATOR,
                mincached=config.DB_MIN_CACHED,
                maxcached=config.DB_MAX_CACHED,
                maxshared=config.DB_MAX_SHARED,
                maxconnections=config.DB_MAX_CONNECYIONS,
                blocking=config.DB_BLOCKING,
                maxusage=config.DB_MAX_USAGE,
                setsession=config.DB_SET_SESSION,
                host=config.DB_TEST_HOST,
                port=config.DB_TEST_PORT,
                user=config.DB_TEST_USER,
                passwd=config.DB_TEST_PASSWORD,
                db=config.DB_TEST_DBNAME,
                # use_unicode=False,
                charset=config.DB_CHARSET,
            )
        return cls.__pool.connection()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the cursor and hand the connection back to the pool."""
        try:
            self.cursor.close()
        finally:
            # Return the connection even if closing the cursor failed.
            self.conn.close()

    def getconn(self):
        """Take a connection from the pool.

        :return: ``(cursor, conn)``; the caller must close both when done.
        """
        conn = self.__getconn()
        cursor = conn.cursor()
        return cursor, conn
# Factory for the connection-pool wrapper.
def get_my_connection():
    """Create and return a new ``MyConnectionPool`` instance."""
    pool = MyConnectionPool()
    return pool
from queue import Queue
from concurrent.futures import ThreadPoolExecutor
import requests
import random
import os
from down import db_config as config
import logging
import time
from down.mysqlhelper import MySqLHelper
# Demo of a first-in, first-out queue.
def test_queue():
    """Put 0..4 into a bounded FIFO queue and print them in insertion order."""
    fifo = Queue(10)
    # Ideally the queue size would be derived from the number of database rows.
    for number in range(5):
        fifo.put(number)
    while True:
        if fifo.empty():
            break
        print(fifo.get())
# Demo of a last-in, first-out queue (a stack).
def test_LifoQueue():
    """Push 1, 2, 3 onto a LIFO queue and print them as they are popped."""
    import queue

    stack = queue.LifoQueue(3)
    for item in (1, 2, 3):
        stack.put(item)
    for _ in range(3):
        print(stack.get())
# Demo of a priority queue.
def test_PriorityQueue():
    """Queue three (priority, value) pairs and print them lowest number first."""
    import queue

    # Priorities are numbers; the smaller the number, the higher the priority.
    prioritized = queue.PriorityQueue(3)
    for entry in ((10, 'a'), (-1, 'b'), (100, 'c')):
        prioritized.put(entry)
    for _ in range(3):
        print(prioritized.get())
# Prepare the target location: create the directory, then create the file.
def muluwenjian(filePath):
    """Open ``filePath`` for binary writing, creating parent directories.

    Any existing file at ``filePath`` is removed first so the caller always
    starts from a fresh, empty file.

    :param filePath: destination path of the file to be written
    :return: a file object opened in ``'wb'`` mode; the caller must close it
    """
    # os.path.dirname copes with paths that have no directory part, where
    # slicing at rfind('/') == -1 would chop the last character off the name.
    folder = os.path.dirname(filePath)
    if folder:
        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs(folder, exist_ok=True)
    try:
        os.remove(filePath)
    except FileNotFoundError:
        pass
    return open(filePath, 'wb')
def _set_crawl_status(record_id, status):
    """Store ``status`` in ``crawl_status`` for the row with id ``record_id``.

    :raises Exception: when the update fails, with the statement in the message
    """
    try:
        # Parameterized so the driver does the quoting instead of str.format.
        db.update(
            'update t_policy_file_crawl set crawl_status=%s where id=%s',
            (status, str(record_id)),
        )
    except Exception as err:
        raise Exception(
            '数据库插入失败--' + str(err)
            + 'update t_policy_file_crawl set crawl_status={} where id="{}"'.format(status, str(record_id))
        ) from err


def download(mes, timeout=30):
    """Download one file and record the outcome in the database.

    Failures are logged and never propagate, so one bad row cannot stop the
    other downloads.

    :param mes: sequence ``(id, file_url, file_location)`` describing one row
    :param timeout: seconds to wait for the server before giving up
    :return: None
    """
    # Pre-bind so the failure log below works even if unpacking ``mes`` fails.
    url = None
    filePath = None
    try:
        record_id = mes[0]
        url = mes[1]
        filePath = str(config.WENJIANDIZHI + mes[2])
        # random.choice covers every entry; randint(1, 18) skipped index 0
        # and raised IndexError for index 18 on the 18-element list.
        header = {
            "User-Agent": random.choice(config.USER_AGENT_LIST)
        }
        try:
            res = requests.get(url, headers=header, timeout=timeout)
            # Random pause between requests.
            time.sleep(random.randint(3, 6))
        except Exception as request_error:
            _set_crawl_status(record_id, "2")
            raise Exception("这里应该是我们的url链接出现了问题\r\n异常原因" + str(request_error)) from request_error
        if res.status_code != 200:
            raise Exception('我们的响应状态码不对。这里报错')
        # The file is only opened after a successful response, and the
        # with-block closes it even when writing fails.
        with muluwenjian(filePath) as fp:
            fp.write(res.content)
            fp.flush()
        logging.debug('成功了一个:' + filePath)
        _set_crawl_status(record_id, "1")
    except Exception as reason:
        logging.debug('下载失败')
        logging.debug('url:' + str(url))
        logging.debug('path:' + str(filePath))
        logging.debug(reason)
    finally:
        logging.debug('----------------')
def btostr(bss):
    """Return the rows of ``bss`` as lists with bytes values decoded as UTF-8.

    :param bss: iterable of rows, each row an iterable of values
    :return: list of lists; non-bytes values are passed through unchanged
    """
    return [
        [cell.decode('utf-8') if isinstance(cell, bytes) else cell for cell in row]
        for row in bss
    ]
if __name__ == '__main__':
    # Script entry point: set up logging, load pending rows, download them.
    LOG_FORMAT = "%(asctime)s %(name)s %(levelname)s %(pathname)s %(message)s "  # log record layout
    DATE_FORMAT = '%Y-%m-%d %H:%M:%S %a '  # timestamp layout; mind the order of month and day
    LOG_FILE = r"logs/down.log"
    # basicConfig opens the log file immediately and fails when its
    # directory does not exist, so create it first.
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    logging.basicConfig(level=logging.DEBUG,
                        format=LOG_FORMAT,
                        datefmt=DATE_FORMAT,
                        filemode='a',  # 'a' appends to the log file; 'w' would start a new file on every run
                        filename=LOG_FILE  # with filename set, records go to the file instead of the console
                        )
    db = MySqLHelper()
    # crawl_status marks download progress: 0 = not downloaded, 1 = downloaded.
    # Select the rows that still have to be downloaded.
    res = db.selectMany('select id,file_url,file_location from t_policy_file_crawl where crawl_status=0')
    if not res:
        # selectMany returns 0 on failure and an empty result when nothing is
        # pending; neither can be passed on to btostr.
        logging.debug('no pending downloads found')
    else:
        res = btostr(res)
        with ThreadPoolExecutor(max_workers=3) as executer:
            # Download with several threads: arg 1 is the worker, arg 2 the rows.
            # NOTE(review): only the first three rows are processed; switch to
            # the commented call below to download everything - confirm intent.
            executer.map(download, res[0:3])
            #executer.map(download, res)
from down.db_dbutils_init import get_my_connection
"""执行语句查询有结果返回结果没有返回0;增/删/改返回变更数据条数,没有返回0"""
class MySqLHelper(object):
    """Singleton helper that runs SQL statements on pooled connections.

    Query helpers return the fetched rows and write helpers return the
    affected-row count reported by the cursor. Every helper prints the error
    and returns ``0`` instead of raising when the statement fails, and always
    hands its connection back to the pool.
    """

    def __new__(cls, *args, **kwargs):
        # Singleton. object.__new__ takes no extra arguments, so they are
        # deliberately not forwarded.
        if not hasattr(cls, 'inst'):
            cls.inst = super(MySqLHelper, cls).__new__(cls)
        return cls.inst

    def __init__(self):
        self.db = get_my_connection()  # wrapper around the connection pool

    @staticmethod
    def _has_params(param):
        """Return True unless ``param`` is None or an empty container."""
        # A plain truthiness test would also drop valid values such as 0.
        return param is not None and param not in ((), [], {})

    def _execute(self, cursor, sql, param=None):
        """Run ``sql`` on ``cursor``, passing ``param`` only when present."""
        if self._has_params(param):
            return cursor.execute(sql, param)
        return cursor.execute(sql)

    @staticmethod
    def _rollback(conn):
        """Roll back ``conn``; print instead of raising if that fails too."""
        try:
            conn.rollback()
        except Exception as e:
            print("error_msg:", e.args)

    def _run(self, sql, param=None, fetch=None, many=False):
        """Execute one statement on a fresh connection and release it.

        :param fetch: ``'one'``, ``'all'`` or None (return the row count)
        :param many: use ``executemany`` with ``param`` as the row sequence
        :return: fetched rows or row count; ``0`` when the statement fails
        """
        cursor, conn = self.db.getconn()
        try:
            if many:
                count = cursor.executemany(sql, param)
            else:
                count = self._execute(cursor, sql, param)
            if fetch == 'one':
                result = cursor.fetchone()
            elif fetch == 'all':
                result = cursor.fetchall()
            else:
                result = count or 0
            conn.commit()
            return result
        except Exception as e:
            print("error_msg:", e.args)
            self._rollback(conn)
            return 0
        finally:
            self.close(cursor, conn)

    # Low-level execution
    def execute(self, sql, param=None, autoclose=False):
        """Execute ``sql`` and commit.

        :param sql: SQL statement string
        :param param: values for the ``%s`` placeholders in ``sql``
        :param autoclose: release the connection before returning
        :return: ``(cursor, conn, count)``; ``count`` is the affected-row
            count, or 0 when the statement failed. Unless ``autoclose`` is
            set, the caller must release the pair with :meth:`close`.
        """
        cursor, conn = self.db.getconn()  # take a connection from the pool
        count = 0
        try:
            count = self._execute(cursor, sql, param)
            conn.commit()
        except Exception as e:
            # Kept non-raising for existing callers, but no longer silent.
            print("error_msg:", e.args)
            self._rollback(conn)
        if autoclose:
            self.close(cursor, conn)
        return cursor, conn, count

    # Release a connection
    def close(self, cursor, conn):
        """Close the cursor and return the connection to the pool."""
        try:
            cursor.close()
        finally:
            conn.close()

    # Fetch all rows
    def selectall(self, sql, param=None):
        """Return all rows produced by ``sql``, or 0 on failure."""
        return self._run(sql, param, fetch='all')

    # Fetch a single row
    def selectone(self, sql, param=None):
        """Return the first row produced by ``sql``, or 0 on failure."""
        return self._run(sql, param, fetch='one')

    # Fetch multiple rows
    def selectMany(self, sql, param=None):
        """Return all rows produced by ``sql``, or 0 on failure."""
        return self._run(sql, param, fetch='all')

    # Insert one row
    def insertone(self, sql, param):
        """Run an insert statement; return the row count, or 0 on failure."""
        return self._run(sql, param)

    # Insert several rows
    def insertmany(self, sql, param):
        """Run ``sql`` once per parameter row.

        :param sql: insert statement with placeholders
        :param param: list or tuple of rows, e.g. ``[(), ()]`` or ``((), ())``
        :return: row count reported by ``executemany``, or 0 on failure
        """
        return self._run(sql, param, many=True)

    # Delete
    def delete(self, sql, param=None):
        """Run a delete statement; return the row count, or 0 on failure."""
        return self._run(sql, param)

    # Update
    def update(self, sql, param=None):
        """Run an update statement; return the row count, or 0 on failure."""
        return self._run(sql, param)
#
# if __name__ == '__main__':
# db = MySqLHelper()
# # 查询单条
# sql1 = 'select * from userinfo where name=%s'
# args = 'python'
# ret = db.selectone(sql=sql1, param=args)
# print(ret) # (None, b'python', b'123456', b'0')
# 增加单条
# sql2 = 'insert into userinfo (name,password) VALUES (%s,%s)'
# ret = db.insertone(sql2, ('old2','123456'))
# print(ret)
# 增加多条
# sql3 = 'insert into userinfo (name,password) VALUES (%s,%s)'
# li = li = [
# ('123', '456'),
# ('789','147')
# ]
# ret = db.insertmany(sql3,li)
# print(ret)
# 删除
# sql4 = 'delete from userinfo WHERE name=%s'
# args = 'xxxx'
# ret = db.delete(sql4, args)
# print(ret)
# 更新
# sql5 = r'update userinfo set password=%s WHERE name LIKE %s'
# args = ('123456789', '%old%')
# ret = db.update(sql5, args)
# print(ret)
先说明一下:上面查询数据库的代码是我从别人那里粘贴来的,不过原作者的增删改查并不完全适合我的需求,所以我在他的代码基础上做了一些修改。原作者已经找不到了,所以这里没有附上原文链接。