# -*- coding: utf-8 -*-
# @Time : 2019/8/14 14:42
# @Author :
# common包下的 py_ac.py
from collections import defaultdict, deque
class TrieNode(object):
    """One node of the trie that backs the Aho-Corasick automaton."""

    def __init__(self, value=None):
        # The element stored on the edge into this node, and the fail link
        # (unset until the automaton is built).
        self.value, self.fail = value, None
        # 1-based index of the pattern that ends at this node; 0 means that
        # no pattern ends here.
        self.tail = 0
        # Child nodes, keyed by their value: {value: TrieNode}.
        self.children = {}
class Trie(object):
    """Aho-Corasick multi-pattern matcher built on a trie.

    All patterns are inserted into the trie and the fail links are built
    once in the constructor; ``search`` then scans a text in a single pass.
    """

    def __init__(self, words):
        """Build the automaton for every pattern in *words*.

        :param words: iterable of pattern strings (any iterable is accepted,
            including one-shot iterators)
        """
        # Root node of the trie.
        self.root = TrieNode()
        # Number of patterns inserted so far.
        self.count = 0
        # Patterns in insertion order; ``tail - 1`` indexes into this list.
        # It is filled by insert() so it always stays in step with the tail
        # ids, whatever kind of iterable was passed in.
        self.words = []
        for word in words:
            self.insert(word)
        self.ac_automation()

    def insert(self, sequence):
        """Insert one pattern into the trie.

        Fail links are NOT refreshed here; call ``ac_automation()`` again
        before searching if patterns are added after construction.

        :param sequence: the pattern string
        :return: None
        """
        self.count += 1
        self.words.append(sequence)
        node = self.root
        for item in sequence:
            child = node.children.get(item)
            if child is None:
                child = TrieNode(value=item)
                node.children[item] = child
            node = child
        # Mark the end of the pattern with its 1-based id.
        node.tail = self.count

    def ac_automation(self):
        """Build the fail links with a breadth-first walk over the trie.

        :return: None
        """
        queue = deque([self.root])
        while queue:
            node = queue.popleft()
            for child in node.children.values():
                if node is self.root:
                    # Children of the root always fall back to the root.
                    child.fail = self.root
                else:
                    # Follow the parent's fail chain until some node has a
                    # child with the same value; the root's fail is None,
                    # which ends the walk.
                    fallback = node.fail
                    while (fallback is not None
                           and child.value not in fallback.children):
                        fallback = fallback.fail
                    if fallback is None:
                        child.fail = self.root
                    else:
                        child.fail = fallback.children[child.value]
                queue.append(child)

    def search(self, text):
        """Find every occurrence of every pattern in *text*.

        :param text: the text to scan
        :return: ``defaultdict(list)`` mapping each matched pattern to a list
            of ``(start, end)`` index pairs, both inclusive
        """
        root = self.root
        node = root
        rst = defaultdict(list)
        for i, single_char in enumerate(text):
            # Fall back along the fail links until the character can be
            # consumed or the root is reached.
            while single_char not in node.children and node is not root:
                node = node.fail
            # Either advance to the matching child or restart at the root.
            node = node.children.get(single_char, root)
            # Walk the fail chain so that patterns which are suffixes of the
            # current path are reported as well.
            hit = node
            while hit is not root:
                if hit.tail:
                    word = self.words[hit.tail - 1]
                    # The match ends at i, so its start follows from the
                    # pattern length; this stays correct for matches reached
                    # through fail links.
                    rst[word].append((i - len(word) + 1, i))
                hit = hit.fail
        return rst
def main():
    """Demo: locate one phrase in a sample paragraph and print the matches."""
    patterns = ["air rifle scopes"]
    sample = """Choosing from among the dozens of air rifle scopes. A scope's job is to magnify an image
Air rifle scopes are built to withstand the double recoil and the vibrations of the rifle.
Variable air rifle scopes operate differently than the fixed scope.
Air rifle scopes are an important addition to a shooter's air rifle and can really make a difference in the accuracy and quality of the shooting experience. Knowing what the differences are between air rifle scopes helps a shooter determine which will work best with their air rifle and their individual shooting style."""
    matcher = Trie(patterns)
    # The pattern is lower-case, so the text is lower-cased before matching.
    matches = matcher.search(sample.lower())
    print(dict(matches))


if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
# @Time : 2019/7/31 15:52
# @Author :
# common包中的 __init__.py
import os
import json
def set_json_data(filename, data):
    """Write *data* to *filename* as JSON, creating parent directories.

    :param filename: path of the JSON file to (over)write
    :param data: any JSON-serialisable value
    :return: None
    """
    parent = os.path.dirname(filename)
    # dirname() is '' for a bare file name and makedirs('') raises, so only
    # create directories when there is a directory part. exist_ok avoids a
    # failure when the directory appears between a check and the creation.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f)
def get_json_data(filename):
    """Load and return the JSON value stored in *filename*.

    A missing file is first created with the value ``0``, so the first read
    of a new file returns ``0``.
    """
    if not os.path.exists(filename):
        set_json_data(filename, data=0)
    with open(filename, 'r', encoding='utf-8') as handle:
        return json.load(handle)
def get_id(filename):
    """Return the id stored for *filename*.

    The value lives in ``store/<basename>.json`` next to this module, where
    the basename is that of *filename* with ``.py`` replaced by ``.json``.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    json_name = os.path.basename(filename).replace('.py', '.json')
    store_path = '{}/store/{}'.format(module_dir, json_name)
    return get_json_data(store_path)
def set_id(filename, last_pid):
    """Store *last_pid* as the id for *filename*.

    The value is written to ``store/<basename>.json`` next to this module,
    where the basename is that of *filename* with ``.py`` replaced by
    ``.json``.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    json_name = os.path.basename(filename).replace('.py', '.json')
    store_path = '{}/store/{}'.format(module_dir, json_name)
    set_json_data(store_path, last_pid)
if __name__ == '__main__':
    # Helper module only: nothing happens when it is run as a script.
    pass
# -*- coding: utf-8 -*-
# @author:六柒
# @time :2019-09-06 08:36:03
from traceback import format_exc
import eventlet
import pymysql
from DBUtils.PooledDB import PooledDB
from pymysql import escape_string
class MysqlHelper(object):
    """Read/write helper for MySQL on top of a PyMySQL connection pool.

    The pool is created lazily on the first ``connect()`` call and reused
    afterwards. Every query borrows a connection and releases it when done.
    Errors are printed rather than raised: write methods report ``False``
    and ``get_all`` returns an empty list.
    """

    # Connection pool, created by connect() on first use. Declared on the
    # class so the attribute exists even before any connection is made.
    __pool = None

    def __init__(self, host='localhost', user='root', password='123456789',
                 database='adwords', port=3306,
                 charset='utf8'):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port
        self.charset = charset

    def connect(self):
        """Borrow a connection from the pool and open a dict cursor on it.

        The pool is built once and then reused, so repeated calls share the
        same set of connections instead of opening new ones each time.

        :return: ``(conn, cursor)``; the caller should close both when done
        """
        if self.__pool is None:
            self.__pool = PooledDB(
                creator=pymysql,  # module used to create the connections
                maxconnections=6,  # max connections allowed (0/None = unlimited)
                mincached=2,  # idle connections opened at start-up (0 = none)
                maxcached=5,  # max idle connections kept (0/None = unlimited)
                # Max shared connections (0/None = share all). The original
                # note says this has no effect with pymysql because its
                # threadsafety level is 1 -- TODO confirm.
                maxshared=3,
                blocking=True,  # wait for a free connection instead of failing
                maxusage=None,  # reuse limit per connection (None = unlimited)
                setsession=[],  # commands run at session start
                # When to ping the server: 0 = never, 1 = when requested,
                # 2 = when a cursor is created, 4 = when a query is executed,
                # 7 = always.
                ping=0,
                host=self.host,
                port=self.port,
                user=self.user,
                password=self.password,
                database=self.database,
                charset=self.charset,
                autocommit=True
            )
        conn = self.__pool.connection()
        cursor = conn.cursor(pymysql.cursors.DictCursor)
        return conn, cursor

    @staticmethod
    def __release(conn, cursor):
        """Close the cursor and hand the connection back; never raises."""
        for resource in (cursor, conn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as e:
                print(e)

    def __literal(self, value):
        """Render *value* as an SQL literal for statements written to a file."""
        if value is None:
            return 'NULL'
        if isinstance(value, bool):
            return str(int(value))
        if isinstance(value, (int, float)):
            return str(value)
        return "'%s'" % self.escape(str(value))

    def __edit(self, sql, params=None):
        """Run a write statement, retrying for as long as it times out.

        :return: True once the statement has been committed, False if it
            raised an exception (the exception is printed)
        """
        while True:
            conn = cursor = None
            try:
                conn, cursor = self.connect()
                committed = False
                # Timeout(30, False) leaves the block silently after 30
                # seconds; in that case committed stays False and we retry.
                with eventlet.Timeout(30, False):
                    conn.ping(reconnect=True)
                    cursor.execute(sql, params)
                    conn.commit()
                    committed = True
                if committed:
                    return True
            except Exception as e:
                print(e)
                return False
            finally:
                # Always give the connection back, including on retry.
                self.__release(conn, cursor)

    def execute(self, sql, params=None):
        """Execute a write statement.

        :param sql: statement, optionally with ``%s`` placeholders
        :param params: values for the placeholders, or None
        :return: True on success, False on error
        """
        return self.__edit(sql, params=params)

    def escape(self, m_str):
        """Escape a string for use inside a quoted MySQL literal.

        :param m_str: the raw string
        :return: the escaped string, without surrounding quotes
        """
        # converters.escape_string is present both before and after
        # PyMySQL 1.0, which removed the top-level pymysql.escape_string.
        return pymysql.converters.escape_string(m_str)

    # Read
    def get_all(self, sql, params=()):
        """Run a query and return every row.

        :param sql: query, optionally with ``%s`` placeholders
        :param params: values for the placeholders
        :return: the fetched rows, or ``[]`` if the query failed (the error
            is printed)
        """
        conn = cursor = None
        try:
            conn, cursor = self.connect()
            cursor.execute(sql, params)
            rows = cursor.fetchall()
        except Exception as e:
            rows = []
            print(e)
        finally:
            self.__release(conn, cursor)
        return rows

    def get_last_id(self):
        """Return the result of ``SELECT LAST_INSERT_ID();``.

        NOTE(review): MySQL tracks LAST_INSERT_ID() per connection, and this
        query runs on whichever connection the pool hands out, so the value
        is presumably not tied to an insert made through ``execute`` --
        confirm before relying on it.

        :return: the rows returned by ``get_all``
        """
        return self.get_all('SELECT LAST_INSERT_ID();')

    # Create
    def just_insert(self, table, **kwargs):
        """Insert one row using a parameterised statement.

        :param table: table name
        :param kwargs: column=value pairs
        :return: True on success, False on error
        """
        keys = ','.join(kwargs.keys())
        values = ','.join(['%s'] * len(kwargs))
        sql = 'INSERT INTO {table}({keys})values ({values})'.format(table=table, keys=keys, values=values)
        return self.execute(sql, tuple(kwargs.values()))

    def file_insert(self, table, **kwargs):
        """Append an ``INSERT ignore`` statement to ``<table>.sql``.

        NOTE: values are rendered with ``%r`` (Python repr), which is kept
        for compatibility with existing output.

        :param table: table name, also used as the file name stem
        :param kwargs: column=value pairs
        :return: None
        """
        keys = ','.join(kwargs.keys())
        values = ','.join(['%r'] * len(kwargs))
        sql = 'INSERT ignore INTO {table}({keys})values ({values});'.format(table=table, keys=keys, values=values)
        with open('%s.sql' % table, 'a', encoding='utf-8') as f:
            f.write(sql % tuple(kwargs.values()) + '\n')
        print('file_insert_success')

    # Update
    def update(self, sql):
        """Execute a ready-made UPDATE statement.

        :param sql: the complete statement
        :return: True on success, False on error
        """
        result = self.execute(sql)
        print('update:', result)
        return result

    # Insert, or update when the key already exists
    def insert_or_update(self, table, **kwargs):
        """Insert a row, or update it when its key already exists.

        Values are passed as query parameters, so any type the driver can
        convert is accepted and quoting is done by the driver.

        :param table: table name
        :param kwargs: column=value pairs
        :return: True on success, False on error
        """
        columns = list(kwargs)
        sql = 'INSERT INTO {table}({keys})values ({values})ON DUPLICATE KEY UPDATE {update};'.format(
            table=table,
            keys=','.join(columns),
            values=','.join(['%s'] * len(columns)),
            update=','.join('{}= %s'.format(column) for column in columns),
        )
        # The values appear twice: once for VALUES, once for the UPDATE part.
        params = tuple(kwargs.values()) * 2
        result = self.execute(sql, params)
        print('mysql_insert_or_update:', result)
        return result

    def file_insert_or_update(self, table, **kwargs):
        """Append an ``INSERT ... ON DUPLICATE KEY UPDATE`` to ``<table>.sql``.

        Strings are escaped and quoted, numbers are written as they are and
        ``None`` becomes ``NULL``.

        :param table: table name, also used as the file name stem
        :param kwargs: column=value pairs
        :return: the string ``'Successful'``
        """
        columns = list(kwargs)
        literals = [self.__literal(value) for value in kwargs.values()]
        sql = 'INSERT INTO {table}({keys})values ({values})ON DUPLICATE KEY UPDATE {update};'.format(
            table=table,
            keys=','.join(columns),
            values=','.join(literals),
            update=','.join(
                '{}= {}'.format(column, literal)
                for column, literal in zip(columns, literals)
            ),
        )
        with open('%s.sql' % table, 'a', encoding='utf-8') as f:
            f.write(sql + '\n')
        return 'Successful'
# Reference: https://blog.csdn.net/danengbinggan33/article/details/83338789