ac自动机

ac自动机

# -*- coding: utf-8 -*-
# @Time    : 2019/8/14 14:42
# @Author  :
# common包下的 py_ac.py

from collections import defaultdict


class TrieNode:
    """One node of the pattern trie.

    :param value: the symbol stored in this node (``None`` by default).
    """

    def __init__(self, value=None):
        # Failure link; unset on creation.
        self.fail = None
        # Child nodes keyed by their symbol: {value: TrieNode}.
        self.children = dict()
        # End-of-pattern marker: i means the i-th pattern ends here, 0 means none.
        self.tail = 0
        # Symbol carried by this node.
        self.value = value


class Trie(object):
    """Aho-Corasick multi-pattern matcher built on a trie with failure links.

    :param words: indexable collection of patterns. It is kept by reference in
        ``self.words`` and is used to map a node's ``tail`` number back to the
        pattern that ends there.
    """

    def __init__(self, words):
        # Root of the trie; it carries no symbol.
        self.root = TrieNode()
        # Number of patterns inserted so far (also the next pattern's tail id).
        self.count = 0
        self.words = words
        for word in words:
            self.insert(word)
        self.ac_automation()

    def insert(self, sequence):
        """Add one pattern to the trie.

        Failure links are NOT refreshed here; call ``ac_automation()`` again
        after inserting patterns on an already built automaton.

        :param sequence: the pattern, an iterable of hashable symbols.
        :return: None
        """
        self.count += 1
        node = self.root
        for symbol in sequence:
            child = node.children.get(symbol)
            if child is None:
                child = TrieNode(value=symbol)
                node.children[symbol] = child
            node = child
        # Mark the last node with the 1-based number of this pattern.
        node.tail = self.count

    def ac_automation(self):
        """Compute the failure link of every node with a breadth-first walk.

        :return: None
        """
        # Imported locally so the method does not depend on module imports.
        from collections import deque

        root = self.root
        # deque gives O(1) pops from the left; list.remove(queue[0]) is O(n).
        pending = deque([root])
        while pending:
            parent = pending.popleft()
            for child in parent.children.values():
                if parent is root:
                    # Direct children of the root always fall back to the root.
                    child.fail = root
                else:
                    # Follow the parent's failure chain until a node that can
                    # also consume this child's symbol is found.
                    probe = parent.fail
                    while probe is not None and child.value not in probe.children:
                        probe = probe.fail
                    # Chain exhausted: no proper suffix continues with this
                    # symbol, so fall back to the root.
                    child.fail = root if probe is None else probe.children[child.value]
                pending.append(child)

    def search(self, text):
        """Find every occurrence of every pattern in ``text``.

        :param text: the sequence to scan (normally a long string).
        :return: ``defaultdict(list)`` mapping each matched pattern to a list
            of ``(start, end)`` index pairs, both inclusive.
        """
        root = self.root
        node = root
        matches = defaultdict(list)
        for end, symbol in enumerate(text):
            # On a mismatch fall back along failure links until the symbol can
            # be consumed or the root is reached.
            while node is not root and symbol not in node.children:
                node = node.fail
            node = node.children.get(symbol, root)
            # Walk the failure chain so patterns that are suffixes of the
            # current path are reported as well.
            hit = node
            while hit is not root:
                if hit.tail:
                    # tail is 1-based, self.words is 0-based.
                    word = self.words[hit.tail - 1]
                    # Derive the start from the pattern length. A start index
                    # tracked while scanning is wrong for matches reached via a
                    # failure link or ending inside a longer pattern.
                    matches[word].append((end - len(word) + 1, end))
                hit = hit.fail
        return matches


def main():
    """Demo: build a matcher for one phrase and print where it occurs in a sample text."""
    patterns = ["air rifle scopes"]
    corpus = """Choosing from among the dozens of air rifle scopes. A scope's job is to magnify an image
 Air rifle scopes are built to withstand the double recoil and the vibrations of the rifle. 
 Variable air rifle scopes operate differently than the fixed scope. 
 Air rifle scopes are an important addition to a shooter's air rifle and can really make a difference in the accuracy and quality of the shooting experience. Knowing what the differences are between air rifle scopes helps a shooter determine which will work best with their air rifle and their individual shooting style."""
    matcher = Trie(patterns)
    # The pattern is lower-case, so the text is lower-cased before matching.
    hits = matcher.search(corpus.lower())
    print(dict(hits))


if __name__ == "__main__":
    main()

记录下标(把最近处理到的 id 保存到 JSON 文件中,便于下次继续)

# -*- coding: utf-8 -*-
# @Time    : 2019/7/31 15:52
# @Author  :
# common包中的 __init__.py

import os
import json


def set_json_data(filename, data):
    """Serialize ``data`` as JSON and write it to ``filename``.

    Missing parent directories are created first. An existing file is
    overwritten.

    :param filename: path of the JSON file to write.
    :param data: any object ``json.dumps`` can serialize.
    :raises TypeError: if ``data`` is not JSON-serializable.
    :raises OSError: if the directory or the file cannot be created or written.
    """
    parent = os.path.dirname(filename)
    # dirname() is '' for a bare file name and os.makedirs('') raises, so only
    # create a directory when there is one. exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Serialize before opening so a serialization error cannot truncate an
    # existing file.
    payload = json.dumps(data)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(payload)


def get_json_data(filename):
    """Read ``filename`` and return its decoded JSON content.

    When the file does not exist yet it is first created through
    ``set_json_data`` with the value ``0``, so the first read returns ``0``.

    :param filename: path of the JSON file.
    :return: the decoded JSON value.
    """
    if not os.path.exists(filename):
        set_json_data(filename, data=0)
    with open(filename, 'r', encoding='utf-8') as source:
        raw = source.read()
    return json.loads(raw)


def get_id(filename):
    """Return the id stored for the script ``filename``.

    The value is read from ``store/<basename>.json`` next to this module,
    where ``<basename>`` is the base name of ``filename`` with every ``.py``
    replaced by ``.json``.

    :param filename: path of the script whose stored id is wanted.
    :return: whatever ``get_json_data`` returns for that store file.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    store_name = os.path.basename(filename).replace('.py', '.json')
    return get_json_data(module_dir + '/store/' + store_name)


def set_id(filename,last_pid):
    """Store ``last_pid`` as the id for the script ``filename``.

    The value is written to ``store/<basename>.json`` next to this module,
    where ``<basename>`` is the base name of ``filename`` with every ``.py``
    replaced by ``.json``.

    :param filename: path of the script the id belongs to.
    :param last_pid: the value to persist (passed on to ``set_json_data``).
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    store_name = os.path.basename(filename).replace('.py', '.json')
    set_json_data(module_dir + '/store/' + store_name, last_pid)


if __name__ == '__main__':
    # No-op when this module is run as a script.
    pass

mysql封装(基于 pymysql 与 DBUtils 连接池的读写辅助类)

# -*- coding: utf-8 -*-
# @author:六柒
# @time  :2019-09-06 08:36:03
from traceback import format_exc
import eventlet
import pymysql
from DBUtils.PooledDB import PooledDB
from pymysql import escape_string






class MysqlHelper(object):
    """Small read/write helper around a pooled pymysql connection.

    One ``PooledDB`` pool is created lazily per helper instance and reused by
    every call. Each internal operation borrows a connection and gives it back
    when finished.
    """

    def __init__(self, host='localhost', user='root', password='123456789',
                 database='adwords', port=3306,
                 charset='utf8'):
        # NOTE(review): the default credentials are hard-coded; pass real ones
        # explicitly in any non-local setup.
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port
        self.charset = charset
        # Connection pool, built on the first connect() call.
        self.__pool = None

    def connect(self):
        """Borrow a connection from the pool and open a dict cursor on it.

        The pool is created on the first call only. Building a new pool on
        every call would open fresh connections each time and never reuse
        them.

        :return: ``(conn, cursor)``. The caller should close both when done;
            closing ``conn`` returns it to the pool.
        """
        if self.__pool is None:
            self.__pool = PooledDB(
                creator=pymysql,  # DB-API module used to open connections
                maxconnections=6,  # max connections allowed; 0/None = unlimited
                mincached=2,  # idle connections opened at start-up; 0 = none
                maxcached=5,  # max idle connections kept; 0/None = unlimited
                maxshared=3,
                # max shared connections; 0/None = share all.
                # NOTE(review): reportedly has no effect with pymysql
                # (threadsafety 1) — confirm against the DBUtils docs.
                blocking=True,  # True: wait for a free connection; False: raise
                maxusage=None,  # reuse limit per connection; None = unlimited
                setsession=[],  # SQL commands run at session start
                ping=0,
                # when to ping the server: 0 = None = never, 1 = default =
                # whenever requested, 2 = when a cursor is created,
                # 4 = when a query is executed, 7 = always
                host=self.host,
                port=self.port,
                user=self.user,
                password=self.password,
                database=self.database,
                charset=self.charset,
                autocommit=True
            )
        conn = self.__pool.connection()
        cursor = conn.cursor(pymysql.cursors.DictCursor)
        return conn, cursor

    @staticmethod
    def _release(conn, cursor):
        """Close ``cursor`` and ``conn`` (either may be None); close errors are only printed."""
        for resource in (cursor, conn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as e:
                print(e)

    @staticmethod
    def _sql_literal(value):
        """Render ``value`` as a SQL literal for statements written to a file."""
        if value is None:
            return 'NULL'
        if isinstance(value, (int, float)):
            # Numbers (bool included) are written unquoted.
            return str(value)
        return "'%s'" % escape_string(value)

    def __edit(self, sql, params=None):
        """Run one write statement, retrying for as long as it times out.

        :return: True once the statement was executed and committed, False if
            any exception was raised (the exception is printed).
        """
        while True:
            conn = cursor = None
            try:
                conn, cursor = self.connect()
                finished = False
                # Silent timeout: leaving the block early keeps finished False.
                # NOTE(review): eventlet.Timeout presumably only interrupts
                # code that yields to the eventlet hub — confirm the socket
                # layer is monkey-patched.
                with eventlet.Timeout(30, False):
                    conn.ping(reconnect=True)
                    cursor.execute(sql, params)
                    conn.commit()
                    finished = True

                if finished:
                    return True
                # Timed out: loop and try again on a fresh connection.
            except Exception as e:
                print(e)
                return False
            finally:
                # Always give the connection back, also on return and retry.
                self._release(conn, cursor)

    def execute(self, sql, params=None):
        """Execute a write statement.

        :param sql: SQL text, optionally with ``%s`` placeholders.
        :param params: values for the placeholders, or None.
        :return: True on success, False on error.
        """
        return self.__edit(sql, params=params)

    def escape(self, m_str):
        """Escape a string for use inside a MySQL string literal.

        :param m_str: the raw string.
        :return: the escaped string (without surrounding quotes).
        """
        new_str = pymysql.escape_string(m_str)
        return new_str

    # Read
    def get_all(self, sql, params=()):
        """Run a query and return all rows.

        :param sql: SQL text, optionally with ``%s`` placeholders.
        :param params: values for the placeholders.
        :return: the fetched rows (dicts, from the DictCursor), or ``[]`` if
            an exception was raised (the exception is printed).
        """
        conn = cursor = None
        try:
            conn, cursor = self.connect()
            cursor.execute(sql, params)
            r = cursor.fetchall()

        except Exception as e:
            r = []
            print(e)

        finally:
            self._release(conn, cursor)

        return r

    def get_last_id(self):
        """Query ``LAST_INSERT_ID()``.

        NOTE(review): LAST_INSERT_ID() is scoped to one connection, and this
        query borrows its own connection from the pool. The result is only
        meaningful if that is the connection that ran the insert — confirm
        before relying on it.

        :return: the rows returned by ``get_all``.
        """
        result = self.get_all('SELECT LAST_INSERT_ID();')
        return result

    # Create
    def just_insert(self, table, **kwargs):
        """Insert one row using a parameterized statement.

        :param table: table name (inserted into the SQL text as is).
        :param kwargs: column name -> value.
        :return: True on success, False on error.
        """
        keys = ','.join(kwargs.keys())
        values = ','.join(['%s'] * len(kwargs))
        sql = 'INSERT INTO {table}({keys})values ({values})'.format(table=table, keys=keys, values=values)
        result = self.execute(sql, tuple(kwargs.values()))
        return result

    def file_insert(self, table, **kwargs):
        """Append an ``INSERT ignore`` statement to ``<table>.sql`` instead of running it.

        Values are rendered with ``repr()``, which is not real SQL escaping;
        only use this for trusted, simple values.

        :param table: table name; also the base name of the output file.
        :param kwargs: column name -> value.
        """
        keys = ','.join(kwargs.keys())
        values = ','.join(['%r'] * len(kwargs))
        sql = 'INSERT ignore INTO {table}({keys})values ({values});'.format(table=table, keys=keys, values=values)
        with open('%s.sql' % table, 'a', encoding='utf-8') as f:
            f.write(sql % tuple(kwargs.values()) + '\n')
        print('file_insert_success')

    # Update
    def update(self, sql):
        """Execute a ready-made UPDATE statement and print the outcome.

        :param sql: complete SQL text.
        :return: True on success, False on error.
        """
        result = self.execute(sql)
        print('update:', result)
        return result

    # Insert, or update when the key already exists
    def insert_or_update(self, table, **kwargs):
        """Insert a row, or update it when a duplicate key exists.

        Values are sent as query parameters, so the driver quotes and escapes
        them; any type the driver can convert is accepted.

        :param table: table name (inserted into the SQL text as is).
        :param kwargs: column name -> value.
        :return: True on success, False on error.
        """
        columns = list(kwargs.keys())
        placeholders = ','.join(['%s'] * len(columns))
        assignments = ','.join('{key}=%s'.format(key=key) for key in columns)
        sql = 'INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE {update};'.format(
            table=table, keys=','.join(columns), values=placeholders, update=assignments)
        # The values appear twice: once for the INSERT part, once for UPDATE.
        params = tuple(kwargs.values()) * 2
        result = self.execute(sql, params)
        print('mysql_insert_or_update:', result)
        return result

    def file_insert_or_update(self, table, **kwargs):
        """Append an insert-or-update statement to ``<table>.sql`` instead of running it.

        Strings are escaped and quoted, numbers are written unquoted and
        ``None`` becomes ``NULL``.

        :param table: table name; also the base name of the output file.
        :param kwargs: column name -> value.
        :return: the string ``'Successful'``.
        """
        columns = list(kwargs.keys())
        literals = [self._sql_literal(value) for value in kwargs.values()]
        insert_part = 'INSERT INTO {table}({keys})values ({values})ON DUPLICATE KEY UPDATE '.format(
            table=table, keys=','.join(columns), values=','.join(literals))
        update_part = ','.join(
            '{key}= {value}'.format(key=key, value=literal)
            for key, literal in zip(columns, literals))
        sql = insert_part + update_part + ';'

        with open('%s.sql' % table, 'a', encoding='utf-8') as f:
            f.write(sql + '\n')

        return 'Successful'


参考链接 https://blog.csdn.net/danengbinggan33/article/details/83338789

你可能感兴趣的:(ac自动机)