如何利用Python实现分词和NER的Web服务

        大家都知道Python在数据处理、数据分析和机器学习等方面的强大之处,那么如何使用Python实现Web方式的HTTP应用和服务呢?其实有很多种方式,比如利用Gradio、Streamlit实现Web应用,利用FastAPI实现Web服务等等,具体详见之前的大模型应用文章。本文采用Tornado框架实现HTTP Web服务,并结合自然语言处理(NLP)中分词和实体识别的需求,分别演示如何提供相应的HTTP服务。具体如下:
运行环境:python3.10,tornado,jieba,time,logging等
运行命令:python httpServer_nlp.py
调用方法:http://localhost:8082/cutsegment?content=油气勘探开发文档的语义分析及提取方法的研究与实现三个方面研究&search_type=1
识别结果(分词,search_type=1):{"cut": ["油气勘探", "开发", "文档", "语义", "分析", "提取", "方法", "研究", "三个", "研究"], "entities": [], "returncode": 0, "message": "ok", "runtime": 0.3878319263458252}
识别结果(实体,search_type=2):{"cut": [], "entities": ["方法", "语义", "分析", "提取", "实现", "文档", "方面"], "returncode": 0, "message": "ok", "runtime": 0.4005763530731201}

import json
import logging
import os
import re
import sys
import time
from collections import OrderedDict

import jieba
import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from jieba import analyse
from tornado.options import define, options
# Logging setup: records at INFO and above are appended to ./log/service.log
logger = logging.getLogger()  # no name given, so this is the root logger
logger.setLevel(logging.INFO)  # global minimum level
# logging.FileHandler does not create missing parent directories, so make sure
# ./log exists before the file is opened; otherwise start-up fails with
# FileNotFoundError on a fresh checkout.
os.makedirs('./log', exist_ok=True)
# File handler: append mode, UTF-8, file opened immediately (delay=False)
fileHandler = logging.FileHandler(filename='./log/service.log', mode='a+', encoding='utf-8', delay=False)
# Record layout: timestamp - file[line] - thread name - level: message
format_option = '%(asctime)s - %(filename)s[line:%(lineno)d] - %(threadName)s - %(levelname)s: %(message)s'
fileHandler.setFormatter(logging.Formatter(format_option))
# Attach the file handler to the logger
logger.addHandler(fileHandler)
# Usage example: logger.info("result:{}".format(output))
  
# Register the "port" command-line option (integer, defaults to 8082).
define("port", default=8082, help="--port", type=int)
# Word segmentation
_USER_DICT_PATH = "./data/StopWord/user_dict.txt"
_user_dict_loaded = False


def _ensure_user_dict_loaded():
    """Load the custom jieba dictionary the first time it is needed.

    The flag is only set after a successful load, so a failed load (for
    example a missing file) is retried on the next call.
    """
    global _user_dict_loaded
    if not _user_dict_loaded:
        jieba.load_userdict(_USER_DICT_PATH)
        _user_dict_loaded = True


def get_kg_result_0(text):
    """Segment ``text`` into words and append the dates found in it.

    Steps:
      1. Load the custom dictionary (once per process instead of on every
         call, which re-read the file for each request).
      2. Read the stop-word list.
      3. Replace whole dates with a placeholder so they are not split up.
      4. Segment with jieba in accurate mode and filter the tokens.

    Args:
        text: The string to segment.

    Returns:
        A list containing the filtered tokens followed by the date strings
        that were extracted from ``text``.
    """
    _ensure_user_dict_loaded()
    stwlist = get_stop_words()
    text, theDate = get_date(text)
    out_dict = remove_special_tokens(jieba.cut(text, cut_all=False), stwlist)
    print('\n\n1.加载自定义分词词典:\n' + "/ ".join(out_dict))
    return out_dict + theDate
# Extract keywords ("entities") with jieba's TextRank extractor
def get_entity_0(text):
    """Return the keywords that ``jieba.analyse.textrank`` finds in ``text``.

    The extractor is called with its default arguments. According to the
    original author's note the keywords come back ordered from highest to
    lowest weight.

    Args:
        text: The string to analyse.

    Returns:
        Whatever ``analyse.textrank(text)`` returns.
    """
    print('抽取前多少的关键词' + '//')
    keywords = analyse.textrank(text)
    print(keywords)
    return keywords
# Read the stop-word list
def get_stop_words(path=r'./data/StopWord/NLPIR_stopwords.txt'):
    """Load stop words from a UTF-8 text file with one word per line.

    Args:
        path: Location of the stop-word file.

    Returns:
        A set holding one entry per newline-separated line. Blank lines and
        a trailing newline contribute the empty string to the set.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle; the previous version left it open.
    with open(path, 'r', encoding='utf-8') as stop_file:
        return set(stop_file.read().split('\n'))
# Drop stop words, single-character tokens and blank tokens
def remove_special_tokens(words, stwlist):
    """Filter a token stream, keeping the original order.

    A token is discarded when it
      * appears in ``stwlist``,
      * is exactly one character long (this already covers a lone space), or
      * is empty or consists only of whitespace (for example "\\r\\n").

    Args:
        words: Iterable of string tokens; a generator is fine.
        stwlist: Collection of stop words. A set gives constant-time lookups.

    Returns:
        A new list with the surviving tokens.
    """
    # One pass that builds a new list, instead of popping from the middle of
    # a list while walking it backwards.
    return [
        word
        for word in words
        if word not in stwlist and len(word) != 1 and word.strip()
    ]
  
# Extract whole dates; this has to happen before word segmentation
_DATE_PATTERN = re.compile(
    r'\d{4}-\d{1,2}-\d{1,2}|\d{4}年\d{1,2}月\d{1,2}日|\d{4}/\d{1,2}/\d{1,2}'
)


def get_date(content):
    """Pull dates out of ``content`` and mask them with a placeholder.

    Recognised forms are ``YYYY-M-D``, ``YYYY年M月D日`` and ``YYYY/M/D``, with
    one- or two-digit month and day.

    Args:
        content: The text to scan.

    Returns:
        A ``(masked_text, dates)`` tuple: the text with every matched date
        replaced by the rarely used character "灥", and the list of matched
        date strings in order of appearance.
    """
    result = _DATE_PATTERN.findall(content)
    # Substitute exactly the matched spans. Replacing each found string with
    # str.replace would also hit it where it is only the prefix of a longer
    # date ("2023-1-1" inside "2023-1-12"), leaving stray digits behind.
    content = _DATE_PATTERN.sub("灥", content)
    return content, result
# Tornado request handler: word segmentation / keyword extraction endpoint
class MainGetHandler(tornado.web.RequestHandler):
    """Request handler that serves both GET and POST.

    Parameters (request arguments for GET, JSON object body for POST):
        content: Text to process.
        search_type: ``1`` runs word segmentation, ``2`` runs keyword
            extraction. ``3`` and ``4`` pass validation but have no
            processing branch, so they return empty results.
        uuid: Optional caller-supplied id; only used in error logs.

    Response body: a JSON object with the keys ``cut``, ``entities``,
    ``returncode``, ``message`` and ``runtime`` (seconds).

    Return codes:
        0      ok
        10000  search_type is outside 1..4 or is not an integer
        10001  content is missing
        10002  search_type 1: content is empty, or segmentation failed
        10003  search_type 2: content is empty, or extraction failed
    """

    def _read_params(self, mode):
        """Return ``(content, search_type, uuid)`` for the current request.

        A malformed or non-object JSON body is treated as an empty request,
        and a search_type that cannot be converted to int becomes 0, so both
        cases end in a regular error response instead of an unhandled
        exception.
        """
        if mode == "get":
            sub = self.get_argument("content", None)
            raw_type = self.get_argument("search_type", 0)
            uid = self.get_argument("uuid", "000000")
        else:
            # POST: parameters arrive as a JSON document in the request body.
            try:
                data = json.loads(self.request.body.decode())
            except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
                data = {}
            if not isinstance(data, dict):
                data = {}
            sub = data.get("content")
            raw_type = data.get("search_type", 0)
            uid = data.get("uuid", "000000")
        try:
            search_type = int(raw_type)
        except (TypeError, ValueError):
            search_type = 0
        return sub, search_type, uid

    def recog(self, mode="get"):
        """Process one request and write the JSON response.

        Args:
            mode: ``"get"`` reads request arguments; any other value reads a
                JSON body.
        """
        sub, search_type, uid = self._read_params(mode)

        result = OrderedDict()
        returncode = 0
        message = "ok"
        output = {}
        entity = {}
        start = time.time()
        has_content = sub is not None and sub not in ["", " "]

        # The checks run in sequence and a later one overrides an earlier
        # one, which keeps the precedence of the return codes unchanged.
        if search_type < 1 or search_type > 4:
            returncode = 10000
            message = "search_type is error"
        # Only content is checked here: the earlier version also tested
        # "rel" and "obj", which were never defined and raised NameError.
        if sub is None:
            returncode = 10001
            message = "data is null"
        if search_type == 1:  # word segmentation; content must not be empty
            if not has_content:
                returncode = 10002
                message = "when search_type is 1, content not null"
            else:
                try:
                    output = get_kg_result_0(sub)
                    entity = []
                except Exception:
                    # Request boundary: log with traceback and report a
                    # service error instead of letting the request crash.
                    logger.exception("uuid=%s, segmentation failed", uid)
                    returncode = 10002
                    message = "service error"
        elif search_type == 2:  # keyword extraction; content must not be empty
            if not has_content:
                returncode = 10003
                message = "when search_type is 2, content and rel not null"
            else:
                try:
                    output = []
                    entity = get_entity_0(sub)
                except Exception:
                    logger.exception("uuid=%s, keyword extraction failed", uid)
                    returncode = 10003
                    message = "service error"

        detal = time.time() - start
        # Assemble the JSON response
        result["cut"] = output
        result["entities"] = entity
        result["returncode"] = returncode
        result["message"] = message
        result["runtime"] = detal
        logger.info("result:{}".format(result))  # local log
        self.write(json.dumps(result, ensure_ascii=False))
        self.finish()

    def get(self):
        """Handle a GET request."""
        self.recog(mode="get")

    def post(self):
        """Handle a POST request."""
        self.recog(mode="post")
# Entry point: start the HTTP server
if __name__ == "__main__":
    sys.path.append("../")  # adds the parent directory to the import path
    # Parse the command line first so that a --port override is already in
    # effect when the port is bound and reported.
    tornado.options.parse_command_line()
    # Route table; per the original note the path has to match the one
    # configured in nginx.
    application = tornado.web.Application([(r"/cutsegment", MainGetHandler)])
    http_server = tornado.httpserver.HTTPServer(application)
    http_server.listen(options.port)
    print("Server is listening,Port:" + str(options.port) + " ...")
    # IOLoop.current() replaces the deprecated alias IOLoop.instance().
    tornado.ioloop.IOLoop.current().start()

你可能感兴趣的:(数据应用,后端,数据处理,python,http,开发语言)