余弦文本相似度匹配

使用 Python 连接 SQL Server，对数据进行文本相似度匹配

from sqlalchemy import create_engine,Table,Column,Date,Integer,String,ForeignKey                                                                                    
from fuzzywuzzy import process                                                                                                                                      
import os                                                                                                                                                           
import pymssql                                                                                                                                                      
import pymysql                                                                                                                                                      
import datetime                                                                                                                                                     
import time                                                                                                                                                         
import sys                                                                                                                                                          
import numpy                                                                                                                                                        
#import text2vec                                                                                                                                                    
from text2vec import Similarity                                                                                                                                     
#from longconv import *                                                                                                                                             
from gensim import similarities                                                                                                                                     
import paramiko                                                                                                                                                     
from smb.SMBConnection import *                                                                                                                                     
import csv                                                                                                                                                          
import re                                                                                                                                                           
import sqlalchemy                                                                                                                                                   
import pandas as pd                                                                                                                                                 
from sqlalchemy import MetaData,create_engine,Table,Integer                                                                                                         
import socket                                                                                                                                                       
import pandas                                                                                                                                                       
#import gradio as gr                                                                                                                                                
import datetime                                                                                                                                                     
from scipy.spatial.distance import pdist                                                                                                                            
from text2vec import Similarity                                                                                                                                     
# Wall-clock reference taken at import time; db_conn prints elapsed
# seconds relative to this value.
t1 = time.time()

# Chinese sentence-embedding model (CoSENT), configured for cosine similarity.
sim_model = Similarity(
    model_name_or_path='shibing624/text2vec-base-chinese',
    similarity_type='cosine',
    embedding_type='sbert',
)
                                                                                                                                                                  
                                                                                                                                                                    
                           def ai_text(sentence1, sentence2):                                                                                                                                  
    #scores=[]                                                                                                                                                      
    #dict={}                                                                                                                                                        
    score = sim_model.get_score(sentence1, sentence2)                                                                                                               
                                                                                                                                                                    
   # print("{} \t\t {} \t\t Score: {:.4f}".format(sentence1, sentence2, score))                                                                                     
    return score                                                                                                                                                    
    #dict[sentence2] = score                                                                                                                                        
    #print(dict)                                                                                                                                                    
                                                                                                                                                                    
                                                                                                                                                                    
                                                                                                                                                                                                 def db_conn(db_host,db_user,db_passwd,db_database,db_port):                                                                                                         
    conn=pymssql.connect(server=db_host,user=db_user,password=db_passwd,database=db_database,port=db_port)                                                          
    cur=conn.cursor()                                                                                                                                               
    if not cur:                                                                                                                                                     
        raise Exception('数据库连接失败')                                                                                                                           
    else:print('succeed')                                                                                                                                           
    cur.execute("""                                                                                                                                                 
    select * from  ODS_Legal_WorldCheck_Name""")                                                                                                                    
    wc = cur.fetchall()                                                                                                                                             
    cur2 =conn.cursor()                                                                                                                                             
   # cur2.execute('''select top 50 * from ODS_Legal_Ariba_VendorData_New ''')                                                                                       
    cur2.execute(''' select   *   from                                                                                                                              
          (select   *,   ROW_NUMBER()   OVER   (order   by   SMVendorID)   AS   ROWNUM                                                                              
          from ODS_Legal_Ariba_VendorData_New) t                                                                                                                    
           where   ROWNUM   between   1   and   10  ''')                                                                                                            
    av = cur2.fetchall()                                                                                                                                            
    for row1 in av:                                                                                                                                                 
        max_rs = 0                                                                                                                                                  
        for row2 in wc:                                                                                                                                             
            rs = ai_text(row1[1],row2[1])                                                                                                                           
            if max_rs < rs :                                                                                                                                        
                max_rs = rs                                                                                                                                         
                max_rs = round(max_rs,2)                                                                                                                            
                max_id = row2[0]                                                                                                                                    
                max_name=row2[1]                                                                                                                                    
                                                                                                                                                                    
            print(row1[0],row1[1],max_id,max_name,max_rs)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

        t2=time.time()                                                                                                                                              
        print(t2-t1)                                                                                                                                                                                                                                                                                    
if __name__ == '__main__':
    # Script entry point: run the matching job against the configured
    # SQL Server instance.
    # NOTE(review): connection settings (including the password) are
    # hard-coded here -- consider loading them from configuration.
    db_conn(
        db_host='10.111.*.****',
        db_user='***dmin',
        db_passwd='*******',
        db_database='ODS_***',
        db_port='20001',
    )

你可能感兴趣的:(SqlServer,python)