以下以同一个 Excel 文件为例:用 Sheet2 中的词语去匹配 Sheet1 中的词语,找出模糊匹配结果。
导入数据:读取 Excel 中的 Sheet1(被匹配的目标词库)和 Sheet2(需要进行匹配的词)
import pandas as pd
import jieba
# Words that need to be matched: cell values of Sheet2 as a NumPy array.
attendee = pd.read_excel('路径/testnn.xlsx', sheet_name='Sheet2').values
# Target vocabulary to match against: cell values of Sheet1 as a NumPy array.
account = pd.read_excel('路径/testnn.xlsx', sheet_name='Sheet1').values
#print(attendee)
#print(account)
…………………………………………………………
将需要匹配的词语和目标词语分别分词,通过对比分词的匹配度来判定两者的关联关系。
1、导入 jieba 分词包,对目标词和待匹配词进行分词,并将分词结果存入新字典中
# Tokenization results for the words to be matched:
# maps each name (first column of a row) to its list of jieba tokens.
Sheet2 = {
    row[0]: list(jieba.cut(row[0], cut_all=False))
    for row in attendee
}
#print(Sheet2)
# Tokenization results for the target vocabulary:
# maps each name (first column of a row) to its list of jieba tokens.
Sheet1 = {
    row[0]: list(jieba.cut(row[0], cut_all=False))
    for row in account
}
#print(Sheet1)
结果:
2、遍历分词后结果字典,对比相同的关键词并记录匹配情况
# Compare every Sheet1 name with every Sheet2 name token by token and print
# each pair in which more than half of the Sheet1 tokens are also present in
# the Sheet2 name.
for i in Sheet1:
    if i in Sheet2:
        # The name appears verbatim in Sheet2: remember it as an exact match.
        resultstr = i
    # Tokens of the Sheet1 name; 'xxxx' stands for noise tokens that have
    # been manually chosen for exclusion.
    origin_l = [k for k in Sheet1[i] if k != 'xxxx']
    origin_num = len(origin_l)
    for j in Sheet2:
        # Tokens of the Sheet2 name, with the same noise filter applied.
        target_l = [h for h in Sheet2[j] if h != 'xxxx']
        target_num = len(target_l)
        # Count each Sheet1 token at most once. Counting (origin, target)
        # pairs would let a token repeated in the Sheet2 name push match_num
        # above origin_num and defeat the majority test below.
        target_tokens = set(target_l)
        match_num = sum(1 for k in origin_l if k in target_tokens)
        # Keep the pair only when matched tokens outnumber unmatched ones.
        if match_num > origin_num - match_num:
            data = {'origin_str': i, 'target_str': j, 'origin_l': origin_l, 'target_l': target_l,'origin_num': origin_num, 'target_num':target_num, 'match_num':match_num}
            print(data)
import pandas as pd
import jieba
# Words that need to be matched: cell values of Sheet2 as a NumPy array.
attendee = pd.read_excel('路径/testnn.xlsx', sheet_name='Sheet2').values
# Target vocabulary to match against: cell values of Sheet1 as a NumPy array.
account = pd.read_excel('路径/testnn.xlsx', sheet_name='Sheet1').values
#print(attendee)
#print(account)
# Tokenization results for the words to be matched:
# maps each name (first column of a row) to its list of jieba tokens.
Sheet2 = {
    row[0]: list(jieba.cut(row[0], cut_all=False))
    for row in attendee
}
#print(Sheet2)
# Tokenization results for the target vocabulary:
# maps each name (first column of a row) to its list of jieba tokens.
Sheet1 = {
    row[0]: list(jieba.cut(row[0], cut_all=False))
    for row in account
}
#print(Sheet1)
# Compare every Sheet1 name with every Sheet2 name token by token and print
# each pair in which more than half of the Sheet1 tokens are also present in
# the Sheet2 name.
for i in Sheet1:
    if i in Sheet2:
        # The name appears verbatim in Sheet2: remember it as an exact match.
        resultstr = i
    # Tokens of the Sheet1 name; 'xxxx' stands for noise tokens that have
    # been manually chosen for exclusion.
    origin_l = [k for k in Sheet1[i] if k != 'xxxx']
    origin_num = len(origin_l)
    for j in Sheet2:
        # Tokens of the Sheet2 name, with the same noise filter applied.
        target_l = [h for h in Sheet2[j] if h != 'xxxx']
        target_num = len(target_l)
        # Count each Sheet1 token at most once. Counting (origin, target)
        # pairs would let a token repeated in the Sheet2 name push match_num
        # above origin_num and defeat the majority test below.
        target_tokens = set(target_l)
        match_num = sum(1 for k in origin_l if k in target_tokens)
        # Keep the pair only when matched tokens outnumber unmatched ones.
        if match_num > origin_num - match_num:
            data = {'origin_str': i, 'target_str': j, 'origin_l': origin_l, 'target_l': target_l,'origin_num': origin_num, 'target_num':target_num, 'match_num':match_num}
            print(data)
直接调用 fuzzywuzzy 包进行判断,采用编辑距离(Levenshtein 距离)匹配方式。
编辑距离是指两个字符串之间,由一个转换成另一个所需的最少编辑操作次数。
编辑操作包括:将一个字符替换成另一个字符、插入一个字符、删除一个字符。
一般来说,编辑距离越小,两个字符串的相似度越大。
整体代码
import pandas as pd
import jieba
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Words that need to be matched: cell values of Sheet2 as a NumPy array.
attendee = pd.read_excel('路径/testnn.xlsx', sheet_name='Sheet2').values
# Target vocabulary to match against: cell values of Sheet1 as a NumPy array.
account = pd.read_excel('路径/testnn.xlsx', sheet_name='Sheet1').values
# Map each Sheet2 name (first column of a row) to its list of jieba tokens,
# then show the resulting dictionary.
Sheet2 = {
    row[0]: list(jieba.cut(row[0], cut_all=False))
    for row in attendee
}
print(Sheet2)
# Map each Sheet1 name (first column of a row) to its list of jieba tokens,
# then show the resulting dictionary.
Sheet1 = {
    row[0]: list(jieba.cut(row[0], cut_all=False))
    for row in account
}
print(Sheet1)
# For every Sheet1 name, find the closest Sheet2 name with fuzzywuzzy and
# export the (search name, best match, score) rows to an Excel workbook.

# Candidate names to search in: every key of Sheet2.
target_l = list(Sheet2)
data = []
for i in Sheet1:
    # With a list of choices, extractOne returns a (choice, score) pair.
    # Call it once per name instead of once per field.
    best_name, best_score = process.extractOne(i, target_l)
    data.append({'搜索公司': i, '目标公司': best_name, '目标权重': best_score})
print(data)
df1 = pd.DataFrame(data)
print(df1)
# The context manager saves and closes the workbook on exit.
# ExcelWriter.save() was removed in pandas 2.0, so it must not be called.
with pd.ExcelWriter('路径/testmm.xlsx') as writer:
    df1.to_excel(writer, sheet_name='Final')