Introduction
Today I came across a post on the AINLP WeChat account about a short-text clustering tool. Since a project at my company happens to involve short-text clustering, I took a closer look. The project is not complicated: it measures similarity with the Jaccard coefficient (the metric can easily be swapped for another one), and its main selling point is being memory-friendly, which helps when clustering large batches of data.
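The repository's helper utilities are not shown in the walkthrough below. As a rough orientation only, minimal versions of jaccard and sample_file that are consistent with how cluster.py calls them might look like the sketch here; the real implementations in the repo may differ.

import random

def jaccard(a, b):
    # Jaccard similarity between two token lists:
    # |intersection| / |union| over the unique tokens.
    set_a, set_b = set(a), set(b)
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)

def sample_file(path, k):
    # Read a cluster file and return at most k of its lines.
    # Sampling a few lines back from disk, instead of keeping whole
    # clusters in memory, is what makes the tool memory-friendly.
    with open(path, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]
    return lines if len(lines) <= k else random.sample(lines, k)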
Code walkthrough
cluster.py
The main() function in cluster.py is the tool's entry point:
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    # == check that the input file exists and make sure the output directory exists

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    # == remove files in the output directory whose names match the cluster output naming pattern
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'
    # load stop words
    # == load the stop-word list
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()
    # load tokenizer
    # == build the tokenizer; two modes are supported, covering both Chinese and English
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        # iterate over every line of the input file
        line = line.rstrip()
        is_match = False
        # tokenize and remove stop words
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            # == p_bucket maps a token to the list of file names of existing clusters indexed under it
            w_bucket = p_bucket[wd]
            # is_match = False
            for bucket in w_bucket:
                # == for each cluster already indexed under this token, fetch lines from its file:
                # == take all of them if there are no more than args.sample_number (five here), otherwise sample that many
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                # == tokenize the sampled lines and remove stop words
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # == if the Jaccard coefficient between the current line and every sampled line
                # == exceeds the threshold, assign the line to this cluster
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line+'\n')
                    break
            # == this is the weak spot: making the decision here, and breaking out below after only
            # == the first token, means sentences made of the same words in a different order
            # == cannot be clustered together
            if not is_match:
                bucket_name = ('tmp' + id_name).format(save_idx)
                w_bucket.append(bucket_name)
                bucket_path = os.path.join(args.output, bucket_name)
                with open(bucket_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(line+'\n')
                save_idx += 1
            break

    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
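To make the control flow concrete, here is a small self-contained toy version of the per-line loop. It is not the repo's code: clusters live in an in-memory dict instead of output files, stop words and sampling are dropped, and a plain whitespace split stands in for Segmentor, but the first-token lookup, the threshold test, and the creation of new buckets follow the same logic as above.

from collections import defaultdict

def jaccard(a, b):
    # set-based Jaccard, as in the sketch earlier
    sa, sb = set(a), set(b)
    return len(sa & sb) / len(sa | sb) if (sa | sb) else 0.0

def toy_cluster(lines, threshold=0.5):
    p_bucket = defaultdict(list)   # token -> names of buckets indexed under it
    buckets = {}                   # bucket name -> member lines (stands in for the output files)
    save_idx = 0
    for line in lines:
        tokens = line.split()      # whitespace split stands in for Segmentor
        is_match = False
        for wd in tokens:
            for name in p_bucket[wd]:          # only buckets indexed under this token are checked
                members = [m.split() for m in buckets[name]]
                if all(jaccard(tokens, cmp) > threshold for cmp in members):
                    is_match = True
                    buckets[name].append(line)
                    break
            if not is_match:                   # no bucket matched: open a new one under this token
                name = 'tmp{}'.format(save_idx)
                p_bucket[wd].append(name)
                buckets[name] = [line]
                save_idx += 1
            break                              # as in the original: only the first token is ever used
    return buckets

print(toy_cluster(['refund my order please',
                   'refund my order now',
                   'reset my password']))
# {'tmp0': ['refund my order please', 'refund my order now'], 'tmp1': ['reset my password']}

The second line joins the first line's cluster only because it happens to start with the same token; that limitation is exactly what the next section is about.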
Evaluation
Pros: memory-friendly; clusters are kept in files on disk and only a small sample of each candidate cluster is read back for comparison.
Cons: the project has a bug; sentences made up of the same words in a different order are not grouped into the same cluster. My suggestion: after a line is added to an existing cluster, don't break out of the word loop; instead, skip any further cluster assignment or creation, but append that cluster to the p_bucket list of every word in the sentence. The new-cluster logic also needs adjusting: if the first word fails to match, put it into a temporary word list and move on to the next word; only when every word has failed should a new cluster be created, and that new cluster should then be appended to the p_bucket lists of all the words collected in the temporary list.
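A quick illustration of why this happens, again with a whitespace-tokenized toy example: the Jaccard score itself ignores word order, so the failure comes purely from the first-token lookup and the early break.

a = 'refund my order'.split()
b = 'my order refund'.split()   # same words, different order

# Jaccard is order-insensitive, so by the metric these two lines belong together:
print(len(set(a) & set(b)) / len(set(a) | set(b)))   # 1.0

# But after the original loop processes the first line, only p_bucket['refund']
# points at its cluster. The second line starts with 'my', finds p_bucket['my']
# empty, and immediately opens a brand-new cluster.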
Project links
Original project (I have opened an issue, so this problem may get fixed):
https://github.com/RandyPen/TextCluster
A fork of the buggy version is kept here:
https://github.com/612yese/TextCluster
Update 1: the author's fix
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'
    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            ################# addition begin ####################
            if is_match:
                break
            ################# addition end ######################
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line+'\n')
                    ################# addition begin ####################
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    ################# addition end ######################
                    break
        ############ changed: indentation reduced, block moved out of the word loop ############
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line+'\n')
            ################# addition begin ####################
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            ################# addition end ######################
            save_idx += 1

    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
The author's addition clearly solves the problem: the logic for creating a new cluster is moved out of the per-word for loop, and when a line joins an existing cluster, that cluster is also added to the p_bucket entry of every word in the line, so a later sentence with the same words in a different order can find the cluster through whichever word it starts with.
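As a final sanity check on the fixed indexing, here is the same toy example used in the Evaluation section, showing what the added loops record (a toy trace, not the repo's code):

from collections import defaultdict

p_bucket = defaultdict(list)
bucket_name = 'tmp0'

# What the added loops do after 'refund my order' lands in tmp0:
for w in 'refund my order'.split():
    if bucket_name not in p_bucket[w]:
        p_bucket[w].append(bucket_name)

print(dict(p_bucket))   # {'refund': ['tmp0'], 'my': ['tmp0'], 'order': ['tmp0']}

# The permuted line 'my order refund' now starts its lookup at p_bucket['my'],
# finds tmp0 there, passes the Jaccard test, and joins the same cluster.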