计算两个URL的相似度 编辑距离和docsim

在教育领域,追踪学习者的学习行为活动是分析学习者学习的一种有效的处理方式,这里处理一批url,通过处理URL形成相似度矩阵,再进一步进行聚类,及以后的相关处理。
计算两个文本间(这里的文本指两个url)的相似度有多种方法。在NLP领域,一般处理文本文件相似度时,常用docsim/doc2vec/LSH比较两个文档之间的相似度:先通过jieba分词,然后使用上面的相关算法,找出文本中与某一句话或者某一段话相似度最高的topN,并计算相似度。这里用这种方式不是很适合,因为url文本中各个词的先后顺序是重要的。经过搜索资料发现Levenshtein距离(编辑距离)符合这类问题的求解,所以这里使用Levenshtein距离求解。

levenshtein距离求解

def Edit_distance_str(str1, str2):
    """Return the Levenshtein distance and a normalized similarity of two strings.

    The similarity is ``1 - distance / max(len(str1), len(str2))``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A dict with the keys ``'Distance'`` (the value returned by
        ``Levenshtein.distance``) and ``'Similarity'`` (float). Two empty
        strings yield a distance of 0 and a similarity of 1.0.
    """
    longest = max(len(str1), len(str2))
    if longest == 0:
        # Two empty strings are identical; answering here also avoids
        # dividing by zero in the normalization below.
        return {'Distance': 0, 'Similarity': 1.0}
    # Third-party package, imported at call time inside the function.
    import Levenshtein
    edit_distance = Levenshtein.distance(str1, str2)
    similarity = 1 - (edit_distance / longest)
    return {'Distance': edit_distance, 'Similarity': similarity}
 
 
def Edit_distance_array(str_ary1, str_ary2):
    """Return the element-wise edit distance and similarity of two sequences.

    This is the Levenshtein recurrence applied to sequence elements instead
    of characters: when the inputs are lists of words, the distance counts
    word insertions, deletions and substitutions rather than character
    edits.

    Args:
        str_ary1: First sequence (for example a list of URL tokens).
        str_ary2: Second sequence.

    Returns:
        A dict with the keys ``'Distance'`` (int) and ``'Similarity'``
        (``1 - distance / max(len(str_ary1), len(str_ary2))``). Two empty
        sequences yield a distance of 0 and a similarity of 1.0.
    """
    longest = max(len(str_ary1), len(str_ary2))
    if longest == 0:
        # Nothing to compare: treat as identical and avoid dividing by zero.
        return {'Distance': 0, 'Similarity': 1.0}

    # previous_row[i] is the distance between the first i elements of
    # str_ary1 and the prefix of str_ary2 consumed so far (initially empty).
    previous_row = list(range(len(str_ary1) + 1))
    for j, item2 in enumerate(str_ary2, start=1):
        # Column 0: turning an empty prefix into j elements costs j inserts.
        current_row = [j]
        for i, item1 in enumerate(str_ary1, start=1):
            cost = 0 if item1 == item2 else 1
            current_row.append(min(
                previous_row[i] + 1,
                current_row[i - 1] + 1,
                previous_row[i - 1] + cost,
            ))
        previous_row = current_row

    distance = previous_row[-1]
    similarity = 1 - distance / longest
    return {'Distance': distance, 'Similarity': similarity}

使用正则表达式对url进行切割,这样保持最初的顺序,不需要使用jieba分词

def url_to_arr(url):
    """Split *url* into an ordered list of tokens.

    The string is cut at every colon, slash, single space and run of
    backslash characters. Empty strings produced by leading or adjacent
    separators are kept in the result.
    """
    # NOTE(review): in a raw string `\\+` matches one or more backslashes,
    # not a literal "+" -- confirm whether splitting on "+" was intended.
    return re.split(r'\\+|:|/| ', url)

下面介绍一下其他文本的相似度计算,一般有两种方法:gensim和MinHash。gensim根据不同场合分wordsim和docsim,背后使用的距离公式应该是余弦相似度;MinHash使用的是LSH算法,背后使用的距离公式应该是Jaccard距离。这一块还不是很确定,后续会再好好研究下补充进来。

docsim求解:

import gensim
import jieba
import re
# 训练样本
from gensim import corpora
from gensim.similarities import Similarity

def url_similarity(url1, url2):
    """Print and return the gensim bag-of-words similarity of url1 to url2.

    Both URLs are split on "/" into tokens, a gensim dictionary and
    bag-of-words corpus are built from the two token lists, and url1 is
    queried against a ``Similarity`` index over that corpus.

    Args:
        url1: Query URL string.
        url2: URL string to compare against.

    Returns:
        The similarity score reported for ``url2`` (document index 1), or
        0.0 when the index reports no entry for it. The pair
        ``(1, score)`` is also printed.
    """
    corpora_documents = [re.split(r'/', url) for url in (url1, url2)]

    # Build the dictionary and the bag-of-words corpus.
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(tokens) for tokens in corpora_documents]

    # Size the vectors from the dictionary rather than a fixed number, so
    # URLs with many distinct segments still fit.
    # NOTE(review): 'Similarity-index' is passed as the index output prefix;
    # presumably shard files are written under that name -- confirm.
    similarity = Similarity('Similarity-index', corpus,
                            num_features=len(dictionary))
    similarity.num_best = 5

    # corpus[0] is url1's bag-of-words; with num_best set the query is
    # expected to return (document_index, similarity) tuples.
    ranked = similarity[corpus[0]]

    # Look the score up by document index instead of by rank position:
    # position 1 is not guaranteed to be url2 when scores tie, and
    # presumably the entry is absent when the URLs share no token
    # (TODO confirm against gensim's num_best behaviour).
    score = dict(ranked).get(1, 0.0)
    print((1, score))
    return score

单纯使用余弦相似度求解:

import datetime
import math
import re
import time
from collections import Counter

# Sample request paths passed to compute_cosine by the __main__ demo below.
text1 = "/courses/course-v1:TsinghuaX+80512073X+2018_T1/xblock/block-v1:TsinghuaX+80512073X+2018_T1+type@video+block@6d693bf2712747c3881d928fd32fa5ba/handler/transcript/download"
text2 = "/courses/course-v1:TsinghuaX+80512073X+2018_T1/courseware/4b79872dd89a4bc79623a1a19ef2dc3d/ce004d72c2e2450a8aff54e869af5d50/"

def _token_counts(text):
    """Count the normalized "/"-separated segments of *text*."""
    counts = Counter()
    for segment in text.split('/'):
        # Keep ASCII letters only and fold case; segments that end up
        # empty are not counted.
        token = re.sub('[^a-zA-Z]', '', segment).lower()
        if token:
            counts[token] += 1
    return counts


def compute_cosine(text_a, text_b):
    """Return the cosine similarity of two "/"-separated strings.

    Each string is split on "/", every segment is reduced to its
    lower-cased ASCII letters, and the resulting token frequencies form the
    two vectors that are compared.

    Args:
        text_a: First string (for example a URL path).
        text_b: Second string.

    Returns:
        The cosine similarity rounded to two decimal places, or 0.0 when
        either string yields no tokens.
    """
    counts_a = _token_counts(text_a)
    counts_b = _token_counts(text_b)

    # Tokens missing from counts_b contribute 0 to the dot product
    # (Counter returns 0 for absent keys).
    dot_product = sum(
        count * counts_b[token] for token, count in counts_a.items()
    )
    squares_a = sum(count ** 2 for count in counts_a.values())
    squares_b = sum(count ** 2 for count in counts_b.values())

    if not squares_a or not squares_b:
        # A zero-length vector has no direction; report no similarity
        # instead of dividing by zero.
        return 0.0
    return round(
        float(dot_product) / (math.sqrt(squares_a) * math.sqrt(squares_b)), 2
    )


if __name__ == '__main__':
    # Demo run: score the two sample paths and show the result.
    print(compute_cosine(text1, text2))

你可能感兴趣的:(数据挖掘)