# 简易中文分词聚类(Python) -- Simple Chinese word segmentation and clustering (Python)

# -*- coding: utf-8 -*-
__author__ = 'Zhao'

import bisect
import operator
import re

# Single-element lists so that tree() can read and rebind the values in place.
blank = [chr(183)]  # filler character (middle dot) used for indentation
tabs = ['']         # indentation prefix accumulated while descending


def tree(lst):
    """Print ``lst`` as a box-drawing tree, recursing into nested lists/tuples.

    Every leaf is printed on its own line after a connector; a nested
    list/tuple continues on the same line as its connector.  The shared
    prefix ``tabs[0]`` grows by three characters before each recursive call
    and loses its last three characters when a call finishes.

    :param lst: list or tuple of leaves and/or nested lists/tuples.
    """
    count = len(lst)
    dash_pair = '─' * 2

    if count == 0:
        print('─' * 3)

    for pos, item in enumerate(lst):
        is_last = pos + 1 == count

        # The first child continues the parent's line; later ones start a
        # new line and therefore need the accumulated prefix.
        if pos != 0:
            print(tabs[0], end='')

        if count == 1:
            connector = '─' + dash_pair
        elif pos == 0:
            connector = '┬' + dash_pair
        elif is_last:
            connector = '└' + dash_pair
        else:
            connector = '├' + dash_pair
        print(connector, end='')

        if isinstance(item, (list, tuple)):
            # Below the last child nothing continues, so pad with filler only;
            # otherwise keep a vertical bar for the siblings still to come.
            tabs[0] += blank[0] * 3 if is_last else '│' + blank[0] * 2
            tree(item)
        else:
            print(" ", item)

    # Undo the three prefix characters the caller added for this level.
    tabs[0] = tabs[0][:-3]


def judge_element_delete(list_input, centroid, group, match_num):
    """Delete from ``centroid`` and ``group`` every row that matches ``match_num``.

    A position matches when the entry of ``list_input`` at that position
    equals ``match_num``, or, if the entry is a list, when any of its members
    equals ``match_num``.  The same positions are removed from both
    ``centroid`` and ``group``; both are modified in place.

    Matching positions are collected before anything is deleted and are then
    removed from the back, so earlier deletions cannot shift later ones.
    Each matching position is removed exactly once, even when ``list_input``
    holds duplicate entries or a sublist contains ``match_num`` several times.

    :param list_input: entries to inspect (scalars and/or lists).
    :param centroid: list modified in place, indexed like ``list_input``.
    :param group: list modified in place, indexed like ``list_input``.
    :param match_num: value to look for.
    :raises IndexError: if ``centroid`` or ``group`` is shorter than a
        matching position requires.
    """
    doomed = []
    for position, entry in enumerate(list_input):
        if isinstance(entry, list):
            matched = any(member == match_num for member in entry)
        else:
            matched = entry == match_num
        if matched:
            doomed.append(position)

    # Delete back to front so the remaining positions stay valid.
    for position in reversed(doomed):
        del centroid[position]
        del group[position]


# --------------- in this part we save the list as list ---------------
path = '/Users/apple/desktop/'

# Read the word list (one word per line).  The context manager closes the file
# even if reading fails, and the encoding is stated explicitly so that decoding
# the Chinese text does not depend on the platform's default encoding.
with open(path + 'list.txt', encoding='utf-8') as fp:
    ori = fp.readlines()
# ori holds the raw lines, untouched.

# copy holds the same words without their trailing newline, sorted so that the
# segmentation step can locate a word by binary search.
copy = sorted(line.rstrip('\n') for line in ori)

# --------------- this part end ---------------

# The average word length in this list is 2, so the step (window size) is set to 5:
# a window of that size always covers at least one word.
# In total the list holds 56064 words, and only 56 of them are longer than 5 characters,
# so 5 is a reasonable step for this program.

# sum = 0
# num = 0
# for x in copy:
#     sum += len(x)
#     num += 1
# average = (int)(sum/num)
# print(average, ' ', num);

# max_lenth = 0
# for x in copy:
#     if max_lenth < len(x):
#         max_lenth = len(x)
#
# print(max_lenth)

# number = 0
# for x in copy:
#     if len(x) > 5:
#         number += 1
#
# print(number)

# --------------- the upper is the calculation in the preparation ---------------

str_input = input("请输入一个段落:\n")

# Marks deleted from the input, applied one after another in this order.
# Plain str.replace is used instead of regular expressions: the marks are
# literal text, and an unescaped '?' is not a valid regex pattern.
# NOTE(review): the original listed each ASCII mark twice in a row; the second
# copy is assumed to have been the fullwidth form and is restored here as an
# escape sequence so it cannot be silently normalised again -- confirm.
_STRIP_MARKS = (
    ',',
    '\uff0c',  # fullwidth comma
    '.',
    '。',
    '——',
    '……',
    '!',
    '\uff01',  # fullwidth exclamation mark
    '?',
    '\uff1f',  # fullwidth question mark
    ';',
    '\uff1b',  # fullwidth semicolon
    ' ',
)
for mark in _STRIP_MARKS:
    str_input = str_input.replace(mark, '')

# All punctuation is removed (replaced by nothing), so text on either side of a removed
# mark becomes adjacent and may be split wrongly.
# The fixed step can also split at the wrong place anyway, so this mistake is left unfixed.

step = 5  # longest candidate word, in characters, tried at each position


def _lookup(words, candidate):
    """Return the index of ``candidate`` in the sorted list ``words``, or -1.

    When ``words`` contains ``candidate`` more than once, the leftmost index
    is returned.
    """
    pos = bisect.bisect_left(words, candidate)
    if pos < len(words) and words[pos] == candidate:
        return pos
    return -1


def _segment(text, words, max_len):
    """Split ``text`` into tokens by forward maximum matching against ``words``.

    At each position the longest window (at most ``max_len`` characters) is
    tried first and shortened one character at a time until it is found in
    ``words``.  If no window matches, the single character at that position
    becomes a token of its own.

    :param text: string to segment.
    :param words: sorted list of dictionary words.
    :param max_len: maximum window length in characters.
    :returns: ``(tokens, indices)`` -- the tokens in order, and for each token
        its index in ``words`` or -1 when it is not a dictionary word.
    """
    tokens = []
    indices = []
    cursor = 0
    # Runs up to and including the final character, so a trailing
    # single-character token is not lost.
    while cursor < len(text):
        longest = min(max_len, len(text) - cursor)
        for width in range(longest, 0, -1):
            candidate = text[cursor:cursor + width]
            found = _lookup(words, candidate)
            if found != -1:
                break
        else:
            # No window matched: emit the single character as unknown.
            candidate = text[cursor]
            found = -1
        tokens.append(candidate)
        indices.append(found)
        cursor += len(candidate)
    return tokens, indices


# result: the tokens; ch_index: dictionary index of each token (-1 = unknown).
result, ch_index = _segment(str_input, copy, step)


# Agglomerative clustering of the tokens on their one-dimensional centroid
# values.  Each pass merges the most similar entries into one nested list until
# at most two top-level entries remain, then the nesting is printed as a tree.
group = result       # same list object as `result`; merging mutates it in place
centroid = ch_index  # same list object as `ch_index`; one value per `group` entry

# Number of decimals kept when rounding similarities: the longest token length.
precision = max((len(token) for token in group), default=0)

# `> 2` rather than `!= 2`: inputs with fewer than two tokens, or a merge that
# takes the count from three or more straight down to one, must also stop.
while len(group) > 2:
    size = len(group)

    # Pairwise distance between centroids (truncated to int, as merged
    # centroids are floats).
    distances = [[abs(int(a) - int(b)) for b in centroid] for a in centroid]

    farthest = max(max(row) for row in distances)
    if farthest == 0:
        # Every centroid is identical, so distances cannot be normalised
        # (division by zero); leave the remaining entries unmerged.
        break

    # Normalised similarity in [0, 1].  A value that rounds to exactly 1 is
    # reset to 0, which masks the diagonal (an entry compared with itself).
    # NOTE(review): this also masks distinct entries whose centroids are equal
    # or round to 1, so they are never chosen as the closest pair -- confirm
    # whether that is intended.
    similar = []
    for row in distances:
        scores = []
        for dist in row:
            score = round(1 - dist / farthest, precision)
            scores.append(0 if score == 1 else score)
        similar.append(scores)

    best = max(max(row) for row in similar)

    # First pair, in row-major order, that reaches the best similarity.
    first, second = next(
        (i, j)
        for i in range(size)
        for j in range(size)
        if similar[i][j] == best
    )

    # The pair plus every entry equally similar to either member.
    members = [first, second]
    members += [k for k in range(size)
                if similar[first][k] == best and k != second]
    members += [k for k in range(size)
                if similar[second][k] == best and k != first]
    # Keep each index once (first occurrence wins) so that no token is
    # duplicated in the merged cluster and the mean is not skewed.
    members = list(dict.fromkeys(members))

    merged_tokens = [group[k] for k in members]
    merged_centroid = sum(centroid[k] for k in members) / len(members)

    # Remove the merged entries by position (no sentinel value that a real
    # token could collide with), in place so the aliases stay in sync.
    absorbed = set(members)
    group[:] = [entry for k, entry in enumerate(group) if k not in absorbed]
    centroid[:] = [value for k, value in enumerate(centroid) if k not in absorbed]

    group.append(merged_tokens)
    centroid.append(merged_centroid)

print(group)
tree(group)

# 你可能感兴趣的:(简易中文分词聚类(Python)) -- You may also be interested in: Simple Chinese word segmentation and clustering (Python)