nlp(一)语种检测

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
# NGram类计算字母序列出现的频率
class NGram(object):
    """Character n-gram frequency profile of a text.

    The profile is stored in ``table``, a dict mapping each character
    sequence to the number of times it occurs, and is treated as a sparse
    vector so that two profiles can be compared by the cosine of the angle
    between them (see ``__sub__``).

    Attributes:
        n: Length of each character sequence.
        table: Dict mapping character sequence -> occurrence count.
        length: Euclidean magnitude of the count vector, as computed by
            the last call to ``calculate_length``.
    """

    def __init__(self, text, n=3):
        """Build the profile of ``text``.

        Args:
            text: The input string to profile.
            n: Length of the character sequences to count (default 3).
        """
        self.length = None
        self.n = n
        self.table = {}
        self.parse_text(text)
        self.calculate_length()

    def parse_text(self, text):
        """Count the character sequences of ``text`` into ``self.table``.

        Runs of whitespace are collapsed to a single space and one trailing
        space is appended. Counts are added to whatever ``self.table``
        already holds; ``self.length`` is not refreshed here.
        """
        # Sliding window, initially all spaces so that the first characters
        # of the text are counted as following leading padding.
        window = ' ' * self.n
        for letter in " ".join(text.split()) + " ":
            window = window[1:] + letter  # drop oldest char, append newest
            self.table[window] = self.table.get(window, 0) + 1

    def calculate_length(self):
        """Treat the table as a vector, store and return its magnitude.

        Returns:
            The square root of the sum of squared counts.
        """
        self.length = sum(count * count for count in self.table.values()) ** 0.5
        return self.length

    def __sub__(self, other):
        """Return the cosine distance between two profiles.

        The result is ``1 - cos(angle)`` between the two count vectors,
        clamped to the range [0, 1]: 0 means the profiles point in exactly
        the same direction, 1 means they share no character sequence.

        Raises:
            TypeError: If ``other`` is not an NGram, or uses a different n.
        """
        if not isinstance(other, NGram):
            raise TypeError("Can't compare NGram with non-NGram object.")

        if self.n != other.n:
            raise TypeError("Can't compare NGram objects of different size.")

        # Dot product over the sequences present in both tables.
        total = sum(
            count * other.table.get(gram, 0)
            for gram, count in self.table.items()
        )

        denominator = float(self.length) * float(other.length)
        if denominator == 0:
            # An empty profile has nothing in common with anything.
            return 1.0

        distance = 1.0 - float(total) / denominator
        # Rounding in sqrt/divide can push the value marginally outside
        # [0, 1] (e.g. a tiny negative number for identical profiles).
        return min(1.0, max(0.0, distance))


if __name__ == '__main__':
    # Classify the test files 1.txt .. 10.txt by comparing each one's
    # 3-gram profile with the profile of every training corpus and
    # printing the language whose profile is closest.
    training_text = {
        'English': 'training set EN.txt',
        'German': 'training set DE.txt',
        'French': 'training set FR.txt',
        'Italian': 'training set IT.txt',
    }
    encodingType = "UTF-8"
    readType = 'r'

    # Build each language profile once; it does not depend on the test
    # file. Training files are decoded the same way as the test files so
    # both sides of the comparison see the same characters.
    profiles = {}
    for language, path in training_text.items():
        with open(path, readType, encoding=encodingType,
                  errors='ignore') as training_file:
            profiles[language] = NGram(training_file.read(), n=3)

    for index in range(1, 11):
        filename = str(index) + '.txt'
        with open(filename, readType, encoding=encodingType,
                  errors='ignore') as text_file:
            sample = NGram(text_file.read(), n=3)

        # Distance from every language profile to this sample.
        languages = {
            language: profile - sample
            for language, profile in profiles.items()
        }
        print(filename, "is", min(languages.items(), key=lambda x: x[1])[0])

1、实验结果

nlp(一)语种检测_第1张图片

2、实验环境

Win8.1    Anaconda3.6.0 PyCharm2016.1.4

3、语料

四种语言的训练语料(英语、德语、法语、意大利语,各对应一个 training set 文件),以及 10 个测试集(1.txt 至 10.txt)



你可能感兴趣的:(nlp(一)语种检测)