Text classification pipeline:
1. Feature selection:
Crawl the web page -> preprocess the page and extract the Chinese text -> run Chinese word segmentation and part-of-speech tagging -> keep only the nouns -> count term frequencies -> output the term-document matrix -> compute the information gain (IG) of each term -> take the top N terms by IG as feature terms -> compute each feature term's weight in every document -> build the document-term matrix -> normalize the term weights -> the feature terms and weights of every training document are now ready. (A small sketch of the IG step follows after this list.)
2. Matching (classification):
Fetch a new page and repeat the steps above through segmentation, then count how often this document's feature terms occur -> use a KNN-style comparison to compute the cosine similarity between the document to be classified and every training sample -> the training sample with the largest cosine is the closest one and determines the class; classification is done.
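As a rough illustration of the information-gain step in item 1, the minimal sketch below scores terms from a tiny word-document count matrix with two classes. All names here (information_gain, word_doc_counts, doc_labels) are hypothetical and are not part of the program further down; the sketch uses base-2 logs and the standard entropy-difference form of IG, whereas the program below uses base-10 logs and its own sign convention, so the absolute values differ.

import math

def information_gain(term_presence, doc_labels):
    # IG of a binary "term occurs in document" feature with respect to the class label.
    # term_presence: list of 0/1 per document; doc_labels: list of class ids per document.
    n = len(doc_labels)
    classes = set(doc_labels)

    def entropy(labels):
        # Shannon entropy (base-2) of the class distribution in `labels`.
        if not labels:
            return 0.0
        e = 0.0
        for c in classes:
            p = labels.count(c) / float(len(labels))
            if p > 0:
                e -= p * math.log(p, 2)
        return e

    with_term = [doc_labels[i] for i in range(n) if term_presence[i]]
    without_term = [doc_labels[i] for i in range(n) if not term_presence[i]]
    pw = len(with_term) / float(n)
    return entropy(doc_labels) - pw * entropy(with_term) - (1 - pw) * entropy(without_term)

# Hypothetical word-document count matrix: 4 documents, first two in class 0, last two in class 1.
word_doc_counts = {
    "account": [2, 1, 0, 0],   # appears only in class-0 documents
    "mouse":   [0, 0, 1, 2],   # appears only in class-1 documents
    "system":  [1, 1, 1, 1],   # appears everywhere, carries little class information
}
doc_labels = [0, 0, 1, 1]

for word, counts in word_doc_counts.items():
    presence = [1 if c > 0 else 0 for c in counts]
    print("%s %.3f" % (word, information_gain(presence, doc_labels)))

Terms that occur only in one class ("account", "mouse") get the highest IG and would be kept as features; a term spread evenly over both classes ("system") gets an IG of 0 and would be dropped.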
Text classification implemented with the KNN algorithm; the program and its output are shown below:
1. Program
# -*- coding: cp936 -*-
# Python 2 implementation: crawl a page, extract the Chinese text, segment it,
# build a word-document matrix and classify with a cosine-based KNN step.
import urllib2
import re
import sys
import os
import math

doc_no = 1  # global counter for the current training document number

def start_fun():
    # Switch the default encoding to UTF-8 (Python 2 only).
    reload(sys)
    sys.setdefaultencoding('utf-8')
    print sys.getdefaultencoding()

def testfun():
    # Small regex self-test: extract the host part of a URL.
    str_test = "http://hao123.com\" class is >china</b>"
    searchstr = re.findall(r'http://(.*?)\"', str_test)
    print searchstr[0]

def split_word(webpage_chinese_file, webpage_chinese_word_file):
    # Call the ICTCLAS command-line tool to segment the Chinese document
    # and tag every token with its part of speech.
    os.system('cd splitword && ictclas_demo_c.exe ../' + webpage_chinese_file + ' ../' + webpage_chinese_word_file)
def doc_word_count_deal(webpage_chinese_word_file, webpage_all_word_count_docno):
    global doc_no
    word_dicts = dict()
    # 1. Read the segmented Chinese text into memory.
    word_file = open(webpage_chinese_word_file, "r")
    word_buf = word_file.read()
    word_file.close()
    # 2. Split the tokens on spaces; each token looks like "word/pos". Extract the
    #    word and its part-of-speech tag, and skip tokens that are not nouns or whose
    #    word part is at most two bytes (a single GB2312 character).
    word_sets = word_buf.split(' ')
    for i in word_sets:
        if i == "":
            continue
        j = i.index('/')
        k = i[j + 1:j + 2]   # first letter of the POS tag
        i = i[0:j]           # the word itself
        if len(i) <= 2 or (k != None and k != 'n'):
            continue
        # word_dicts[i] would raise KeyError for a missing key, so use get().
        if word_dicts.get(i) == None:
            word_dicts[i] = 1
        else:
            word_dicts[i] = word_dicts[i] + 1
    # 3. Sort the remaining words by frequency (descending) and write
    #    "doc_no word count" lines to the output file.
    word_count = open(webpage_all_word_count_docno, 'w')
    word_dicts_list = sorted([(v, k) for k, v in word_dicts.items()], reverse=True)
    for i in word_dicts_list:
        print i[1], i[0]
        word_count.write(str(doc_no) + " " + i[1] + " " + str(i[0]) + "\n")
    word_count.close()
    doc_no = doc_no + 1
def get_webpage_china(url_addr, webpage_src_file, webpage_chinese_file):
    print sys.getdefaultencoding()
    # 1. Fetch the page content from the URL (the page is GB2312-encoded).
    url = url_addr
    content = urllib2.urlopen(url).read()
    # 2. Save the raw page source to a file.
    file = open(webpage_src_file, 'w')
    file.write(content)
    file.close()
    # 3. Extract the Chinese text: match runs of CJK characters in the decoded page.
    #    ((?<=>)[^a-zA-Z0-9_]+(?=<) would restrict matches to text between tags;
    #    here the simpler character-range pattern is applied to the whole page.)
    chinaeseStr = re.findall(ur"[\u4e00-\u9fa5]+", content.decode('GB2312'))
    # 4. Write the extracted Chinese text to a file, re-encoded as GB2312.
    file_chinese = open(webpage_chinese_file, 'w')
    for i in chinaeseStr:
        file_chinese.write(i.encode('GB2312'))
    file_chinese.close()
def KNN(c1, c2):
    list1 = []
    dict1 = dict()                  # word -> per-document count list
    ig_dict = dict()                # word -> information gain
    features_dict = dict()          # selected feature words
    weigh_dict = dict()             # word -> per-document weight list
    weigh_doc_matrix_dict = dict()  # document -> {word: weight}
    class_count = 2                 # number of classes
    doc_count = 3                   # document count used in the formulas below
    # Read the two "doc_no word count" files produced by doc_word_count_deal().
    c1_file = open(c1, "r")
    for line in c1_file:
        list1.append(line.split(" "))
    c1_file.close()
    c2_file = open(c2, "r")
    for line in c2_file:
        list1.append(line.split(" "))
    c2_file.close()
    # 1. Build the word-document count matrix, e.g.
    #    分辨率   [2, 2, 0]
    #    用户名   [1, 1, 0]
    #    鼠标     [0, 1]
    #    密码技术 [1, 1, 0]
    for i in list1:
        # i[0] = document number, i[1] = word, i[2] = count
        if dict1.get(i[1]) == None:
            list2 = [0]
            if i[2] == None:
                list2.insert((int)(i[0]) - 1, 0)
            else:
                list2.insert((int)(i[0]) - 1, (int)(i[2]))
            dict1[i[1]] = list2
        else:
            if i[2] == None:
                dict1[i[1]].insert((int)(i[0]) - 1, 0)
            else:
                dict1[i[1]].insert((int)(i[0]) - 1, (int)(i[2]))
    # 2. Compute the information gain (IG) of every word and store it in ig_dict.
    print "0、word-doc-count矩阵"
    for dict_cont in dict1.viewkeys():
        print dict_cont, dict1[dict_cont][0], dict1[dict_cont][1]
        t = class_count
        # Entropy term; the sign convention used here keeps the resulting IG values negative.
        entropy = class_count * 1 / class_count * (math.log(1, 10) - math.log(t, 10))
        wcount = 0                                      # number of documents containing the word
        category_doc_count = doc_count / class_count    # documents per class (integer division)
        wcount_class = [0 for i in range(class_count)]  # documents containing the word, per class
        pw = 0.0                                        # fraction of all documents containing the word
        pcw = [0 for i in range(class_count)]           # per-class document fraction when the word occurs
        pcw_b = [0 for i in range(class_count)]         # per-class document fraction when the word does not occur
        for i in range(0, class_count):
            for j in range(0, category_doc_count):
                if dict1[dict_cont][j + i * category_doc_count] > 0:
                    wcount_class[i] += 1
            wcount += wcount_class[i]
        print wcount, wcount_class
        pw = 1.0 * wcount / doc_count
        for i in range(0, class_count):
            pcw[i] = 1.0 * wcount_class[i] / wcount
            pcw_b[i] = 1.0 * (category_doc_count - wcount_class[i]) / (doc_count - wcount)
        d1 = 0.0
        d2 = 0.0
        for i in range(0, class_count):
            if pcw[i] == 0:
                t1 = 0
            else:
                t1 = math.log(pcw[i], 10)
            d1 += pcw[i] * t1
            if pcw_b[i] == 0:
                t2 = 0
            else:
                t2 = math.log(pcw_b[i], 10)
            d2 += pcw_b[i] * t2
        ig = entropy + pw * d1 + (1.0 - pw) * d2
        ig_dict[dict_cont] = ig
    # 3. Print the information gain of every word, then sort by IG and keep the top
    #    words as feature terms.
    print "1、word IG"
    for dict_cont in ig_dict.viewkeys():
        print dict_cont, ig_dict[dict_cont]
    print "1、sort word IG"
    word_dicts_list = sorted([(v, k) for k, v in ig_dict.items()], reverse=True)
    j = 1
    for i in word_dicts_list:
        print i[1], i[0]
        # Only the first 5 words become features; once j reaches 6 the loop keeps
        # printing the sorted list but adds nothing more.
        if j == 6:
            continue
        features_dict[i[1]] = i[0]
        j += 1
    print "2、feature values"
    for i in features_dict.viewkeys():
        print i, features_dict[i]
    # 4. Compute TF-IDF-style weights for the feature terms.
    for dict_cont in dict1.viewkeys():
        if features_dict.get(dict_cont) == None:
            continue
        tf = 0          # frequency of the feature term in the current document
        Ni = 0          # number of documents containing the feature term
        N = doc_count   # total number of documents
        for i in range(doc_count - 1):
            if dict1[dict_cont][i] > 0:
                Ni += 1
        for i in range(doc_count - 1):
            tf = dict1[dict_cont][i]
            # weight = tf * log(N / Ni), written with base-10 logs
            weight = -1.0 * tf * (math.log(1.0, 10) + math.log(Ni, 10) - math.log(N, 10))
            if weigh_dict.get(dict_cont) == None:
                weight_list = [0.0] * doc_count
                weigh_dict[dict_cont] = weight_list
                weigh_dict[dict_cont][i] = weight
            else:
                weigh_dict[dict_cont][i] = weight
    print "3、weight values"
    for j in range(doc_count - 1):
        for i in weigh_dict.viewkeys():
            print i, weigh_dict[i][0], weigh_dict[i][1]
            if weigh_doc_matrix_dict.get(j) == None:
                weith_doc_list = dict()
                weigh_doc_matrix_dict[j] = weith_doc_list
                weigh_doc_matrix_dict[j][i] = weigh_dict[i][j]
            else:
                weigh_doc_matrix_dict[j][i] = weigh_dict[i][j]
    print "4、归一化doc-word matrix"
    # Normalize each document's feature weights so they sum to 1.
    weight_sum = [0.0] * doc_count
    for i in weigh_doc_matrix_dict.viewkeys():
        for j in weigh_doc_matrix_dict[i].viewkeys():
            weight_sum[i] += weigh_doc_matrix_dict[i][j]
    test_doc_matrix_dict = dict()
    ttstr = ""
    ttstr1 = ""
    for i in weigh_doc_matrix_dict.viewkeys():
        ttstr = ""
        ttstr1 = ""
        print "doc num " + str(i)
        for j in weigh_doc_matrix_dict[i].viewkeys():
            weigh_doc_matrix_dict[i][j] /= weight_sum[i]
            ttstr += " " + str(weigh_doc_matrix_dict[i][j])
            ttstr1 += " " + j
            # The "new" sample: every feature term is assumed to occur once.
            test_doc_matrix_dict[j] = 1
        print ttstr1
        print ttstr
    # 5. Compare the new sample's feature-term counts with the two existing documents:
    #    the larger the cosine of the angle between the vectors, the closer the documents.
    res = 0.0
    mul = 0.0
    p1 = 0.0
    p2 = 0.0
    max_res = 0.0
    class_no = 0
    print "5、KNN分类算法对如下样本进行分类"
    for i in weigh_doc_matrix_dict.viewkeys():
        for j in weigh_doc_matrix_dict[i].viewkeys():
            one = weigh_doc_matrix_dict[i][j]
            two = test_doc_matrix_dict[j]
            print j, two
            mul += one * two
            p1 += math.pow(one, 2)
            p2 += math.pow(two, 2)
        # Note: mul, p1 and p2 are not reset between samples, so each res value
        # accumulates the contributions of all documents seen so far.
        res = math.fabs(mul) / math.sqrt(p1 * p2)
        if res > max_res:
            max_res = res
            class_no = i
        print "样本" + str(i) + "计算结果=" + str(res)
    print "最终分类编号为:" + str(class_no)
    """
    Java reference for the cosine computation:
    // Cosine of the angle between two vectors. The larger its absolute value,
    // the smaller the angle, i.e. the more similar (closer) the vectors are.
    public double cos(Vector<Double> v1, Vector<Double> v2, int len) {
        double res = 0.0;
        double mul = 0.0;
        double p1 = 0.0, p2 = 0.0;
        for (int i = 0; i < len; i++) {
            double one = v1.get(i);
            double two = v2.get(i);
            mul += one * two;
            p1 += Math.pow(one, 2);
            p2 += Math.pow(two, 2);
        }
        res = Math.abs(mul) / Math.sqrt(p1 * p2);
        return res;
    }
    """
def main():
    url_addr = 'http://192.168.1.170:8000/jsoa/CheckUser.jspx'
    webpage_src_file = 'webpage_src_file.txt'
    webpage_chinese_file = 'webpage_chinese_file.txt'
    webpage_chinese_word_file = 'webpage_chinese_word_file.txt'
    webpage_all_word_count_docno = 'webpage_all_word_count_docno.txt'
    webpage_all_word_count_docno1 = 'webpage_all_word_count_docno1.txt'
    webpage_all_word_count_docno2 = 'webpage_all_word_count_docno2.txt'
    get_webpage_china(url_addr, webpage_src_file, webpage_chinese_file)
    split_word(webpage_chinese_file, webpage_chinese_word_file)
    doc_word_count_deal(webpage_chinese_word_file, webpage_all_word_count_docno)
    KNN(webpage_all_word_count_docno1, webpage_all_word_count_docno2)

main()
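As noted in the comments above, the program accumulates mul, p1 and p2 across samples instead of resetting them for each training document, so its printed scores are running values. A per-sample cosine, in the spirit of the Java reference quoted inside KNN(), might look like the minimal sketch below; cos_similarity is a hypothetical helper and not part of the program above, and the example vectors are illustrative values rounded from the normalized document-term matrix in the output below.

import math

def cos_similarity(v1, v2):
    # Cosine of the angle between two equal-length vectors; the accumulators are
    # local, so every call starts from zero.
    mul = 0.0
    p1 = 0.0
    p2 = 0.0
    for one, two in zip(v1, v2):
        mul += one * two
        p1 += one * one
        p2 += two * two
    if p1 == 0.0 or p2 == 0.0:
        return 0.0
    return abs(mul) / math.sqrt(p1 * p2)

# Illustrative normalized training vectors and a test vector over the same 5 feature terms.
train_doc_0 = [0.0, 0.667, 0.0, 0.333, 0.0]
train_doc_1 = [0.243, 0.180, 0.243, 0.090, 0.243]
test_doc = [1, 1, 1, 1, 1]

scores = [cos_similarity(train_doc_0, test_doc), cos_similarity(train_doc_1, test_doc)]
print("closest training document: %d" % scores.index(max(scores)))

With a per-sample cosine the individual scores differ from the program's running values, but the winning document (and hence the assigned class) is the same here.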
2. Output
ascii
系统 5
账号 2
密码 2
分辨率 2
用户名 1
用户 1
软件 1
苹果 1
密码技术 1
宽度 1
火狐 1
0、word-doc-count矩阵
分辨率 2 2
2 [1, 1]
用户名 1 1
2 [1, 1]
鼠标 0 1
1 [0, 1]
密码技术 1 1
2 [1, 1]
密码 2 2
2 [1, 1]
账号 2 2
2 [1, 1]
电脑 0 1
1 [0, 1]
火狐 1 1
2 [1, 1]
系统 5 3
2 [1, 1]
苹果 1 1
2 [1, 1]
软件 1 1
2 [1, 1]
用户 1 1
2 [1, 1]
宽度 1 1
2 [1, 1]
键盘 0 1
1 [0, 1]
1、word IG
分辨率 -0.50171665944
用户 -0.50171665944
鼠标 -0.401373327552
密码技术 -0.50171665944
系统 -0.50171665944
账号 -0.50171665944
电脑 -0.401373327552
软件 -0.50171665944
密码 -0.50171665944
苹果 -0.50171665944
火狐 -0.50171665944
用户名 -0.50171665944
宽度 -0.50171665944
键盘 -0.401373327552
1、sort word IG
鼠标 -0.401373327552
键盘 -0.401373327552
电脑 -0.401373327552
账号 -0.50171665944
用户名 -0.50171665944
用户 -0.50171665944
系统 -0.50171665944
软件 -0.50171665944
苹果 -0.50171665944
密码技术 -0.50171665944
密码 -0.50171665944
宽度 -0.50171665944
火狐 -0.50171665944
分辨率 -0.50171665944
2、feature values
电脑 -0.401373327552
账号 -0.50171665944
鼠标 -0.401373327552
用户名 -0.50171665944
键盘 -0.401373327552
3、weight values
鼠标 0.0 0.47712125472
电脑 0.0 0.47712125472
账号 0.352182518111 0.352182518111
用户名 0.176091259056 0.176091259056
键盘 0.0 0.47712125472
鼠标 0.0 0.47712125472
电脑 0.0 0.47712125472
账号 0.352182518111 0.352182518111
用户名 0.176091259056 0.176091259056
键盘 0.0 0.47712125472
4、归一化doc-word matrix
doc num 0
电脑 账号 鼠标 用户名 键盘
0.0 0.666666666667 0.0 0.333333333333 0.0
doc num 1
电脑 账号 鼠标 用户名 键盘
0.24347423677 0.179718193127 0.24347423677 0.0898590965636 0.24347423677
5、KNN分类算法对如下样本进行分类
电脑 1
账号 1
鼠标 1
用户名 1
键盘 1
样本0计算结果=0.6
电脑 1
账号 1
鼠标 1
用户名 1
键盘 1
样本1计算结果=0.718992940348
最终分类编号为:1