利用python的sklearn开源包进行文本挖掘

从网站抓取训练样本数据,代码:

import urllib2
from BeautifulSoup import BeautifulSoup
import sys
import re
import time
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# Baidu News listing pages, one per news category. The position of each
# entry matters: the crawl loop pairs url[j] with the output file ff[j].
_LISTING_URL = 'http://news.baidu.com/n?cmd=4&class=%s&pn=1&from=tab'
_CATEGORY_KEYS = (
    'mil',
    'finannews',
    'internet',
    'housenews',
    'autonews',
    'sportnews',
    'enternews',
    'gamenews',
    'edunews',
    'healthnews',
    'technnews',
    'socianews',
)
url = [_LISTING_URL % key for key in _CATEGORY_KEYS]

# Output text files, one per news category. The position of each entry
# matters: the crawl loop appends the articles found via url[j] to ff[j].
_OUTPUT_FILE = 'E:/baidu/%s.txt'
_CATEGORY_LABELS = (
    '军事',
    '财经',
    '互联网',
    '房产',
    '汽车',
    '体育',
    '娱乐',
    '游戏',
    '教育',
    '女人',
    '科技',
    '社会',
)
ff = [_OUTPUT_FILE % label for label in _CATEGORY_LABELS]

# Crawl a category listing page and append the Chinese text of every linked
# article to that category's output file (url[j] -> ff[j]).
# NOTE: range(7,8) visits only index 7; widen the range to crawl more
# categories in one run.

# Matches any run of CJK characters; compiled once because the same pattern
# is applied to every article page.
chinese_text = re.compile(u"[\u4e00-\u9fa5]+")

for j in range(7,8):
    soup = BeautifulSoup(urllib2.urlopen(url[j]).read())
    main = soup.find('div', {'class': 'p2'})
    if main is None:
        # Without the listing container there are no article links to follow.
        sys.stderr.write('no <div class="p2"> found in %s, skipping\n' % url[j])
        continue

    # Anchors that carry no href cannot be followed, so drop them up front
    # instead of letting one of them abort the whole run.
    a = [link.get('href') for link in main.findAll('a')]
    a = [href for href in a if href]

    for href in a:
        try:
            soup = BeautifulSoup(urllib2.urlopen(href).read())
            # Keep only the text nodes that contain Chinese characters.
            txt = soup.findAll(text=chinese_text)
            txt_ = ''.join(txt)
            # Append mode: each article adds one line to the category file.
            # The text is encoded with the interpreter's default encoding,
            # which this script sets to utf-8 at start-up.
            with open(ff[j], 'a') as f:
                f.write(txt_ + '\n')
        except Exception as e:
            # Best-effort crawl: one bad article must not stop the rest, but
            # report it so failures are visible. KeyboardInterrupt and
            # SystemExit are deliberately not caught here.
            sys.stderr.write('skipped %s: %r\n' % (href, e))
            continue



有监督学习的文本分类代码:

import jieba
import os
import sys
import codecs
from sklearn import feature_extraction
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB

#--------------#
def _segment(lines):
    """Join *lines* with commas, tokenize with jieba, return space-separated tokens."""
    return " ".join(jieba.cut(','.join(lines), cut_all=False))


def load_data(train_dir='E:\\python_pananteng/程序6:文本挖掘/文本分类/实例2/train',
              test_dir='E:\\python_pananteng/程序6:文本挖掘/文本分类/实例2/test',
              chunks_per_file=100):
    """Load and word-segment the training and test corpora.

    Training: every file in *train_dir* is read as UTF-8 and its lines are
    split into *chunks_per_file* consecutive ranges. Each range becomes one
    sample labelled with the file's name. A file with fewer lines than
    *chunks_per_file* therefore yields some empty samples.

    Test: every file in *test_dir* is one sample. Its first line is the
    label and the remaining lines are the text. The label's trailing line
    terminator is stripped so that it can compare equal to a predicted class
    name (training labels are file names and contain no newline). Files with
    no lines at all are skipped because they carry no label.

    Args:
        train_dir: directory holding one text file per class.
        test_dir: directory holding one text file per test sample.
        chunks_per_file: number of samples cut from each training file.

    Returns:
        [[corpus_train, target_train], [corpus_test, target_test]] where each
        corpus entry is a string of space-separated tokens and each target
        entry is the label of the sample at the same position.
    """
    corpus_train = []
    target_train = []
    for filename in os.listdir(train_dir):
        with codecs.open(os.path.join(train_dir, filename), 'r', 'utf-8') as myfile:
            lines = myfile.readlines()
        total = len(lines)
        for i in range(chunks_per_file):
            # Floor division keeps the slice bounds integral on every
            # Python version; consecutive chunks tile the file exactly.
            start = i * total // chunks_per_file
            stop = (i + 1) * total // chunks_per_file
            target_train.append(filename)
            corpus_train.append(_segment(lines[start:stop]))

    corpus_test = []
    target_test = []
    for filename in os.listdir(test_dir):
        with open(os.path.join(test_dir, filename), 'r') as myfile:
            lines = myfile.readlines()
        if not lines:
            # An empty file has no label line to read.
            continue
        target_test.append(lines[0].rstrip('\r\n'))
        corpus_test.append(_segment(lines[1:]))

    return [[corpus_train, target_train], [corpus_test, target_test]]
#--------------#

def data_pro():
    """Turn the corpora from load_data() into dense TF-IDF feature matrices.

    The vocabulary and the IDF weights are learned from the training corpus
    only and then applied unchanged to the test corpus, so both matrices
    share the same columns.

    Returns:
        [[weight_train, target_train], [weight_test, target_test]] where
        weight_x[i][j] is the tf-idf weight of vocabulary term j in
        document i, and the targets are passed through from load_data().
    """
    [[corpus_train, target_train], [corpus_test, target_test]] = load_data()

    # Term counts: counts[i][j] is how often term j occurs in document i.
    # The fitted vectorizer is reused for the test corpus, which keeps the
    # training vocabulary without building a second vectorizer.
    vectorizer = CountVectorizer()
    counts_train = vectorizer.fit_transform(corpus_train)
    counts_test = vectorizer.transform(corpus_test)

    # IDF weights are fitted once on the training counts and reused for the
    # test counts; re-fitting on the same training data would give the same
    # weights.
    transformer = TfidfTransformer()
    tfidf_train = transformer.fit_transform(counts_train)
    tfidf_test = transformer.transform(counts_test)

    # Callers receive dense arrays rather than sparse matrices.
    weight_train = tfidf_train.toarray()
    weight_test = tfidf_test.toarray()
    return [[weight_train, target_train], [weight_test, target_test]]
#--------------#



# Build the TF-IDF feature matrices and labels for the train and test splits.
(weight_train, target_train), (weight_test, target_test) = data_pro()

# Each classifier below is fitted on the training split and then used to
# predict a label for every test sample.

# --- k-nearest neighbours ---
knnclf = KNeighborsClassifier()
knn_pred = knnclf.fit(weight_train, target_train).predict(weight_test)

# --- support vector machine with a linear kernel ---
svc = svm.SVC(kernel='linear')
svc_pred = svc.fit(weight_train, target_train).predict(weight_test)

# --- decision tree ---
tre = tree.DecisionTreeClassifier()
tre_pred = tre.fit(weight_train, target_train).predict(weight_test)

# --- multinomial naive Bayes ---
bayes = MultinomialNB(alpha=0.01)
bayes_pred = bayes.fit(weight_train, target_train).predict(weight_test)

调用两个开源库,分别是

1、结巴中文分词库,运用该库对网页抓取的中文文章进行分词

2、sklearn机器学习库,调用里面的算法有:tf-idf算法,将文本转换为特征数字矩阵;以及knn算法、svm算法、朴素贝叶斯(naive Bayes)算法、cart算法,这四个算法都是分类算法,作用是对网页抓取的文章进行有监督的分类学习


效果:

训练:样本 1001 个,其中有3类文章,

第一类,互联网类,样本数量 300

第二类,军事类,样本数量309

第三类,财经类,样本数量302

 

测试:150个测试样本 

 KNN算法,命中118个,错误32

SVM算法,命中125个,错误25

CART算法,命中122个,错误28

Bayes算法,命中130个,错误20





你可能感兴趣的:(数据挖掘(python))