libsvm 数据预处理 模块化程序

 
 
 
 
实验框架图见《libsvm文本分类:二分类(二)》中的实验框架图。下面是主模块代码,暂不公布全部代码。
libsvm 数据预处理 模块化程序 代码
# -*- coding: cp936 -*-
# coding gb2312
from  SVM  import  FoldersCreation
import  os
# ---------------------------------------------------------------------------
# Experiment parameters
# ---------------------------------------------------------------------------
N = 100  # N: half of total corpus size
vfold = 5  # vfold: number of cross-validation rounds
featureDimension = 2000  # featureDimension: feature dimension of the VSM model
# Whether to compute the information gain of the bag-of-words vocabulary.
# Per the original note, a value of 1 means the computation is skipped.
toCalInfoGain = 0
# Number of experiments that have already been run.
# NOTE(review): the value was missing in the published listing (which made the
# line a syntax error); 0 is assumed here -- confirm before running.
count_done_research_times = 0
#
# N and count_done_research_times are the arguments of
# CorpusPartition.moveAccordingPartition.
# featureDimension, toCalInfoGain and 2*N/vfold are the arguments of
# FeatureSelectionModel.featureSelectionIG.
# ---------------------------------------------------------------------------

# ---- Create the working folders --------------------------------------------
# Create the root working directory only when it is missing, so that running
# the script again does not fail inside os.mkdir.
work_root = r'D:\TextCategorization'
if not os.path.isdir(work_root):
    os.mkdir(work_root)
# Project helper; it is called right after the root directory is created.
FoldersCreation.CreateAssist()
print('创建文件夹模块运行结束')
print('***************************************************************************')
# -----------------------------------------------------------------------------

# ---- Process the corpus: split it into a training set and a testing set ----
from SVM import CorpusPartition

# Both calls are project helpers; N and count_done_research_times come from
# the parameter section at the top of the script.
CorpusPartition.MoveCorpus(N)
CorpusPartition.moveAccordingPartition(N, count_done_research_times)
print('分割文本集模块运行结束')
print('*******************************************************************')
# -----------------------------------------------------------------------------


# ---- Word-segment every document of the training and testing sets ----------
from SVM import DataManager
from ctypes import *
import os
import cPickle as p
import re

# Source roots and, at the same index, the roots that receive the segmented
# copies of their files.
roots = [r'D:\TextCategorization\training', r'D:\TextCategorization\testing']
rootfinals = [r'D:\TextCategorization\segmented', r'D:\TextCategorization\tsegmented']

# Load and initialise the ICTCLAS segmenter once for the whole run instead of
# once per file; it is released in the `finally` clause below.
dll = cdll.LoadLibrary("ICTCLAS30.dll")
# The result of ICTCLAS_ParagraphProcess is read as a C string. Declaring the
# return type lets ctypes return the text directly instead of passing the
# pointer through the default C int return type.
dll.ICTCLAS_ParagraphProcess.restype = c_char_p
dll.ICTCLAS_Init(c_char_p("."))
try:
    for i in range(0, 2):
        dm = DataManager.DataManager(roots[i])
        subdir = dm.GetSubDir()

        # Collect every file as a "<sub-directory><sep><file>" path relative
        # to the current root.
        filepathstotalsrc = []
        for sub in subdir:
            dm.SetFilePathsFromsubDir(roots[i] + os.sep + sub)
            filepaths = dm.GetFilePaths()
            filepathstotalsrc.extend(sub + os.sep + path for path in filepaths)

        for path in filepathstotalsrc:
            with open(roots[i] + os.sep + path) as myfile:
                s = myfile.read()
            segmented = dll.ICTCLAS_ParagraphProcess(c_char_p(s), 0)
            # Collapse each run of whitespace into a single '|' separator.
            segmentedtmp = re.sub(r"\s+", '|', segmented, 0)
            # Drop the byte pair A1 A1 (presumably the full-width space in the
            # cp936/GB2312 encoding declared for this script -- confirm).
            segmentedfinal = re.sub('\xa1\xa1', '', segmentedtmp)
            # The output keeps the same relative path under the target root.
            with open(rootfinals[i] + os.sep + path, 'w') as fid:
                fid.write(segmentedfinal)
finally:
    dll.ICTCLAS_Exit()

print('文档集分词模块运行结束')
print('**********************************************************************')
# -----------------------------------------------------------------------------

# ---- Build the bag-of-words model ------------------------------------------
from SVM import BagOfWordsConstruction

# Project helper, given the directory that holds the segmented training files.
BagOfWordsConstruction.BagOfWordsConstruction(r'D:\TextCategorization\segmented')
print('建立词袋子模型模块运行结束')
print('***********************************************************************************')
# -----------------------------------------------------------------------------

# ---- Select the feature words ----------------------------------------------
from SVM import FeatureSelectionModel
import cPickle as mypickle

# `//` keeps the third argument an integer; for the int parameters used here
# it gives the same value as the Python 2 `/` of the original listing.
featurewords = FeatureSelectionModel.featureSelectionIG(
    featureDimension, toCalInfoGain, 2 * N // vfold)
# Persist the selected feature words with pickle; the `with` block closes the
# file even if the dump fails.
with open(r'D:\TextCategorization\VITData\keywords.dat', 'w') as fid:
    mypickle.dump(featurewords, fid)
print('特征词选择模块运行结束')
print('*******************************************************************************************')
# -----------------------------------------------------------------------------

# ---- Build the document vector model files ---------------------------------
from SVM import VSMformation

root1 = r'D:\TextCategorization\segmented'   # segmented training documents
root2 = r'D:\TextCategorization\tsegmented'  # segmented testing documents
print('begin.....')
# Project helper: called with the output file path first, then the directory
# of segmented documents to convert.
VSMformation.LibSVMFormat(r'D:\TextCategorization\data\train.libsvm', root1)
print('训练语料库转化完毕')
VSMformation.LibSVMFormat(r'D:\TextCategorization\data\test.libsvm', root2)
print('测试语料库转化完毕')
print('文档向量模型建立模块运行结束')
print('批处理完毕,congratulations!')

 

你可能感兴趣的:(lib)