excel 格式
excel.py
# -*- coding: utf-8 -*-
import xdrlib ,sys
import xlrd
import json
def open_excel(file='/home/lhy/data/data.xlsx'):
    """Open an Excel workbook with xlrd, reporting failures instead of raising.

    Args:
        file: Path of the workbook to open.

    Returns:
        The object returned by ``xlrd.open_workbook(file)``, or ``None`` when
        opening raised an exception (the exception text is printed).
    """
    try:
        return xlrd.open_workbook(file)
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and let the
        # caller see None rather than propagating the exception.
        print(str(e))
        return None
def excel_table_byindex(file='/home/lhy/data/data.xlsx', colnameindex=0, by_index=0):
    """Read one sheet, selected by position, as a list of row dictionaries.

    Args:
        file: Path of the Excel workbook.
        colnameindex: Index of the row that holds the column names.
        by_index: Position of the sheet inside the workbook.

    Returns:
        A list with one dict per data row, mapping each column name to the
        value of that column in the row.

    Raises:
        AttributeError: If ``open_excel`` could not open the workbook and
            returned ``None``.
    """
    data = open_excel(file)
    table = data.sheets()[by_index]
    colnames = table.row_values(colnameindex)  # header row
    records = []
    # Data starts right after the header row. The previous version always
    # started at row 1, which returned the header itself as data whenever
    # colnameindex was greater than 0.
    for rownum in range(colnameindex + 1, table.nrows):
        row = table.row_values(rownum)
        if row:
            records.append(dict(zip(colnames, row)))
    return records
def excel_table_byname(file='/home/lhy/data/data.xlsx', colnameindex=0, by_name=u'word'):
    """Read one sheet, selected by name, as a list of row dictionaries.

    Args:
        file: Path of the Excel workbook.
        colnameindex: Index of the row that holds the column names.
        by_name: Name of the sheet to read.

    Returns:
        A list with one dict per data row, mapping each column name to the
        value of that column in the row.

    Raises:
        AttributeError: If ``open_excel`` could not open the workbook and
            returned ``None``.
    """
    data = open_excel(file)
    table = data.sheet_by_name(by_name)
    colnames = table.row_values(colnameindex)  # header row
    records = []
    # Data starts right after the header row. The previous version always
    # started at row 1, which returned the header itself as data whenever
    # colnameindex was greater than 0.
    for rownum in range(colnameindex + 1, table.nrows):
        row = table.row_values(rownum)
        if row:
            records.append(dict(zip(colnames, row)))
    return records
def main():
    """Print the question cell (key u'问题') of every row as JSON text."""
    for record in excel_table_byindex():
        question = record[u'问题']
        # A single-argument print(...) behaves like the print statement here.
        print(json.dumps(question, encoding="UTF-8", ensure_ascii=False))


if __name__ == "__main__":
    main()
TextFenci.py
# -*- coding: UTF-8 -*-
import jieba.posseg as pseg
import excel
import json
def getWordXL():
    """Segment the question text of every spreadsheet row with jieba.

    Returns:
        A tuple ``(segmented, rows)``. ``segmented`` holds one string per row:
        the row's words, each followed by a single space, concatenated in
        order. ``rows`` is the list of row dicts returned by
        ``excel.excel_table_byindex()`` (the original documents).
    """
    rows = excel.excel_table_byindex()
    segmented = []
    for row in rows:
        tokens = pseg.cut(row[u'问题'])
        # Each word keeps its own trailing space, so every entry has the same
        # format as before, including the final trailing space.
        segmented.append("".join(token.word + " " for token in tokens))
    return segmented, rows
def main():
    """Print a marker line, then the value returned by getWordXL() as JSON."""
    result = getWordXL()
    print("1234")
    print(json.dumps(result, encoding="UTF-8", ensure_ascii=False))


if __name__ == "__main__":
    main()
TF_IDF.py
# -*- coding: UTF-8 -*-
import jieba.posseg as pseg
import excel
import json
def getWordXL():
    """Segment the question text of every spreadsheet row with jieba.

    Returns:
        A tuple ``(segmented, rows)``. ``segmented`` holds one string per row:
        the row's words, each followed by a single space, concatenated in
        order. ``rows`` is the list of row dicts returned by
        ``excel.excel_table_byindex()`` (the original documents).
    """
    rows = excel.excel_table_byindex()
    segmented = []
    for row in rows:
        tokens = pseg.cut(row[u'问题'])
        # Each word keeps its own trailing space, so every entry has the same
        # format as before, including the final trailing space.
        segmented.append("".join(token.word + " " for token in tokens))
    return segmented, rows
def main():
    """Print a marker line, then the value returned by getWordXL() as JSON."""
    result = getWordXL()
    print("1234")
    print(json.dumps(result, encoding="UTF-8", ensure_ascii=False))


if __name__ == "__main__":
    main()
KMeans.py
# -*- coding: utf-8 -*-
from sklearn.cluster import KMeans
import TF_IDF
import json,sys
# Reload sys and switch the interpreter's default string encoding to UTF-8
# (reload() as a builtin and sys.setdefaultencoding are Python 2 usage).
reload(sys)
sys.setdefaultencoding('utf-8')
# Module-level inputs for getCU: `weight` is passed to KMeans.fit and
# `textList` is indexed by sample position when grouping texts by cluster.
# NOTE(review): the TF_IDF.py listing visible in this file defines only
# getWordXL and main; getTFIDF is not shown there -- confirm it exists.
weight, textList = TF_IDF.getTFIDF()
def getCU(leibieNum, out_path="/home/lhy/data/wbjl.txt"):
    """Cluster the module-level ``weight`` rows with KMeans and dump the groups.

    The texts in the module-level ``textList`` are grouped by the cluster label
    of the sample at the same position. Every group is written to ``out_path``
    and echoed to stdout, followed by the fitted model's ``inertia_``.

    Args:
        leibieNum: Number of clusters to fit.
        out_path: File that receives the grouped texts; it is overwritten.
            Defaults to the path that used to be hard-coded.
    """
    clf = KMeans(n_clusters=leibieNum)
    clf.fit(weight)

    # One bucket per cluster id.
    clusters = [[] for _ in range(leibieNum)]
    for i, label in enumerate(clf.labels_):
        # Pair text i with the label of sample i. The previous code used
        # labels_[i - 1], which shifted every label by one position and gave
        # sample 0 the label of the last sample.
        try:
            clusters[label].append(textList[i])
        except Exception:
            # Best effort: report the sample that could not be placed and go on.
            print("#######错误:" + str(label) + " " + str(i))

    # `with` closes the file even if writing or JSON encoding fails midway.
    with open(out_path, "wb") as fo:
        for index, members in enumerate(clusters):
            fo.write("\n#############################第" + str(index) + "个分类##################\n")
            print("")
            print("#############################第" + str(index) + "个分类##################")
            print("")
            for text in members:
                thisword = json.dumps(text, encoding="UTF-8", ensure_ascii=False)
                fo.write(thisword + "\n")
                print(thisword)

    # inertia_ is printed to help judge whether the cluster count is suitable:
    # smaller means tighter clusters; pick the count at the turning point.
    print("############评估因子大小,用来评估簇的个数是否合适,距离越小说明簇分的越好,选取临界点的簇个数:类别" + str(leibieNum) + " 因子" + str(clf.inertia_))
# Run the clustering once with 300 clusters as soon as this module executes.
getCU(300)
# The bare string literal below is disabled code: a sweep that would call
# getCU for cluster counts 100, 110, ..., 990. As a string expression it has
# no effect at runtime.
'''for index in range(100,1000,10):
getCU(index)
'''