使用jieba库分词并提取字词特征,再使用xgboost模型解决新闻多分类问题
新闻总共分为8类,数据结构大致如下:
导入训练、测试数据
数据下载
# Load the tab-separated, header-less training and test tables.
train = pd.read_csv("true_train.csv",encoding="utf-8",sep="\t",header=None)
# The original bound this frame to the misspelled name `tset`, so the
# `test.iloc` line below raised NameError.
test = pd.read_csv("true_test.csv",encoding="utf-8",sep="\t",header=None)
# Drop the first column of each table.
train=train.iloc[:,1:].copy()
test=test.iloc[:,1:].copy()
# iloc slicing keeps the old integer labels (1, 2, ...), but the rest of the
# script addresses the category as column 0 and the text as column 1, so
# renumber the remaining columns from 0.
train.columns = list(range(train.shape[1]))
test.columns = list(range(test.shape[1]))
将一段话切开
jieba教程点这里
def get_words(n,train):
    """Return the distinct jieba tokens of the text in the n-th row of ``train``.

    ``n`` is 1-based: the row at position ``n - 1`` is read, and its text is
    taken from the column labelled ``1``.  The text is segmented with jieba's
    full mode (``cut_all=True``) and duplicates are dropped through a set, so
    the order of the returned list is not meaningful.
    """
    row = train.iloc[n - 1:n, :]
    text = row[1].values[0]
    tokens = jieba.cut(text, cut_all=True)  # full mode
    return list(set(tokens))
def get_words_line(n,train):
    """Return the text of the n-th row of ``train`` as a ``str``.

    ``n`` is 1-based: the row at position ``n - 1`` is read, and the value is
    taken from the column labelled ``1``.
    """
    row = train.iloc[n - 1:n, :]
    return str(row[1].values[0])
将字典转化为列表:
def dict2list(dic:dict):
    """Return the items of ``dic`` as a list of ``(key, value)`` tuples.

    The pairs appear in the dictionary's own iteration order.
    """
    return list(dic.items())
数据处理:
class line_word():
    """Per-category word and text-length statistics over a labelled table.

    ``data`` is a pandas DataFrame whose column ``0`` holds the category
    label (compared against the entries of ``kind_list``) and whose column
    ``1`` holds the text.

    ``get_1`` .. ``get_8`` return word counts for ``kind_list[0]`` ..
    ``kind_list[7]``; ``get_11`` .. ``get_88`` return text-length statistics
    for the same categories.
    """

    kind_list = ['体育', '汽车', '军事', '科技', '娱乐', '财经', '旅游', '社会']

    def __init__(self,data):
        self.data =data

    def _rows_of_kind(self, kind_index):
        """Return the rows labelled ``kind_list[kind_index]``, re-indexed from 0."""
        selected = self.data.loc[self.data[0] == line_word.kind_list[kind_index]]
        return selected.reset_index(drop=True)

    def _word_counts(self, kind_index):
        """Return ``(word, count)`` pairs for one category, highest count first.

        Each row contributes the tokens returned by ``get_words``; since that
        helper de-duplicates per row, a count is the number of rows of the
        category in which the word occurs.
        """
        rows = self._rows_of_kind(kind_index)
        tally = Counter()
        for position in range(1, len(rows) + 1):
            tally.update(get_words(position, rows))
        # most_common() sorts the items by count, descending, with a stable sort.
        return tally.most_common()

    def _length_stats(self, kind_index):
        """Return ``(average text length, [length of each text])`` for one category.

        Lengths are measured on ``str(value)`` of column ``1``.  When the
        category has no rows the average is reported as ``0.0`` instead of
        dividing by zero.
        """
        rows = self._rows_of_kind(kind_index)
        lengths = [len(str(text)) for text in rows[1].values]
        if not lengths:
            return 0.0, lengths
        # The summed lengths equal the length of all texts concatenated,
        # without building that string.
        return sum(lengths) / len(lengths), lengths

    def get_1(self):
        """Word counts for ``kind_list[0]``."""
        return self._word_counts(0)

    def get_11(self):
        """Text-length statistics for ``kind_list[0]``."""
        return self._length_stats(0)

    def get_2(self):
        """Word counts for ``kind_list[1]``."""
        return self._word_counts(1)

    def get_22(self):
        """Text-length statistics for ``kind_list[1]``."""
        return self._length_stats(1)

    def get_3(self):
        """Word counts for ``kind_list[2]``."""
        return self._word_counts(2)

    def get_33(self):
        """Text-length statistics for ``kind_list[2]``."""
        return self._length_stats(2)

    def get_4(self):
        """Word counts for ``kind_list[3]``."""
        return self._word_counts(3)

    def get_44(self):
        """Text-length statistics for ``kind_list[3]``."""
        return self._length_stats(3)

    def get_5(self):
        """Word counts for ``kind_list[4]``."""
        return self._word_counts(4)

    def get_55(self):
        """Text-length statistics for ``kind_list[4]``."""
        return self._length_stats(4)

    def get_6(self):
        """Word counts for ``kind_list[5]``."""
        return self._word_counts(5)

    def get_66(self):
        """Text-length statistics for ``kind_list[5]``."""
        return self._length_stats(5)

    def get_7(self):
        """Word counts for ``kind_list[6]``."""
        return self._word_counts(6)

    def get_77(self):
        """Text-length statistics for ``kind_list[6]``."""
        return self._length_stats(6)

    def get_8(self):
        """Word counts for ``kind_list[7]``."""
        return self._word_counts(7)

    def get_88(self):
        """Text-length statistics for ``kind_list[7]``."""
        return self._length_stats(7)
得到数据:
def deal_1(dic):
    """Drop every entry whose key is at most one character long.

    ``dic`` is a list of ``(word, count)`` pairs.  The list is filtered in
    place, keeping the order of the surviving entries, and the same list
    object is returned.
    """
    # Slice assignment keeps the caller's list object while filtering in one
    # pass, instead of calling list.remove() once per dropped entry.
    dic[:] = [entry for entry in dic if len(entry[0]) > 1]
    return dic
def give_power(f1,f2,f3,f4,f5,f6,f7,f8):
    """Pack the eight values into a 1-D ``float64`` NumPy array, in argument order."""
    return np.array((f1, f2, f3, f4, f5, f6, f7, f8), dtype=np.float64)
def get_word_power():
    """Collect the per-category word counts of ``train`` into one long table.

    For each of the eight categories the ``(word, count)`` pairs returned by
    ``line_word.get_1`` .. ``get_8`` are filtered through ``deal_1`` and
    appended in category order.

    Returns a DataFrame with the columns ``words`` (the word), ``values``
    (its count within the category) and ``kind`` (the category's position,
    0-7, matching ``get_1`` .. ``get_8``).  Reads the module-level ``train``.
    """
    stats = line_word(train)
    getters = (
        stats.get_1, stats.get_2, stats.get_3, stats.get_4,
        stats.get_5, stats.get_6, stats.get_7, stats.get_8,
    )
    all_key = []
    all_values = []
    all_kind = []
    for kind_index, getter in enumerate(getters):
        for word, count in deal_1(getter()):
            all_key.append(word)
            all_values.append(count)
            all_kind.append(kind_index)
    return pd.DataFrame({"words": all_key,"values": all_values,"kind": all_kind})
def hanshu_1(df):
    """Return the variance of all values in the first eight columns of ``df``.

    The columns are selected by position, converted to ``float32`` and
    reduced to a single variance over every element.
    """
    leading = np.asarray(df.iloc[:, :8], dtype=np.float32)
    return leading.var()
def hanshu_2(df):
    """Return the variance of all values in the columns of ``df`` from position 12 on.

    The columns are selected by position, converted to ``float32`` and
    reduced to a single variance over every element.
    """
    trailing = np.asarray(df.iloc[:, 12:], dtype=np.float32)
    return trailing.var()
def get_word_Data():
    """Build the per-word statistics table from ``get_word_power()``.

    The long (word, kind, count) table is pivoted to one row per word with
    one count column per kind (integer column labels), missing counts filled
    with 0.  The following columns are then appended, in this order:
    ``words``, ``sum_values``, ``mean_values``, ``var`` (``hanshu_1`` per
    word), one ``<kind>比例`` ratio column per entry of
    ``line_word.kind_list``, ``p_var`` (``hanshu_2`` per word) and ``var*p``.

    The result is sorted by ``var`` in descending order and re-indexed from 0.
    The statement order matters: ``hanshu_1`` and ``hanshu_2`` select columns
    by position.
    """
    pivot = get_word_power().groupby(["words", "kind"], as_index=True).sum().unstack()
    table = pd.DataFrame(pivot.values)
    table["words"] = pivot.index
    table = table.fillna(0)
    table["sum_values"] = table.iloc[:, :8].sum(axis=1)
    table["mean_values"] = table.iloc[:, :8].mean(axis=1)
    table = table.sort_values(["sum_values"], ascending=False)

    # Variance of the eight count columns, one value per word.
    count_var = table.groupby(["words"]).apply(hanshu_1)
    count_var_frame = pd.DataFrame({"words": count_var.index.tolist(), "var": count_var.tolist()})
    table = pd.merge(table, count_var_frame, how="left", on=["words"])
    table = table.sort_values(["var"], ascending=False)

    # Share of each kind in the word's total count.
    for column, kind in enumerate(line_word.kind_list):
        table[kind + "比例"] = table[column] / table["sum_values"]

    # Variance of the columns from position 12 on (the ratio columns just added).
    ratio_var = table.groupby(["words"]).apply(hanshu_2)
    ratio_var_frame = pd.DataFrame({"words": ratio_var.index.tolist(), "p_var": ratio_var.tolist()})
    table = pd.merge(table, ratio_var_frame, how="left", on=["words"])
    table = table.sort_values(["var"], ascending=False)

    table["var*p"] = table["var"] * table["p_var"]
    table.index = list(range(len(table)))
    return table
# Build the per-word statistics table once and keep it as a module-level name.
all_all_all = get_word_Data()
得到每个词语的特征
提取特征
得到每篇新闻中出现频率最高的前30个词的特征:这些词在8个类别中的数量和比例,以及数量的方差、比例的方差等。
class analysis_text_one():
def __init__(self,data,i,all_data):
self.data = data.iloc[i:i+1,:][1].values[0]
self.target = data.iloc[i:i+1,:][0].values[0]
self.all_all_all = all_data
def deal_line(self):
seg_list = jieba.cut(self.data,cut_all=True)#全模式
z= list(seg_list)
zz = Counter(z)
zzz = sorted(dict2list(zz), key=lambda x:x[1], reverse=True)
zzzz = deal_1(zzz)
zzzzz = zzzz[:30]
ttt = zzzzz[0][0]
gg = self.all_all_all.loc[self.all_all_all["words"]==zzzzz[0][0]].loc[:,self.all_all_all.columns[10:].tolist()]
gg_v = gg.values
for i in range(1,30):
if i
使用xgboost模型分类
#-*- coding=utf-8 -*-
import pandas as pd
import numpy as np
import datetime
import xgboost as xgb
from sklearn.cross_validation import train_test_split
import os
# Booster hyper-parameters (see the XGBoost parameter reference).
params={
    'booster':'gbtree',
    # The news corpus has 8 categories, so the multi-class softmax objective
    # is used.
    'objective': 'multi:softmax',
    'num_class':8, # number of classes; used together with multi:softmax
    'gamma':0.05, # minimum loss reduction needed to split a leaf further; larger is more conservative. [0:]
    'max_depth':12, # maximum tree depth [1:]
    #'lambda':450, # L2 regularisation weight
    'subsample':0.4, # fraction of the training rows sampled for each tree (0:1]
    'colsample_bytree':0.7, # fraction of the columns sampled for each tree (0:1]
    #'min_child_weight':12, # minimum sum of instance weight needed in a child
    'silent':1 ,
    'eta': 0.01, # learning rate
    'seed':710,
    'nthread':4,# number of CPU threads; adjust to the machine
}
plst = list(params.items())

# Row split of train_feature:
#   [0, offset1)        -> training set
#   [offset1, offset2)  -> validation set watched for early stopping
#   [offset2, end)      -> held-out rows that are only predicted
offset1 = 8000
offset2 = 10000
num_rounds = 368 # maximum number of boosting rounds

xgtest = xgb.DMatrix(train_feature[offset2:])
# Training and validation matrices.
xgtrain = xgb.DMatrix(train_feature[:offset1,:], label=train_lables[:offset1])
xgval = xgb.DMatrix(train_feature[offset1:offset2,:], label=train_lables[offset1:offset2])

# Evaluation sets whose metric is reported every round; early stopping
# watches the last entry.
watchlist = [(xgtrain, 'train'),(xgval, 'val')]

# Train the model.  early_stopping_rounds stops training when the validation
# metric has not improved for that many consecutive rounds.
model = xgb.train(plst, xgtrain, num_rounds, watchlist,early_stopping_rounds=100)
#model.save_model('./model/xgb.model') # persist the trained model

# best_iteration is 0-based, so passing it as ntree_limit would leave out the
# best round itself; best_ntree_limit is the matching tree count.
preds = model.predict(xgtest,ntree_limit=model.best_ntree_limit)
print(preds)