request 5:
1)使用自己感兴趣的DL模型对赛题进行研究(例如:RCNN/fasttext/textCNN等),学习理论并用于实践;
2)线上提交结果;
要求:至少完成RCNN代码 + fasttext/textCNN中的一个
RCNN
由于电脑内存不足(爆 memory),读取数据时会报错;可先用以下代码调大 csv 模块的字段长度上限,再读取数据
import sys,csv
import pandas as pd
# Work around: _csv.Error: field larger than field limit (131072)
# Start from sys.maxsize and shrink the candidate tenfold for as long as
# csv.field_size_limit() rejects it with OverflowError.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        # Integer floor division keeps the value exact (no float rounding).
        maxInt //= 10
from sklearn.model_selection import train_test_split

# --- Paths ---
data_path = './study/new_data/'
feature_path = './study/feature/feature_file/'
proba_path = './study/proba/proba_file/'
model_path = './study/model/model_file/'
result_path = "./study/result/"

# Sequence length used when padding, and width of the embedding vectors.
doc_len = 2000
embedding_dim = 300

print('read data')
df_train = pd.read_csv(data_path + 'train_set.csv', engine='python', encoding='gbk')
df_test = pd.read_csv(data_path + 'test_set.csv', engine='python', encoding='gbk')
print(df_train.shape)

# Drop the first column of the training frame in place.
df_train.drop(df_train.columns[0], axis=1, inplace=True)
word_seg = df_train['word_seg']
# Shift class ids down by one (the prediction step adds the one back).
label = df_train['class'] - 1

# Hold out 10% of the training rows for validation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    word_seg, label, test_size=0.1, random_state=42)
from sklearn import preprocessing

print('embedding')
# Fit the encoder on the distinct labels that occur in the training split.
y_labels = list(y_train.value_counts().index)
le = preprocessing.LabelEncoder()
le.fit(y_labels)
num_labels = len(y_labels)

# Encode each split with a single vectorised transform (instead of one
# transform call per row), then one-hot encode to num_labels columns.
# NOTE(review): to_categorical is not imported in the visible source;
# presumably keras.utils.to_categorical - confirm.
y_train = to_categorical(le.transform(y_train), num_labels)
y_test = to_categorical(le.transform(y_test), num_labels)
# NOTE(review): Tokenizer and pad_sequences are not imported in the visible
# source; presumably from keras.preprocessing - confirm.
tokenizer = Tokenizer(split=' ')
tokenizer.fit_on_texts(word_seg)
vocab = tokenizer.word_index
# Sample of the resulting token -> index mapping:
# {'520477': 1,
#  '816903': 2,
#  '1033823': 3,
#  '995362': 4,
#  '920327': 5,
#  '834740': 6,
#  '460600': 7,
#  '54111': 8,
#  ...}


def _shift_contexts(sequences, boundary_id):
    """Return (left, right) context id lists for each sequence.

    The left context is the sequence shifted right by one position with
    ``boundary_id`` prepended; the right context is the sequence shifted
    left by one position with ``boundary_id`` appended.
    """
    left = [[boundary_id] + seq[:-1] for seq in sequences]
    right = [seq[1:] + [boundary_id] for seq in sequences]
    return left, right


X_train_word_ids = tokenizer.texts_to_sequences(X_train)
X_test_word_ids = tokenizer.texts_to_sequences(X_test)
X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=doc_len)
X_test_padded_seqs = pad_sequences(X_test_word_ids, maxlen=doc_len)

# NOTE(review): the sample mapping above starts at index 1, so len(vocab)
# looks like it is also the index of the last vocabulary entry; the boundary
# marker would then share an embedding row with that token - confirm whether
# a dedicated id is wanted.
left_train_word_ids, right_train_word_ids = _shift_contexts(X_train_word_ids, len(vocab))
left_test_word_ids, right_test_word_ids = _shift_contexts(X_test_word_ids, len(vocab))

left_train_padded_seqs = pad_sequences(left_train_word_ids, maxlen=doc_len)
left_test_padded_seqs = pad_sequences(left_test_word_ids, maxlen=doc_len)
right_train_padded_seqs = pad_sequences(right_train_word_ids, maxlen=doc_len)
right_test_padded_seqs = pad_sequences(right_test_word_ids, maxlen=doc_len)
# NOTE(review): Input, Embedding, LSTM, concatenate, TimeDistributed, Dense,
# Lambda, Model and backend are not imported in the visible source;
# presumably from keras - confirm.

# Three integer inputs of length doc_len: the document itself plus its
# left-shifted and right-shifted context sequences.
document = Input(shape=(doc_len,), dtype="int32")
left_context = Input(shape=(doc_len,), dtype="int32")
right_context = Input(shape=(doc_len,), dtype="int32")

# A single embedding layer is applied to all three inputs.
embedder = Embedding(len(vocab) + 1, embedding_dim, input_length=doc_len)
doc_embedding = embedder(document)
l_embedding = embedder(left_context)
r_embedding = embedder(right_context)

print('model')
forward = LSTM(256, return_sequences=True)(l_embedding)  # equation (1)
backward = LSTM(256, return_sequences=True, go_backwards=True)(r_embedding)  # equation (2)
# With go_backwards=True the output sequence comes back in reversed time
# order; flip it so that step t of `backward` lines up with step t of
# `forward` and `doc_embedding` before concatenating.
backward = Lambda(lambda x: backend.reverse(x, axes=1))(backward)
together = concatenate([forward, doc_embedding, backward], axis=2)  # equation (3)
semantic = TimeDistributed(Dense(128, activation="tanh"))(together)  # equation (4)
# Max over the time axis.
pool_rnn = Lambda(lambda x: backend.max(x, axis=1), output_shape=(128,))(semantic)  # equation (5)
# Output width follows the one-hot targets (num_labels columns) instead of a
# hard-coded 19, so the model still matches when a class is absent.
output = Dense(num_labels, activation="softmax")(pool_rnn)  # equations (6) and (7)

model = Model(inputs=[document, left_context, right_context], outputs=output)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit([X_train_padded_seqs, left_train_padded_seqs, right_train_padded_seqs], y_train,
          batch_size=32,
          epochs=1,
          validation_data=([X_test_padded_seqs, left_test_padded_seqs, right_test_padded_seqs], y_test))
# NOTE(review): the file name says "textcnn" although this model is the RCNN;
# kept unchanged in case something else loads it by this name.
model.save(model_path + 'textcnn.h5')
电脑太慢,跑不出结果,因此尝试用 Google Colab 跑 DL 模型
新增app 文件夹
step1: upload train_set.csv,test_set.csv to gdrive
step2: read gdrive data
from google.colab import drive
import pandas as pd

# Mounting generates an authorization code; paste the code when prompted.
drive.mount('/content/gdrive')

# NOTE(review): these paths are relative while the mount point is absolute,
# so this presumably relies on the working directory being /content - confirm.
df_train = pd.read_csv('gdrive/My Drive/app/train_set.csv')
df_test = pd.read_csv('gdrive/My Drive/app/test_set.csv')
步骤与上面相同
生成模型进行评价
# The model was built with three inputs (document, left context, right
# context), so evaluation must receive all three arrays, not only the document.
score = model.evaluate(
    [X_test_padded_seqs, left_test_padded_seqs, right_test_padded_seqs],
    y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Build the same three inputs for the submission set, using the same
# boundary id (len(vocab)) and padding length as the training data.
xx_test_word_ids = tokenizer.texts_to_sequences(df_test['word_seg'])
xx_test_padded_seqs = pad_sequences(xx_test_word_ids, maxlen=doc_len)
xx_left_padded_seqs = pad_sequences(
    [[len(vocab)] + x[:-1] for x in xx_test_word_ids], maxlen=doc_len)
xx_right_padded_seqs = pad_sequences(
    [x[1:] + [len(vocab)] for x in xx_test_word_ids], maxlen=doc_len)

pred_prob = model.predict(
    [xx_test_padded_seqs, xx_left_padded_seqs, xx_right_padded_seqs])
pred = pred_prob.argmax(axis=1)
# Map encoder indices back to the (class - 1) labels they were fitted on,
# then undo the "- 1" shift applied when the labels were read.
df_test['class'] = le.inverse_transform(pred) + 1
df_result = df_test.loc[:, ['id', 'class']]
df_result.to_csv(result_path + 'RCNN.csv', index=False)
用全部数据集训练时 Colab 经常断开连接,因此只选取部分数据训练,精度不高
request 6:
学习内容
1)将Task3-5中的模型进行模型融合(Stacking或Blending融合均可)
2)选取最好的模型,线上提交结果;
使用第三方库mlxtend实现stacking
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np
# Three base learners plus a logistic-regression meta learner.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=lr)

print('3-fold cross validation:\n')

# NOTE(review): X and y are not defined anywhere in the visible source; they
# must be set to a feature matrix and target vector before this loop runs.
# The loop variable is named clf_name (not `label`) so it does not overwrite
# the `label` series created when the training data was loaded.
for clf, clf_name in zip([clf1, clf2, clf3, sclf],
                         ['KNN',
                          'Random Forest',
                          'Naive Bayes',
                          'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, y,
                                             cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), clf_name))
###电脑性能不佳,stacking 跑得很慢,Colab 也经常显示 not connected(连接断开);已通过相关博客把原理理解透彻,后续有时间再对前面的每种模型都做 stacking 并查看结果
stacking参考博客:
https://blog.csdn.net/qq_20386411/article/details/82985219
https://blog.csdn.net/winycg/article/details/84032459
https://blog.csdn.net/willduan1/article/details/73618677
https://blog.csdn.net/winycg/article/details/84032459
https://blog.csdn.net/wstcjf/article/details/77989963
https://blog.csdn.net/weixin_38569817/article/details/80534785