Q1
# Use question_4_1.txt and question_4_2.txt as the training set
import pandas as pd
train1 = pd.read_table('question_4_1.txt', sep='\t', header=None)
train2 = pd.read_table('question_4_2.txt', sep='\t', header=None)
train = pd.concat([train1, train2], axis=0, ignore_index=True)  # stack the two files and reset the index
names = ['text','sentiment']
train.columns = names
# Use question_4_3.txt as the test set
test = pd.read_table('question_4_3.txt', sep='\t', header=None)
names = ['text','sentiment']
test.columns = names
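A quick look at the loaded frames helps confirm the tab-separated files were parsed as expected (a small sketch; the exact shapes depend on the data files):
print(train.shape, test.shape)  # (rows, columns) of the training and test sets
print(train.head())             # first few text/sentiment pairs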
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
# The labels for training and testing come from the 'sentiment' column
y_train = train['sentiment'].values
y_test = test['sentiment'].values
plt.figure(figsize=(12, 8))
sns.countplot(x='sentiment', data=train)
plt.title('Distribution of sentiment labels in the training set')
plt.xlabel('Sentiment');
print("Number of samples used for training: \t", len(train),
      "\nNumber of samples used for testing: \t", len(test))
# Train a random forest classifier
# The 'text' column holds raw strings, so it is converted to numeric features
# first; TF-IDF is assumed here as one common choice of text representation.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['text'])  # fit the vocabulary on the training texts only
X_test = vectorizer.transform(test['text'])        # reuse that vocabulary for the test set
clf = RandomForestClassifier(n_estimators=100)     # use 100 decision trees
# Fit on the full training data, then predict the labels of the test set
model = clf.fit(X_train, y_train)
output = model.predict(X_test)
print(output)
# Write the predictions to a CSV file
pd.DataFrame({"Id": range(1, len(output) + 1), "Label": output}).to_csv('out.csv', index=False, header=True)
Q2
# Compute the accuracy on the test set
import numpy as np
acc = np.mean(output == y_test) * 100
print("The accuracy of the pure RandomForest classifier is: \t", acc, "%")