In Python (using the `metrics` module of scikit-learn):
# Compute the points of the ROC curve, treating label 1 as the positive class.
# roc_curve returns the false-positive rates, the true-positive rates and the
# decision thresholds they were evaluated at.
fpr, tpr, thresholds = metrics.roc_curve(true_labels, predictions, pos_label=1)
# Area under the ROC curve (AUC), computed from the (fpr, tpr) points.
auc = metrics.auc(fpr,tpr)
在信号检测理论中,接收者操作特征曲线(receiver operating characteristic curve,或者叫ROC曲线)是一种坐标图式的分析工具,用于:
(1) 选择最佳的信号侦测模型、舍弃次佳的模型。
(2) 在同一模型中设定最佳阈值。
从 (0, 0) 到 (1, 1) 的对角线将ROC空间划分为左上/右下两个区域,在这条线以上的点代表了一个好的分类结果(胜过随机分类),而在这条线以下的点代表了差的分类结果(劣于随机分类)。
- 完美的预测是位于左上角的点,即ROC空间中坐标为 (0, 1) 的点:X=0 代表没有伪阳性,Y=1 代表没有伪阴性。
import pandas as pd
# Load the training set and print per-column summary statistics.
data = pd.read_csv('train.csv')
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(data.describe())
# load train data
import pandas as pd
import numpy as np
# Build the training table from train.csv.
#
# Every input row describes a pair of users: column 0 is the label, columns
# 1-11 are the features of the first user and columns 12-22 the features of the
# second user (presumably the A_* / B_* columns -- confirm against the CSV
# header). Each input row is split into two output rows with 11 features each:
#   * rows [0, num)      get label 1 and the first user's features when the
#                        input label is 1, otherwise the second user's;
#   * rows [num, 2*num)  get label 0 and the features of the remaining user.
data = pd.read_csv('train.csv')
num = len(data)
# .values works on every pandas version; DataFrame.as_matrix() has been removed
# from newer pandas releases.
data_matrix = data.values

first_user = data_matrix[:, 1:12]
second_user = data_matrix[:, 12:23]
# Column vector so that it broadcasts across the 11 feature columns.
label_is_one = (data_matrix[:, 0] == 1.)[:, np.newaxis]

train = np.zeros((num*2, 12))
train[:num, 0] = 1  # the first half is '1'; the second half keeps the 0 from np.zeros
train[:num, 1:] = np.where(label_is_one, first_user, second_user)
train[num:, 1:] = np.where(label_is_one, second_user, first_user)

# Keep the label name and the first user's feature names as column names.
train = pd.DataFrame(data=train, index=range(2*num), columns=data.columns[:12])
print(train.describe())
# normalize the data
from sklearn import preprocessing
index,columns = train.axes
# Standardize the feature columns only. "Choice" is the class label and must
# keep its original values: scaling it together with the features replaces the
# 0/1 labels with other numbers, which breaks any later comparison between the
# labels and 0/1 predictions.
feature_columns = [name for name in columns if name != "Choice"]
train[feature_columns] = preprocessing.scale(train[feature_columns])
# print() with a single argument behaves the same on Python 2 and Python 3.
print(train.describe())
# select better feature
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
# Candidate predictors: columns 1 through 11 of the training frame.
features = train.columns[1:12]

# Fit a univariate selector (f_classif scoring, k=5) against the "Choice" label.
selector = SelectKBest(f_classif, k=5)
selector.fit(train[features], train["Choice"])

# Turn every feature's p-value into a score: the smaller the p-value, the
# larger -log10(p) and therefore the taller its bar.
scores = np.negative(np.log10(selector.pvalues_))

# Bar chart of the scores, one bar per feature, labelled with the feature name.
bar_positions = range(len(features))
plt.bar(bar_positions, scores)
plt.xticks(bar_positions, features, rotation='vertical')

# Hand-picked feature subset that replaces the full candidate list from here on.
features = [
    "A_follower_count",
    "A_listed_count",
    "A_mentions_sent",
    "A_retweets_sent",
    "A_posts",
    "A_network_feature_1",
    "A_network_feature_2",
    "A_network_feature_3",
]
1)由于是二分类问题,首先尝试用最简单的线性回归(Linear regression)。
# Linear regression
from sklearn.linear_model import LinearRegression
# Evaluate LinearRegression as a binary classifier with 5-fold cross validation.
# KFold is imported from sklearn.model_selection; the older
# sklearn.cross_validation module is not available in current scikit-learn.
from sklearn.model_selection import KFold

# Initialize our algorithm class
alg = LinearRegression()
target = train["Choice"]

# Generate the cross validation folds. split() yields the row positions of the
# train part and the test part of every fold.
# random_state is fixed so that we get the same splits every time we run this.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Out-of-fold predictions, stored at the position of the row they belong to.
# The folds are shuffled, so concatenating the per-fold results would not line
# up with the row order of train when they are compared with the labels below.
predictions = np.zeros(len(train))
for train_item, test_item in kf.split(train):
    # The predictors we're using to train the algorithm (train-fold rows only).
    train_predictors = train[features].iloc[train_item, :]
    # The target we're using to train the algorithm.
    train_target = target.iloc[train_item]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold.
    predictions[test_item] = alg.predict(train[features].iloc[test_item, :])

# Linear regression evaluation
# Map the continuous predictions onto the two label values. The cut-off is the
# midpoint between the labels instead of a hard-coded 0.5, so the mapping stays
# correct whether "Choice" is still 0/1 or has been rescaled with the features.
negative_label, positive_label = target.min(), target.max()
cutoff = (negative_label + positive_label) / 2.
predictions = np.where(predictions > cutoff, positive_label, negative_label)

# Fraction of rows whose predicted label equals the true label.
accuracy = float(np.mean(predictions == target.values))
print(accuracy)
- output: 0.0250909090909
2)不要放弃,开始尝试随机森林(Random forest)
# Random forest
from sklearn.ensemble import RandomForestClassifier
# cross_val_score is imported from sklearn.model_selection; the
# sklearn.cross_validation module is not available in current scikit-learn.
from sklearn.model_selection import cross_val_score

# Initialize our algorithm with explicitly chosen parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score for each of the 3 cross validation folds.
scores = cross_val_score(alg, train[features], train["Choice"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
- output:0.691273202642