# Test code
from sklearn.externals.six.moves import zip
import matplotlib.pyplot as plt
from sklearn.datasets import make_gaussian_quantiles
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import sys, time
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sets import Set
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# Integer codes for the spelled-out class labels; "draw" gets the code after
# the largest spelled-out number.
num2num = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5, "six": 6,
    "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11, "twelve": 12,
    "thirteen": 13, "fourteen": 14, "fifteen": 15, "sixteen": 16, "draw": 17,
}


def transform(x):
    """Encode one raw record as a list of integers.

    Every element of ``x`` is converted with ``str()`` and mapped as follows:

    * an all-digit token (of any length) becomes its integer value;
    * any other multi-character token is looked up in ``num2num``;
    * a single letter becomes its offset from ``'a'`` (``'a'`` -> 0);
    * any other single-character or empty token is skipped.

    Parameters
    ----------
    x : iterable
        The fields of one record.

    Returns
    -------
    list of int
        The encoded fields, in input order (skipped tokens are omitted).

    Raises
    ------
    KeyError
        If a multi-character token is neither numeric nor a key of
        ``num2num``.
    """
    retval = []
    for e in x:
        token = str(e)
        # Digits are tested first so that multi-digit numbers such as 10 are
        # parsed instead of falling through to the num2num lookup and
        # raising KeyError.
        if token.isdigit():
            retval.append(int(token))
        elif len(token) > 1:
            retval.append(num2num[token])
        elif token.isalpha():
            retval.append(ord(token) - ord('a'))
    return retval
def preprocess(filename, header=None):
    """Load a comma-separated data file and build the model inputs.

    Each row is integer-encoded with ``transform``. The first six encoded
    columns are one-hot encoded to form the feature matrix; the seventh
    encoded column is returned unchanged as the label vector.

    Parameters
    ----------
    filename : str
        Path of the comma-separated file to read.
    header : int, list of int, str or None, optional
        Forwarded to ``pandas.read_csv``. Defaults to ``None`` so the first
        line is read as data; pandas' own default would consume it as column
        names and drop that sample. NOTE(review): this assumes the data file
        has no header row -- confirm, and pass ``header=0`` for files that
        do have one.

    Returns
    -------
    X : numpy.ndarray
        Dense one-hot encoding of the first six columns.
    y : numpy.ndarray
        Integer codes taken from the seventh column.
    """
    df = pd.read_csv(filename, delimiter=',', header=header)
    samples = [transform(row) for row in df.values]
    a = np.array(samples)
    # Fit and encode in one pass; the encoder is not needed afterwards.
    X = OneHotEncoder().fit_transform(a[:, :6]).toarray()
    y = a[:, 6]
    return X, y
def main():
    """Benchmark several classifiers on the data in ``krkopt.data``.

    The encoded rows are split at a fixed index into a leading and a
    trailing part. The shapes of the leading part are printed first. Then,
    for each classifier, the model is fitted on the leading part and one
    line is printed: the mean 10-fold cross-validated accuracy computed on
    the trailing part, followed by the elapsed wall-clock seconds for the
    fit plus the cross-validation.
    """
    features, labels = preprocess("krkopt.data")

    # Fixed head/tail split: rows before n_split versus the remainder.
    n_split = 25000
    X_train = features[:n_split]
    X_test = features[n_split:]
    y_train = labels[:n_split]
    y_test = labels[n_split:]
    # Single-argument print with %-formatting emits the same text as the
    # Python 2 statement form "print a, b".
    print("%s %s" % (X_train.shape, y_train.shape))

    def shallow_tree():
        # A fresh depth-2 tree for every ensemble that needs a base learner.
        return DecisionTreeClassifier(max_depth=2)

    single_tree = DecisionTreeClassifier(max_depth=5, min_samples_leaf=1)
    svm = SVC(gamma=2, C=1)
    boosted_trees = AdaBoostClassifier(
        shallow_tree(), n_estimators=600, learning_rate=1)
    bagged_trees = BaggingClassifier(
        shallow_tree(), n_estimators=600, random_state=13)
    boosted_bags = AdaBoostClassifier(
        BaggingClassifier(shallow_tree(), n_estimators=3, random_state=13),
        n_estimators=600,
        learning_rate=1)
    bagged_boosts = BaggingClassifier(
        AdaBoostClassifier(shallow_tree(), n_estimators=600, learning_rate=1),
        n_estimators=600,
        random_state=13)

    models = (single_tree, svm, boosted_trees, bagged_trees,
              boosted_bags, bagged_boosts)
    for model in models:
        started = time.time()
        model.fit(X_train, y_train)
        # NOTE(review): cross_val_score presumably fits its own clones of the
        # estimator on folds of X_test, in which case the fit above would not
        # influence the reported accuracy -- confirm the intended evaluation.
        fold_scores = cross_val_score(model, X_test, y_test,
                                      scoring='accuracy', cv=10, n_jobs=1)
        accu = np.mean(fold_scores)
        print("%s %s" % (accu, time.time() - started))
# Run the benchmark only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    main()