import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import decomposition, manifold, metrics, model_selection, svm

try:
    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20; it is
    # only importable on old installations.
    from sklearn import cross_validation
except ImportError:
    # Keep the legacy name usable: ``model_selection`` is its replacement and
    # exposes ``cross_val_score`` under the same name.
    cross_validation = model_selection
def load_data(fn):
    """Read the CSV at ``fn`` and return its rows as a NumPy array.

    The first line of the file is consumed as the header row, so it is not
    part of the returned array.
    """
    frame = pd.read_csv(fn, header=0)
    return np.array(frame)
def digital_fea(d):
    """Map every distinct value in ``d`` to an integer code.

    Codes start at 0 and follow the order in which ``np.unique`` returns the
    distinct values.  The result is a ``dict`` from value to code.
    """
    return {value: code for code, value in enumerate(np.unique(d))}
# Column indices (in the raw array) that hold categorical values.
_CATEGORICAL_COLS = (1, 3, 4) + tuple(range(6, 18))


def pre_process(data, categorical_cols=_CATEGORICAL_COLS, label_col=20,
                positive_label="Yes"):
    """Convert the raw 2-D array into an all-numeric float matrix.

    Column 0 of ``data`` is dropped (presumably a row identifier -- confirm
    against the input file); output column ``k`` holds input column ``k + 1``.

    Args:
        data: 2-D array of raw values, one row per sample.
        categorical_cols: indices of columns to integer-encode.  Each distinct
            value is replaced by its rank among the column's sorted distinct
            values (0, 1, 2, ...).
        label_col: index of the column that becomes 1 where the value equals
            ``positive_label`` and 0 elsewhere.  A column listed in
            ``categorical_cols`` is treated as categorical even if it is also
            ``label_col``.
        positive_label: value of ``label_col`` that is encoded as 1.

    Returns:
        ``numpy.ndarray`` of shape ``(rows, columns - 1)`` and dtype float.
        Columns that are neither categorical nor the label are copied and
        converted to float as they are.
    """
    m, n = data.shape
    encoded = np.zeros((m, n - 1))
    categorical = set(categorical_cols)

    for j in range(1, n):
        column = data[:, j]
        if j in categorical:
            # ``return_inverse`` yields, for every element, its index in the
            # sorted array of distinct values -- i.e. the integer code.
            _, codes = np.unique(column, return_inverse=True)
            encoded[:, j - 1] = codes
        elif j == label_col:
            encoded[:, j - 1] = [1 if value == positive_label else 0
                                 for value in column]
        else:
            encoded[:, j - 1] = column
    return encoded
def fea_select(data):
    """Build a 7-column hand-crafted feature matrix from the raw rows.

    Output columns, per input row:
        0: column 5 as a float
        1: 1 if column 6 is 'Yes', else 0
        2: number of "Yes" values in columns 9-14, or 0 if any of those
           columns is "No internet service"
        3: (column 19 / column 18) divided by the contract length, where the
           length is 1 for "Month-to-month", 12 for "One year" and 24 for
           anything else in column 15
        4: column 18 as a float
        5: column 19 / column 18
        6: 1 if column 20 is 'Yes', else 0
    """
    contract_length = {"Month-to-month": 1, "One year": 12}

    rows, _ = data.shape
    features = np.zeros((rows, 7))
    for idx in range(rows):
        record = data[idx]

        first = float(record[5])
        flag = int(record[6] == 'Yes')

        addons = [record[k] for k in range(9, 15)]
        if any(value == "No internet service" for value in addons):
            yes_count = 0
        else:
            yes_count = sum(1 for value in addons if value == "Yes")

        length = contract_length.get(record[15], 24)
        monthly = float(record[18])
        ratio = float(record[19] / record[18])
        label = int(record[20] == 'Yes')

        features[idx] = [first, flag, yes_count, ratio / length,
                         monthly, ratio, label]
    return features
def normalize(d, mean=None, va=None):
    """Centre each column of ``d`` and divide it by a per-column scale.

    Args:
        d: 2-D numeric array, one row per sample.
        mean: per-column offsets to subtract.  Computed from ``d`` (column
            means) when ``None``.
        va: per-column scale to divide by.  When ``None`` it is computed from
            ``d`` as the root-mean-square of each column,
            ``sqrt(sum(d ** 2) / rows)``.

    Returns:
        ``(scaled, mean, va)`` so the same statistics can be passed back in
        to transform another array identically.

    A scale of exactly 0 (an all-zero column) is treated as 1 for the
    division so the output stays finite; the returned ``va`` is not altered.
    """
    # NOTE(review): the scale is the RMS of the *uncentred* column, not the
    # standard deviation around ``mean`` -- confirm this is intended.
    m, n = d.shape
    if mean is None:
        mean = np.mean(d, axis=0)
    if va is None:
        # Column-wise sum of squares; equals the diagonal of ``d.T @ d``
        # without forming the full n-by-n product.
        va = np.sqrt(np.sum(d * d, axis=0) / m)

    scale = np.asarray(va, dtype=float)
    scale = np.where(scale == 0, 1.0, scale)
    # Broadcasting applies the per-column statistics to every row.
    return (d - mean) / scale, mean, va
def normalize1(d, mx=None, mn=None):
    """Min-max scale each column of ``d``: ``(d - mn) / (mx - mn)``.

    Args:
        d: 2-D numeric array, one row per sample.
        mx: per-column maxima.  Computed from ``d`` when ``None``.
        mn: per-column minima.  Computed from ``d`` when ``None``.

    Returns:
        ``(scaled, mx, mn)`` so the same statistics can be passed back in to
        transform another array identically.

    A column whose range ``mx - mn`` is exactly 0 is divided by 1 instead, so
    constant columns map to 0 rather than NaN/inf; the returned ``mx`` and
    ``mn`` are not altered.
    """
    m, n = d.shape
    if mx is None:
        mx = np.max(d, axis=0)
    if mn is None:
        mn = np.min(d, axis=0)

    span = np.asarray(mx, dtype=float) - np.asarray(mn, dtype=float)
    span = np.where(span == 0, 1.0, span)
    # Broadcasting applies the per-column statistics to every row.
    return (d - mn) / span, mx, mn
def pca(X, y):
    """Plot an RBF-SVM decision boundary in the first two principal components.

    ``X`` is projected onto two PCA components, an RBF-kernel ``SVC`` is tuned
    on that projection with a small grid search over ``C`` and ``gamma``
    (mean 5-fold cross-validated accuracy), and the refitted model's decision
    regions are drawn together with the projected samples coloured by ``y``.

    Args:
        X: 2-D feature matrix.
        y: class labels, one per row of ``X``.

    Returns:
        None.  The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(15, 8))
    projected = decomposition.PCA(n_components=2).fit_transform(X)

    # Grid search.  ``cross_val_score`` clones and fits the estimator itself,
    # so no explicit ``fit`` is needed here, and probability estimates are
    # never used in this function, so they are not enabled.
    best_acc = -np.inf
    c_opt = None
    g_opt = None
    for c_exp in range(-2, 2, 2):
        c = 2 ** c_exp
        for g_exp in range(-2, 2, 2):
            g = 2 ** g_exp
            candidate = svm.SVC(kernel='rbf', C=c, gamma=g)
            scores = model_selection.cross_val_score(
                candidate, projected, y, cv=5, n_jobs=3)
            acc = float(sum(scores)) / len(scores)
            if acc > best_acc:
                best_acc = acc
                c_opt = c
                g_opt = g

    clf = svm.SVC(kernel='rbf', C=c_opt, gamma=g_opt)
    clf.fit(projected, y)

    # High zorder keeps the sample points above the background mesh.
    plt.scatter(projected[:, 0], projected[:, 1], c=y, zorder=10,
                cmap=plt.cm.Paired)

    # Evaluate the decision function on a 200x200 grid spanning the data.
    x_min, x_max = projected[:, 0].min(), projected[:, 0].max()
    y_min, y_max = projected[:, 1].min(), projected[:, 1].max()
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])
    Z = Z.reshape(XX.shape)

    # Fill the two predicted regions and trace the boundary (level 0).
    plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
    plt.contour(XX, YY, Z, colors=['k'], linestyles=['-'], levels=[0])
    plt.show()
def _grid_search_svc(x, y, c_exponents, gamma_exponents):
    """Return the ``(C, gamma)`` pair with the best mean 5-fold CV accuracy.

    Candidates are ``2 ** e`` for each exponent given.
    """
    best_acc = -np.inf
    best_params = (None, None)
    for c_exp in c_exponents:
        c = 2 ** c_exp
        for g_exp in gamma_exponents:
            g = 2 ** g_exp
            # ``cross_val_score`` clones and fits the estimator itself, and
            # accuracy scoring does not need probability estimates.
            candidate = svm.SVC(kernel='rbf', C=c, gamma=g)
            scores = model_selection.cross_val_score(
                candidate, x, y, cv=5, n_jobs=3)
            acc = float(sum(scores)) / len(scores)
            if acc > best_acc:
                best_acc = acc
                best_params = (c, g)
    return best_params


def _best_f1_threshold(y_true, positive_prob, default=0.5):
    """Return ``(threshold, f1)`` maximising F1 over 0, 0.05, ..., 0.95.

    ``default`` is returned as the threshold when no candidate achieves an
    F1 score above 0.
    """
    best_thd = default
    best_f1 = 0
    for candidate in np.arange(0, 1, 0.05):
        predicted = (positive_prob > candidate).astype(np.int32)
        f1 = metrics.f1_score(y_true, predicted)
        if f1 > best_f1:
            best_thd = candidate
            best_f1 = f1
    return best_thd, best_f1


def _precision_recall_f1(y_true, y_pred):
    """Return ``(precision, recall, f1)`` for the given predictions."""
    return (metrics.precision_score(y_true, y_pred),
            metrics.recall_score(y_true, y_pred),
            metrics.f1_score(y_true, y_pred))


def main(fn):
    """Train and evaluate an RBF-kernel SVM on the CSV file ``fn``.

    Steps:
      1. Load and numerically encode the data; the last encoded column is the
         label, the rest are features.
      2. Use the first 90% of rows for training and the remainder for testing
         (rows are not shuffled).
      3. Min-max scale the features with statistics taken from the training
         rows only.
      4. Show the PCA decision-boundary plot for the training data.
      5. Grid-search ``C`` and ``gamma`` by cross-validation, refit, and print
         accuracy on both splits.
      6. Plot ROC curves for both splits.
      7. Pick the probability threshold that maximises F1 on the training
         rows and report F1 / precision / recall on the test rows with it.
      8. Report precision / recall / F1 of the default ``predict`` output.
    """
    data = load_data(fn)
    d = pre_process(data)
    m, n = d.shape

    pos = int(m * 9 / 10)
    train, test = d[:pos], d[pos:]

    train_x = train[:, :n - 1]
    train_y = train[:, n - 1].astype(np.int32)
    train_x, col_max, col_min = normalize1(train_x)
    pca(train_x, train_y)

    # Scale the test rows with the training statistics.
    test_x, _, _ = normalize1(test[:, :n - 1], col_max, col_min)
    test_y = test[:, n - 1].astype(np.int32)

    c_opt, g_opt = _grid_search_svc(
        train_x, train_y, range(-2, 4, 2), range(-4, 0, 2))

    # Probability estimates are needed below for the threshold search.
    clf = svm.SVC(kernel='rbf', C=c_opt, gamma=g_opt, probability=True)
    clf.fit(train_x, train_y)
    print("train score:", clf.score(train_x, train_y))
    print("test score:", clf.score(test_x, test_y))

    # ROC curves from the signed distance to the decision boundary.
    train_score = clf.decision_function(train_x)
    test_score = clf.decision_function(test_x)
    fpr1, tpr1, thds1 = metrics.roc_curve(train_y, train_score)
    roc_auc1 = metrics.auc(fpr1, tpr1)
    fpr, tpr, thds = metrics.roc_curve(test_y, test_score)
    roc_auc = metrics.auc(fpr, tpr)

    lw = 2
    plt.figure(figsize=(10, 10))
    # Orange curve: test split.  Red curve: training split.
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot(fpr1, tpr1, color='r',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc1)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    # Threshold on P(class 1) tuned on the training rows, applied to test.
    train_prob = clf.predict_proba(train_x)
    test_prob = clf.predict_proba(test_x)
    thd, pre_f1 = _best_f1_threshold(train_y, train_prob[:, 1])
    test_pred = (test_prob[:, 1] > thd).astype(np.int32)
    print(test_y)
    print(test_pred)
    p, r, f1 = _precision_recall_f1(test_y, test_pred)
    print("train f score: ", pre_f1, "thd: ", thd)
    print("test f score:", f1, ", P:", p, ", R:", r)

    # Same metrics for the classifier's own default decision rule.
    train_p, train_r, train_f1 = _precision_recall_f1(
        train_y, clf.predict(train_x))
    test_p, test_r, test_f1 = _precision_recall_f1(
        test_y, clf.predict(test_x))
    print("train precision:", train_p, " recall:", train_r, " f score:", train_f1)
    print("test precision:", test_p, " recall:", test_r, " f score:", test_f1)
if __name__ == "__main__":
    # The first command-line argument is the path of the CSV file to use;
    # any further arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <data.csv>".format(sys.argv[0]))
    main(sys.argv[1])