# A simple SVM application

#!/bin/python3

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm, decomposition, metrics, manifold

try:
    # scikit-learn >= 0.18; the old ``cross_validation`` module was removed
    # in 0.20.
    from sklearn import model_selection
except ImportError:
    from sklearn import cross_validation as model_selection
# Keep the legacy module name bound for code that still refers to it.
cross_validation = model_selection

def load_data(fn):
    """Read the CSV source *fn* and return its rows as a NumPy array.

    The first line is consumed as the header (``header=0``) and is therefore
    not part of the returned array.
    """
    frame = pd.read_csv(fn, header=0)
    return np.array(frame)

def digital_fea(d):
    """Return a ``{value: code}`` dict for the distinct values of *d*.

    Codes are consecutive integers starting at 0, assigned in the sorted
    order in which ``np.unique`` returns the distinct values.
    """
    return {value: code for code, value in enumerate(np.unique(d))}

def pre_process(data):
    """Convert the raw table *data* into a purely numeric matrix.

    Column 0 of *data* is dropped; every remaining column ``j`` is written to
    column ``j - 1`` of the result:

    * columns 1, 3, 4 and 6..17 are replaced by the integer code that
      ``digital_fea`` assigns to each distinct value of that column;
    * column 20 becomes 1 where the value equals "Yes" and 0 otherwise;
    * every other column is copied as-is into the float result array.

    Returns a float array of shape ``(rows, columns - 1)``.
    """
    n_rows, n_cols = data.shape
    encoded = np.zeros((n_rows, n_cols - 1))

    # One value -> integer-code lookup table per categorical column.
    categorical_cols = (1, 3, 4) + tuple(range(6, 18))
    code_tables = {col: digital_fea(data[:, col]) for col in categorical_cols}

    for row in range(n_rows):
        for col in range(1, n_cols):
            raw = data[row][col]
            if col in code_tables:
                value = code_tables[col][raw]
            elif col == 20:
                value = 1 if raw == "Yes" else 0
            else:
                value = raw
            encoded[row][col - 1] = value
    return encoded

def fea_select(data):
    """Build a hand-picked 7-column feature matrix from the raw table.

    One output row is produced per row of *data*. Output columns:

    0. column 5 converted to float
    1. 1 if column 6 equals "Yes", else 0
    2. number of "Yes" values in columns 9..14; forced to 0 as soon as one
       of them equals "No internet service"
    3. output column 5 divided by the contract length in months, where
       column 15 maps "Month-to-month" -> 1, "One year" -> 12 and anything
       else -> 24
    4. column 18 converted to float
    5. column 19 divided by column 18
    6. 1 if column 20 equals "Yes", else 0

    Columns 1-4 of *data* are intentionally not used.

    Both operands of the column 19 / column 18 ratio are converted to float
    before dividing, so numeric values stored as strings are accepted.

    Returns a float array of shape ``(rows, 7)``.
    """
    n_rows = data.shape[0]
    features = np.zeros((n_rows, 7))
    contract_months = {"Month-to-month": 1, "One year": 12}

    for i in range(n_rows):
        row = data[i]
        features[i, 0] = float(row[5])
        features[i, 1] = 1 if row[6] == 'Yes' else 0

        services = 0
        for j in range(9, 15):
            if row[j] == "No internet service":
                services = 0
                break
            if row[j] == "Yes":
                services += 1
        features[i, 2] = services

        monthly = float(row[18])
        # Convert first, then divide: dividing the raw cells fails when the
        # columns were loaded as strings.
        ratio = float(row[19]) / monthly

        features[i, 3] = ratio / contract_months.get(row[15], 24)
        features[i, 4] = monthly
        features[i, 5] = ratio
        features[i, 6] = 1 if row[20] == 'Yes' else 0

    return features

def normalize(d, mean=None, va=None):
    """Standardise the columns of *d*: subtract the mean, divide by the spread.

    Parameters
    ----------
    d : 2-D array, one sample per row.
    mean : per-column offset. Computed from *d* when ``None``.
    va : per-column scale. When ``None`` it is the root-mean-square deviation
        of *d* from ``mean`` (the population standard deviation when ``mean``
        is also computed from *d*).

    Returns
    -------
    ``(scaled, mean, va)`` so that the statistics can be passed back in to
    scale another data set the same way.

    Columns whose scale is 0 are only centred (divided by 1) instead of
    producing NaN/inf.
    """
    if mean is None:
        mean = np.mean(d, axis=0)
    centered = d - mean
    if va is None:
        # Spread of the centred data; the uncentred second moment would not
        # give unit variance unless the mean happened to be 0.
        va = np.sqrt(np.mean(centered ** 2, axis=0))

    scale = np.where(np.asarray(va) == 0, 1.0, va)
    return centered / scale, mean, va

def normalize1(d, mx=None, mn=None):
    """Min-max scale the columns of *d*: ``(d - mn) / (mx - mn)``.

    Parameters
    ----------
    d : 2-D array, one sample per row.
    mx : per-column maximum. Computed from *d* when ``None``.
    mn : per-column minimum. Computed from *d* when ``None``.

    Returns
    -------
    ``(scaled, mx, mn)`` so that the same bounds can be passed back in to
    scale another data set.

    A column with ``mx == mn`` is divided by 1 instead of 0, so it comes out
    as ``d - mn`` rather than NaN/inf.
    """
    if mx is None:
        mx = np.max(d, axis=0)
    if mn is None:
        mn = np.min(d, axis=0)

    span = np.asarray(mx) - np.asarray(mn)
    span = np.where(span == 0, 1.0, span)
    return (d - mn) / span, mx, mn

def pca(X, y):
    """Plot an RBF-SVM decision boundary in the 2-D PCA projection of *X*.

    Steps:

    1. project *X* onto its first two principal components;
    2. pick ``C`` and ``gamma`` from a small power-of-two grid by mean
       5-fold cross-validation accuracy on the projected data;
    3. refit an SVC with the selected pair on all projected samples;
    4. draw the samples (coloured by *y*), the predicted regions and the
       zero level of the decision function, then show the figure.

    Parameters
    ----------
    X : 2-D feature matrix, one sample per row.
    y : class labels, one per row of *X*.

    Returns ``None``; ``plt.show()`` is called before returning.
    """
    plt.figure(figsize=(15, 8))
    projected = decomposition.PCA(n_components=2).fit_transform(X)

    best_acc = -1.0  # below any possible accuracy, so the first candidate wins
    c_opt = None
    g_opt = None
    for c_exp in range(-2, 2, 2):
        c = 2.0 ** c_exp
        for g_exp in range(-2, 2, 2):
            g = 2.0 ** g_exp
            # The candidate gamma must actually be passed to the model,
            # otherwise the inner loop re-evaluates the same classifier.
            # cross_val_score fits clones of the estimator itself, so no
            # separate fit() is needed. Probability estimates are not used
            # in this function, so they are not requested.
            clf = svm.SVC(kernel='rbf', C=c, gamma=g)
            scores = model_selection.cross_val_score(
                clf, projected, y, cv=5, n_jobs=3)
            acc = float(sum(scores)) / len(scores)
            if acc > best_acc:
                best_acc = acc
                c_opt = c
                g_opt = g

    clf = svm.SVC(kernel='rbf', C=c_opt, gamma=g_opt)
    clf.fit(projected, y)

    # zorder keeps the sample points above the region colouring drawn below.
    plt.scatter(projected[:, 0], projected[:, 1], c=y, zorder=10,
                cmap=plt.cm.Paired)

    x_min = projected[:, 0].min()
    x_max = projected[:, 0].max()
    y_min = projected[:, 1].min()
    y_max = projected[:, 1].max()
    # 200 x 200 grid covering the bounding box of the projected samples.
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])
    Z = Z.reshape(XX.shape)

    plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
    plt.contour(XX, YY, Z, colors=['k'], linestyles=['-'], levels=[0])
    plt.show()

def _grid_search_rbf_svc(x, y, c_exponents, gamma_exponents):
    """Return the ``(C, gamma)`` pair with the best mean 5-fold CV accuracy.

    Candidates are ``2 ** e`` for every exponent in *c_exponents* and
    *gamma_exponents*; on ties the first candidate evaluated is kept.
    """
    best_acc = -1.0  # below any possible accuracy, so the first candidate wins
    best_c = None
    best_gamma = None
    for c_exp in c_exponents:
        c = 2.0 ** c_exp
        for g_exp in gamma_exponents:
            g = 2.0 ** g_exp
            # cross_val_score fits clones of the estimator itself, so no
            # separate fit() is needed. Accuracy relies on predict() only,
            # so probability estimates are not requested during the search.
            clf = svm.SVC(kernel='rbf', C=c, gamma=g)
            scores = model_selection.cross_val_score(
                clf, x, y, cv=5, n_jobs=3)
            acc = float(sum(scores)) / len(scores)
            if acc > best_acc:
                best_acc = acc
                best_c = c
                best_gamma = g
    return best_c, best_gamma


def _plot_roc(test_curve, train_curve):
    """Draw the test and train ROC curves; each is ``(fpr, tpr, auc)``."""
    lw = 2
    plt.figure(figsize=(10, 10))
    # False-positive rate on the x-axis, true-positive rate on the y-axis.
    fpr, tpr, roc_auc = test_curve
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='test ROC curve (area = %0.2f)' % roc_auc)
    fpr, tpr, roc_auc = train_curve
    plt.plot(fpr, tpr, color='r',
             lw=lw, label='train ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()


def _best_f1_threshold(y_true, prob_pos):
    """Return ``(threshold, f1)`` maximising F1 over 0.00, 0.05, ..., 0.95.

    A sample is predicted positive when its probability is strictly greater
    than the threshold. If no threshold yields an F1 above 0 the result is
    ``(0.5, 0.0)``.
    """
    best_thd = 0.5
    best_f1 = 0.0
    for thd in np.arange(0, 1, 0.05):
        pred = (prob_pos > thd).astype(np.int32)
        f1 = metrics.f1_score(y_true, pred)
        if f1 > best_f1:
            best_thd = thd
            best_f1 = f1
    return best_thd, best_f1


def main(fn):
    """Train and evaluate an RBF SVM on the CSV file *fn*.

    The file is loaded and encoded with ``pre_process``; the last column of
    the encoded matrix is the label. The first 90% of the rows form the
    training set and the rest the test set (no shuffling). Features are
    min-max scaled with the training bounds, ``C`` and ``gamma`` are chosen
    by cross-validation on the training set, and the final model is reported
    through accuracy, ROC curves, an F1-optimal probability threshold and
    precision/recall/F1 of the default predictions.
    """
    data = load_data(fn)
    d = pre_process(data)

    m, n = d.shape
    pos = int(m*9/10)
    train = d[:pos]
    test = d[pos:]

    feature_cols = list(range(0, n-1))
    train_x = train[:, feature_cols]
    train_y = train[:, n-1].astype(np.int32)
    train_x, mx, mn = normalize1(train_x)

    pca(train_x, train_y)

    # The test set is scaled with the training min/max.
    test_x = test[:, feature_cols]
    test_x, mx, mn = normalize1(test_x, mx, mn)
    test_y = test[:, n-1].astype(np.int32)

    c_opt, g_opt = _grid_search_rbf_svc(
        train_x, train_y, range(-2, 4, 2), range(-4, 0, 2))

    # probability=True is required for predict_proba below.
    clf = svm.SVC(kernel='rbf', C=c_opt, gamma=g_opt, probability=True)
    clf.fit(train_x, train_y)

    print("train score:", clf.score(train_x, train_y))
    print("test score:", clf.score(test_x, test_y))

    train_score = clf.decision_function(train_x)
    test_score = clf.decision_function(test_x)

    fpr1, tpr1, _ = metrics.roc_curve(train_y, train_score)
    roc_auc1 = metrics.auc(fpr1, tpr1)
    fpr, tpr, _ = metrics.roc_curve(test_y, test_score)
    roc_auc = metrics.auc(fpr, tpr)
    _plot_roc((fpr, tpr, roc_auc), (fpr1, tpr1, roc_auc1))

    train_prob = clf.predict_proba(train_x)
    test_prob = clf.predict_proba(test_x)

    # Choose the probability cut-off on the training set, apply it to test.
    thd, pre_f1 = _best_f1_threshold(train_y, train_prob[:, 1])
    test_pred = (test_prob[:, 1] > thd).astype(np.int32)
    print(test_y)
    print(test_pred)
    f1 = metrics.f1_score(test_y, test_pred)
    p = metrics.precision_score(test_y, test_pred)
    r = metrics.recall_score(test_y, test_pred)
    print("train f score: ", pre_f1, "thd: ", thd)
    print("test f score:", f1, ", P:", p, ", R:", r)

    # Same metrics for the classifier's own default decision rule.
    train_pred = clf.predict(train_x)
    train_f1 = metrics.f1_score(train_y, train_pred)
    train_p = metrics.precision_score(train_y, train_pred)
    train_r = metrics.recall_score(train_y, train_pred)

    test_pred = clf.predict(test_x)
    test_f1 = metrics.f1_score(test_y, test_pred)
    test_p = metrics.precision_score(test_y, test_pred)
    test_r = metrics.recall_score(test_y, test_pred)

    print("train precision:", train_p, " recall:", train_r, " f score:", train_f1)
    print("test precision:", test_p, " recall:", test_r, " f score:", test_f1)

if __name__ == "__main__":
    # Exit with a usage message instead of an IndexError when the CSV path
    # is missing from the command line.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <csv file>" % sys.argv[0])
    main(sys.argv[1])

# You may also be interested in: (python, natural language processing)