f-score+svm特征选择过程pipeline


# coding: utf-8

# In[ ]:


#!/usr/bin/env python

import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import KFold  
from sklearn import svm
from sklearn.model_selection import train_test_split
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import *
import sklearn.ensemble
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
import itertools
import sys
from sklearn.feature_selection import  f_classif
import warnings
from sklearn.externals import joblib
warnings.filterwarnings('ignore')
path = ""
# Command-line configuration: input CSV, output tag, CV folds, worker count,
# and the feature-selection step size.
inputname = sys.argv[1]
outputname = sys.argv[2].split(".")[0]
name = outputname
# BUG FIX: sys.argv entries are strings, but these three are used downstream
# as numbers (cv=cross_times, n_jobs=cpu_values, len(select_list) % distance),
# so cast them to int here.
cross_times = int(sys.argv[3])
cpu_values = int(sys.argv[4])
distance = int(sys.argv[5])

# inputname="test.csv"
# outputname="test_2RFH"
# name=outputname
# cross_times=2
# cpu_values=12
# distance=10


# In[ ]:


def performance(labelArr, predictArr):
    """Derive binary-classification metrics from true vs. predicted labels.

    labelArr[i] is the actual class (1 = positive, 0 = negative) and
    predictArr[i] is the prediction for sample i. Returns the tuple
    (precision, recall, SN, SP, GM, TP, TN, FP, FN), where SN is
    sensitivity (identical to recall), SP is specificity, and
    GM = sqrt(recall * SP). Any ratio whose denominator is zero is
    reported as 0 rather than raising ZeroDivisionError.
    """
    TP = FN = FP = TN = 0.
    for actual, predicted in zip(labelArr, predictArr):
        if actual == 1:
            if predicted == 1:
                TP += 1.
            elif predicted == 0:
                FN += 1.
        elif actual == 0:
            if predicted == 1:
                FP += 1.
            elif predicted == 0:
                TN += 1.

    def _safe_ratio(numerator, denominator):
        # Empty class on the denominator side -> report 0 instead of crashing.
        return numerator / denominator if denominator else 0

    SN = _safe_ratio(TP, TP + FN)        # sensitivity = TP / P, P = TP + FN
    SP = _safe_ratio(TN, FP + TN)        # specificity = TN / N, N = TN + FP
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    GM = math.sqrt(recall * SP)          # geometric mean of recall and SP
    return precision, recall, SN, SP, GM, TP, TN, FP, FN


# In[ ]:


"""
    Load the data, rank every feature column by its ANOVA F-score
    (f_classif), export the ranking, and drop NaN/inf-scored features.
"""
datapath = path + inputname
classifier = "svm_f-score"
mode = "crossvalidation"
print("start")
train_data = pd.read_csv(datapath, header=None, index_col=None)
# Build labels: first half of the rows are positives (1), second half
# negatives (0).
# NOTE(review): assumes an even number of rows with positives first — an odd
# row count leaves the last row unlabeled and f_classif will reject the
# mismatched lengths. Confirm the input format.
half = len(train_data) // 2
# BUG FIX: replaced py2-only xrange/map label construction with list repetition.
Y = np.array([1] * half + [0] * half)
# Per-column ANOVA F-value; a higher F means the column separates the two
# classes better.
F, pval = f_classif(train_data, Y)
idx = np.argsort(F)
selected_list_ = idx[::-1]                     # column indices, best first
F_sort_value = [F[e] for e in selected_list_]  # scores in the same order

print(F_sort_value)
print(selected_list_)

# Persist the (score, column-index) ranking for later inspection.
feature_dimension_and_scores = []
feature_dimension_and_scores.append(F_sort_value)
feature_dimension_and_scores.append(selected_list_)

feature_dimension_and_scores = np.array(feature_dimension_and_scores).T

pd.DataFrame(feature_dimension_and_scores).to_excel("feature_dimension_and_score.xlsx", index=False, header=["score", "dimension"])

print("After filtering Nan and inf")
# Drop features whose F-score is NaN/inf (e.g. zero-variance columns).
selected_list_ = [a for a, b in zip(selected_list_, F_sort_value)
                  if not math.isnan(b) and not math.isinf(b)]
print(selected_list_)


bestACC = 0
bestC = 0
bestgamma = 0
best_dimension = 0
# Results-sheet header row (column labels kept verbatim, incl. Chinese text:
# feature set, sample counts, ROC AUC).
row0 = [u'特征集', u'样本个数', u'分类器', u'Accuracy', u'Precision', u'Recall', u'SN', u'SP',
                    u'Gm', u'F_measure', u'F_score', u'MCC', u'ROC曲线面积', u'tp', u'fn', u'fp', u'tn']
all_dimension_results = []
all_dimension_results.append(row0)
select_list = []
best_savedata = ""
prediction_probability = []
prediction_probability.append(Y.astype(int))
# BUG FIX (idempotent): these may still be raw argv strings; cv=, n_jobs= and
# the modulo below all require ints.
cross_times = int(cross_times)
cpu_values = int(cpu_values)
distance = int(distance)
# Greedy incremental selection: add features in F-score order and re-evaluate
# an RBF-SVM every `distance` features (and once more at the full set).
for select_num, temp_data in enumerate(selected_list_):
    train_data2 = train_data.values
    select_list.append(int(temp_data))
    if len(select_list) % distance == 0 or len(select_list) == len(selected_list_):
        X_train = pd.DataFrame(train_data2)
        X_train = X_train.iloc[:, select_list]
        X = np.array(X_train)
        svc = svm.SVC()
        # BUG FIX: wrap map() in list() — under py3 map() yields a one-shot
        # iterator, which GridSearchCV's parameter grid cannot re-iterate.
        parameters = {'kernel': ['rbf'],
                      'C': list(map(lambda x: 2 ** x, np.linspace(-2, 5, 7))),
                      'gamma': list(map(lambda x: 2 ** x, np.linspace(-5, 2, 7)))}
        clf = GridSearchCV(svc, parameters, cv=cross_times, n_jobs=cpu_values, scoring='accuracy')
        clf.fit(X, Y)
        C = clf.best_params_['C']
        gamma = clf.best_params_['gamma']
        print("the best C and gamma are:", C, gamma)
        # Cross-validated labels and positive-class probabilities with the
        # tuned hyper-parameters.
        y_predict = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma), X, Y, cv=cross_times, n_jobs=cpu_values)
        y_predict_prob = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma, probability=True), X, Y, cv=cross_times, n_jobs=cpu_values, method='predict_proba')
        prediction_probability.append(y_predict_prob[:, 1])
        # (duplicate roc_auc_score call from the original removed)
        ROC_AUC_area = metrics.roc_auc_score(Y, y_predict)
        ACC = metrics.accuracy_score(Y, y_predict)
        precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
        F1_Score = metrics.f1_score(Y, y_predict)
        F_measure = F1_Score
        MCC = metrics.matthews_corrcoef(Y, y_predict)
        pos = TP + FN
        neg = FP + TN
        savedata = [str(select_num + 1), "positive:" + str(pos) + "," + "negative:" + str(neg), 'svm', ACC, precision, recall, SN, SP, GM, F_measure, F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN]
        # Track the best-accuracy subset seen so far.
        if ACC > bestACC:
            bestACC = ACC
            bestC = C
            bestgamma = gamma
            best_savedata = savedata
            best_dimension = X.shape[1]
        # BUG FIX: py2 print statements -> py3 print() calls.
        print(savedata)
        print(X.shape[1])
        all_dimension_results.append(savedata)
print("bestACC", bestACC)
print("bestC", bestC)
print("bestgamma", bestgamma)
print("best_dimension", best_dimension)
print("all_useful_dimensions_number", len(all_dimension_results))
# Retrain on the full data using the best feature subset and hyper-parameters,
# then persist the model, the per-sample probabilities, and the result tables.
final_X = train_data.values
final_select_list = np.array(selected_list_)
final_select_list = final_select_list[:best_dimension]  # top-ranked columns
final_X = pd.DataFrame(final_X)
final_X = final_X.iloc[:, list(final_select_list)]
clf = svm.SVC(kernel='rbf', C=bestC, gamma=bestgamma)
clf.fit(final_X, Y)
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn >= 0.23;
# switch the import to `import joblib` when upgrading.
joblib.dump(clf, path + classifier + "_" + mode + "_" + outputname + "_" + '.model')

# Column headers: 'labels' plus one numbered column per evaluated subset.
temp = range(1, len(all_dimension_results))
temp = [str(e) for e in temp]
prediction_probability_list = ['labels']
prediction_probability_list.extend(temp)
# Transpose so rows = samples; column 0 = true label, remaining columns =
# positive-class probability at each evaluated dimensionality.
prediction_probability = np.array(prediction_probability).T.tolist()
# BUG FIX: `prediction_probability[:][0] = ...` assigned into a throwaway
# slice copy (a no-op); cast the label entry of every row in place instead.
for row in prediction_probability:
    row[0] = int(row[0])
pd.DataFrame(prediction_probability).to_csv(path + classifier + "_" + mode + "_" + outputname + "_" + 'predict.csv', header=prediction_probability_list, index=False)
pd.DataFrame(all_dimension_results).to_excel(path + 'cross_validation_' + classifier + "_" + outputname + '.xlsx', sheet_name="_crossvalidation", header=None, index=False)
# BUG FIX: use floor division (// under py3 `/` yields a float) so the row
# index is an int; cast distance defensively in case it is still an argv
# string. Row 0 is the header; row best_dimension//distance is the best run.
best_row = best_dimension // int(distance)
pd.DataFrame(np.array(all_dimension_results)[[0, best_row], :]).to_excel(path + 'cross_validation_' + classifier + "_" + outputname + "_" + 'best.xlsx', sheet_name="_crossvalidation", header=None, index=False)

example:

python f-score_svm.py input.csv outputname crossvalidation_values cpu_values distance

其中:

  • f-score_svm.py: 程序名字
  • input.csv: 输入文件,以csv结尾
  • outputname: 输出文件的标记
  • crossvalidation_values: 几折交叉验证,如十折交叉验证填:10
  • cpu_values: 多进程数,需要查清自己电脑有多少cpu,建议使用少于自己电脑cpu数的值,如有4个cpu, 可以填3
  • distance:特征选择过程的幅度,一般填1,不可为0

你可能感兴趣的:(f-score+svm特征选择过程pipeline)