Boxplot

def features_get(coef):
    """Print every feature with its coefficient, largest magnitude first.

    Feature names are read from the module-level sequence ``x_inf``, which is
    indexed with the same positions as ``coef``. One line per feature is
    printed in the form ``<name>   <coefficient>``.

    Args:
        coef: coefficients, assumed to be a 1-D NumPy array (it must support
            ``abs()`` and integer indexing).

    Returns:
        list: the entries of ``x_inf`` ordered by decreasing ``abs(coef)``.
    """
    order = np.argsort(-abs(coef))
    ranked_names = []
    for idx in order:
        name = x_inf[idx]
        ranked_names.append(name)
        print(name + '   ' + str(coef[idx]))
    # The ranking used to be built and then thrown away; return it so callers
    # can reuse the ordering instead of re-parsing the printed output.
    return ranked_names
    
def box_plot_outliers_1(df1, box_scale):
    """Cap the outliers of every column at its box-plot whisker limits.

    Per column the limits are ``Q1 - 1.5 * box_scale * IQR`` and
    ``Q3 + 1.5 * box_scale * IQR`` (IQR = Q3 - Q1). Values above the upper
    limit are replaced by it, values below the lower limit are replaced by
    it, and NaNs are left as they are.

    Args:
        df1: DataFrame to process (assumed all-numeric); it is not modified.
        box_scale: multiplier applied to the inter-quartile range.

    Returns:
        pandas.DataFrame: the capped values, carrying the row index of
        ``df1`` and the column labels of the quantile result.
    """
    # NOTE: this name is defined a second time further down in this file;
    # Python binds whichever definition is executed last.
    q1 = df1.quantile(0.25)
    q3 = df1.quantile(0.75)
    iqr = box_scale * (q3 - q1)
    val_low = q1 - iqr * 1.5
    val_up = q3 + iqr * 1.5

    # Upper cap first, then lower cap, on plain arrays.
    capped = np.where(df1 > val_up, val_up, df1)
    capped = np.where(capped < val_low.to_numpy(), val_low, capped)

    # Re-attach the caller's row labels: rebuilding the frame from a bare
    # array would otherwise silently reset the index to 0..n-1.
    return pd.DataFrame(capped, index=df1.index, columns=val_up.index)

def index_get(predict_test, test_label):
    """Print confusion-matrix counts and the derived binary metrics.

    Samples are matched by position. A label of 1 is the positive class and
    0 the negative class; samples with any other label are counted in
    neither cell but still contribute to the accuracy denominator.

    Printed, in order: TP, TN, FP, FN, accuracy, precision, recall, F1.
    A metric whose denominator is zero is reported as ``-1``.

    Args:
        predict_test: predicted labels; must be at least as long as
            ``test_label`` (extra predictions are ignored).
        test_label: ground-truth labels.

    Returns:
        None. The results are only printed.
    """
    # Materialise both inputs so that ``[i]`` is always positional; on a
    # pandas Series it is label-based and fails on a non-default index.
    labels = list(test_label)
    predictions = list(predict_test)

    tp = tn = fp = fn = 0
    for i, actual in enumerate(labels):
        predicted = predictions[i]
        if actual == 1 and predicted == 1:
            tp += 1
        elif actual == 1 and predicted != 1:
            fn += 1
        elif actual == 0 and predicted == 0:
            tn += 1
        elif actual == 0 and predicted != 0:
            fp += 1

    total = len(labels)
    # Empty input used to raise ZeroDivisionError; use the same -1 sentinel
    # the other metrics use for "undefined".
    accuracy = (tp + tn) / total if total else -1
    precision = tp / (tp + fp) if (tp + fp) else -1
    recall = tp / (tp + fn) if (tp + fn) else -1

    # Only a strictly positive precision and recall have a harmonic mean;
    # this also keeps the -1 sentinel out of the formula.
    if precision > 0 and recall > 0:
        F1 = 2 / (1/precision + 1/recall)
    else:
        F1 = -1

    print('TP: ' + str(tp))
    print('TN: ' + str(tn))
    print('FP: ' + str(fp))
    print('FN: ' + str(fn))
    print('accuracy: ' + str(accuracy))
    print('precision: ' + str(precision))
    print('recall: ' + str(recall))
    print('F1: ' + str(F1))
###########################correlation############################
def corr_get(dataframeX,dataframeY):
    """Correlate every column of ``dataframeX`` with ``dataframeY``.

    Two tables are built, each with one row per column of ``dataframeX``
    (the row label is the column name):

    * Spearman: columns ``r_s`` and ``p_s`` plus ``Name`` (copy of the index)
    * Pearson:  columns ``r_p`` and ``p_p`` plus ``Name1`` (copy of the index)

    Args:
        dataframeX: DataFrame whose columns are tested one at a time.
        dataframeY: the target passed as second argument to
            ``stats.spearmanr`` / ``stats.pearsonr``.

    Returns:
        tuple: ``(spearman_table, pearson_table)``, sorted ascending by
        ``p_s`` and ``p_p`` respectively.
    """
    spearman_table = pd.DataFrame(columns=['r_s', 'p_s'])
    pearson_table = pd.DataFrame(columns=['r_p', 'p_p'])

    for feature in dataframeX.columns.values.tolist():
        feature_values = dataframeX[feature]
        # Each result unpacks into (statistic, p-value) and fills one row.
        spearman_table.loc[feature] = stats.spearmanr(feature_values, dataframeY)
        pearson_table.loc[feature] = stats.pearsonr(feature_values, dataframeY)

    # Keep the feature name available as a regular column as well.
    spearman_table['Name'] = spearman_table.index
    pearson_table['Name1'] = pearson_table.index

    return (
        spearman_table.sort_values(by='p_s'),
        pearson_table.sort_values(by='p_p'),
    )

def drop_null(dataframe, perc):
    """Drop the columns that contain too many missing values.

    A column is removed when its number of NaN entries is strictly greater
    than ``perc`` times the number of rows.

    Args:
        dataframe: DataFrame to filter; it is not modified.
        perc: tolerated fraction of missing values per column.

    Returns:
        pandas.DataFrame: a new frame without the over-threshold columns.
    """
    max_missing = dataframe.shape[0] * perc
    missing_per_column = dataframe.isna().sum()
    too_sparse = missing_per_column.index[missing_per_column > max_missing]
    return dataframe.drop(columns=too_sparse)

def fill_null(dataframe, exclude=('Bin645',)):
    """Replace missing values by the column mean, in place.

    Only the columns present in ``dataframe.mean()`` are processed.

    Args:
        dataframe: DataFrame to fill. It is modified in place.
        exclude: a column label, or an iterable of labels, that must be left
            untouched. Defaults to ``('Bin645',)``, the column this function
            previously hard-coded, so existing calls behave as before.
            Pass ``()`` to fill every column.

    Returns:
        pandas.DataFrame: the same object that was passed in.
    """
    # A bare string would otherwise be treated as an iterable of characters.
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude) if exclude is not None else set()

    column_means = dataframe.mean()
    for column in column_means.index:
        if column in skipped:
            continue
        dataframe[column] = dataframe[column].fillna(column_means[column])
    return dataframe

def box_plot_outliers_1(df1, box_scale):
    """Cap the outliers of every column at its box-plot whisker limits.

    Per column the limits are ``Q1 - 1.5 * box_scale * IQR`` and
    ``Q3 + 1.5 * box_scale * IQR`` (IQR = Q3 - Q1). Values above the upper
    limit are replaced by it, values below the lower limit are replaced by
    it, and NaNs are left as they are.

    Args:
        df1: DataFrame to process (assumed all-numeric); it is not modified.
        box_scale: multiplier applied to the inter-quartile range.

    Returns:
        pandas.DataFrame: the capped values, carrying the row index of
        ``df1`` and the column labels of the quantile result.
    """
    # NOTE: this re-defines a function of the same name that appears earlier
    # in this file; being the later definition, this is the one that is bound.
    q1 = df1.quantile(0.25)
    q3 = df1.quantile(0.75)
    iqr = box_scale * (q3 - q1)
    val_low = q1 - iqr * 1.5
    val_up = q3 + iqr * 1.5

    # Upper cap first, then lower cap, on plain arrays.
    capped = np.where(df1 > val_up, val_up, df1)
    capped = np.where(capped < val_low.to_numpy(), val_low, capped)

    # Re-attach the caller's row labels: rebuilding the frame from a bare
    # array would otherwise silently reset the index to 0..n-1.
    return pd.DataFrame(capped, index=df1.index, columns=val_up.index)

def drop_unique(dataframe,count):
    """Drop the columns that have too few distinct values.

    A column is removed when ``dataframe.nunique()`` reports ``count`` or
    fewer distinct values for it.

    Args:
        dataframe: DataFrame to filter; it is not modified.
        count: largest number of distinct values that still gets dropped.

    Returns:
        pandas.DataFrame: a new frame without the low-variety columns.
    """
    distinct_per_column = dataframe.nunique()
    low_variety = distinct_per_column[distinct_per_column <= count].index
    return dataframe.drop(columns=low_variety)

##############################  data balance  ###########################
def data_banlance(train_data0,train_label0, num1_count, num0_count):
    """Resample a binary data set, with replacement, to a chosen class mix.

    With ``n1`` the number of samples labelled 1, the result contains
    ``n1 * num1_count`` rows drawn from class 1 followed by
    ``n1 * num0_count`` rows drawn from class 0. Both sample sizes are
    relative to the class-1 count, so the output ratio of class 1 to
    class 0 is ``num1_count : num0_count``.

    Rows are drawn with ``np.random.randint`` (global NumPy random state).

    Args:
        train_data0: samples, one per row (assumed 2-D); any array-like.
        train_label0: labels (0 or 1), one per sample; any array-like.
        num1_count: integer multiplier for the class-1 sample size.
        num0_count: integer multiplier for the class-0 sample size.

    Returns:
        tuple: ``(data, labels)`` as NumPy arrays, class-1 rows first.
    """
    # The positional fancy indexing below only works on NumPy arrays, so
    # coerce lists / pandas objects up front (no-op for ndarray input).
    train_data0 = np.asarray(train_data0)
    train_label0 = np.asarray(train_label0)

    is_class1 = train_label0 == 1
    is_class0 = train_label0 == 0
    data1, label1 = train_data0[is_class1], train_label0[is_class1]
    data0, label0 = train_data0[is_class0], train_label0[is_class0]

    # Draw row positions with replacement; class 1 is drawn first.
    rd1 = np.random.randint(len(data1), size=len(data1) * num1_count)
    rd0 = np.random.randint(len(data0), size=len(data1) * num0_count)

    data_new1 = np.vstack((data1[rd1], data0[rd0]))
    label_new1 = np.hstack((label1[rd1], label0[rd0]))
    return data_new1, label_new1

你可能感兴趣的:(Python)