def features_get(coef):
    """Print features ranked by absolute coefficient and return their names.

    Positions of ``coef`` are sorted by decreasing absolute value. For each
    position, the matching entry of the module-level ``x_inf`` and the
    coefficient are printed on one line as ``"<name> <coefficient>"``.

    Args:
        coef: 1-D numeric array of coefficients, indexable by integer position.

    Returns:
        list: entries of ``x_inf`` in the printed (ranked) order.

    Note:
        ``x_inf`` is defined outside this block; it is presumably the sequence
        of feature names aligned with ``coef`` -- confirm against the caller.
    """
    # Negating the magnitudes makes argsort yield the largest |coef| first.
    order = np.argsort(-abs(coef))
    ranked_names = []
    for position in order:
        name = x_inf[position]
        ranked_names.append(name)
        print(name + ' ' + str(coef[position]))
    return ranked_names
def box_plot_outliers_1(df1, box_scale):
    """Cap each column of ``df1`` at its scaled box-plot whiskers.

    For every column the whiskers are ``Q1 - 1.5 * box_scale * IQR`` and
    ``Q3 + 1.5 * box_scale * IQR``. Values above the upper whisker are
    replaced by it, then values below the lower whisker are replaced by it.
    NaN values compare as False and are therefore left untouched.

    Args:
        df1: DataFrame whose columns all support ``quantile``.
        box_scale: multiplier applied to the inter-quartile range.

    Returns:
        pandas.DataFrame: a new frame with the same column labels. It is
        rebuilt from a plain array, so the row index is a fresh
        0..n-1 range rather than the index of ``df1``.

    Note:
        This file defines ``box_plot_outliers_1`` a second time further down;
        at import time the later definition replaces this one.
    """
    lower_quartile = df1.quantile(0.25)
    upper_quartile = df1.quantile(0.75)
    iqr = box_scale * (upper_quartile - lower_quartile)
    val_low = lower_quartile - iqr * 1.5
    val_up = upper_quartile + iqr * 1.5

    # np.where returns a bare ndarray, so the column labels are re-attached.
    capped = pd.DataFrame(np.where(df1 > val_up, val_up, df1),
                          columns=val_up.index)
    capped = pd.DataFrame(np.where(capped < val_low, val_low, capped),
                          columns=val_low.index)
    return capped
def index_get(predict_test, test_label):
    """Print confusion-matrix counts and metrics for binary predictions.

    Labels and predictions are compared pair by pair in positional order.
    A label of 1 counts as positive and 0 as negative; any other label is
    skipped in the TP/TN/FP/FN counts but still counted in the accuracy
    denominator.

    A metric that cannot be computed is reported as ``-1``: accuracy when
    there are no labels, precision when nothing was predicted positive,
    recall when there are no positive labels, and F1 when precision or
    recall is zero or undefined.

    Args:
        predict_test: sequence of predicted classes.
        test_label: sequence of true classes, same order as ``predict_test``.

    Raises:
        IndexError: if ``predict_test`` is shorter than ``test_label``.

    Returns:
        None. The results are written to standard output.
    """
    total = len(test_label)
    if len(predict_test) < total:
        raise IndexError('predict_test is shorter than test_label')

    tp = tn = fp = fn = 0
    # zip pairs values by position, so a pandas Series with a non-default
    # index is handled the same way as a list or an array.
    for actual, predicted in zip(test_label, predict_test):
        if actual == 1:
            if predicted == 1:
                tp += 1
            else:
                fn += 1
        elif actual == 0:
            if predicted == 0:
                tn += 1
            else:
                fp += 1

    accuracy = (tp + tn) / total if total else -1
    precision = tp / (tp + fp) if (tp + fp) else -1
    recall = tp / (tp + fn) if (tp + fn) else -1
    # Require strictly positive values so the -1 sentinels never enter the
    # harmonic mean.
    if precision > 0 and recall > 0:
        F1 = 2 / (1 / precision + 1 / recall)
    else:
        F1 = -1

    print('TP: ' + str(tp))
    print('TN: ' + str(tn))
    print('FP: ' + str(fp))
    print('FN: ' + str(fn))
    print('accuracy: ' + str(accuracy))
    print('precision: ' + str(precision))
    print('recall: ' + str(recall))
    print('F1: ' + str(F1))
def corr_get(dataframeX, dataframeY):
    """Correlate every column of ``dataframeX`` with ``dataframeY``.

    Args:
        dataframeX: DataFrame of candidate columns.
        dataframeY: the values each column is correlated against.

    Returns:
        tuple: two DataFrames indexed by column name.
        The first holds the Spearman statistic and p-value (``r_s``, ``p_s``)
        plus a ``Name`` column, sorted by ascending ``p_s``.
        The second holds the Pearson statistic and p-value (``r_p``, ``p_p``)
        plus a ``Name1`` column, sorted by ascending ``p_p``.
    """
    spearman_table = pd.DataFrame(columns=['r_s', 'p_s'])
    pearson_table = pd.DataFrame(columns=['r_p', 'p_p'])

    for feature in dataframeX.columns.values.tolist():
        feature_values = dataframeX[feature]
        # Each result unpacks to (statistic, p-value), filling one row.
        spearman_table.loc[feature] = stats.spearmanr(feature_values, dataframeY)
        pearson_table.loc[feature] = stats.pearsonr(feature_values, dataframeY)

    # Keep the column name available as an ordinary column as well.
    spearman_table['Name'] = spearman_table.index
    pearson_table['Name1'] = pearson_table.index

    return (
        spearman_table.sort_values(by='p_s'),
        pearson_table.sort_values(by='p_p'),
    )
def drop_null(dataframe, perc):
    """Return a copy of ``dataframe`` without its mostly-missing columns.

    A column is dropped when its number of missing values is strictly
    greater than ``perc`` times the number of rows. The input frame is not
    modified.

    Args:
        dataframe: DataFrame to filter.
        perc: largest tolerated fraction of missing values per column.

    Returns:
        pandas.DataFrame: the frame with the offending columns removed.
    """
    null_count = dataframe.isna().sum()
    max_missing = dataframe.shape[0] * perc
    sparse_columns = null_count.index[null_count > max_missing]
    return dataframe.drop(columns=sparse_columns)
def fill_null(dataframe, exclude=('Bin645',)):
    """Fill missing values with the column mean, in place.

    Every column that ``dataframe.mean()`` reports a mean for has its missing
    values replaced by that mean, except the columns listed in ``exclude``.

    Args:
        dataframe: DataFrame to fill. It is modified in place.
        exclude: column label, or iterable of labels, to leave untouched.
            Defaults to ``('Bin645',)``, the column this function always
            skipped before the parameter existed.

    Returns:
        pandas.DataFrame: the same ``dataframe`` object that was passed in.
    """
    # A bare string would otherwise be treated as an iterable of characters.
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude)

    column_means = dataframe.mean()
    for column in column_means.index:
        if column in skipped:
            continue
        dataframe[column] = dataframe[column].fillna(column_means[column])
    return dataframe
def box_plot_outliers_1(df1, box_scale):
    """Clip each column of ``df1`` to its scaled box-plot whisker range.

    The per-column bounds are ``Q1 - 1.5 * box_scale * IQR`` and
    ``Q3 + 1.5 * box_scale * IQR``. Values above the upper bound are set to
    it first, then values below the lower bound are set to it. NaN values
    fail both comparisons and pass through unchanged.

    Args:
        df1: DataFrame whose columns all support ``quantile``.
        box_scale: multiplier applied to the inter-quartile range.

    Returns:
        pandas.DataFrame: a new frame with the same column labels. The row
        index is a fresh 0..n-1 range, because the result is rebuilt from a
        plain array.

    Note:
        This is the second definition of ``box_plot_outliers_1`` in this
        file. It replaces the earlier one when the module is loaded.
    """
    q1 = df1.quantile(0.25)
    q3 = df1.quantile(0.75)
    iqr = box_scale * (q3 - q1)
    val_low = q1 - iqr * 1.5
    val_up = q3 + iqr * 1.5

    # np.where returns a bare ndarray, so the column labels are re-attached.
    clipped = pd.DataFrame(np.where(df1 > val_up, val_up, df1),
                           columns=val_up.index)
    clipped = pd.DataFrame(np.where(clipped < val_low, val_low, clipped),
                           columns=val_low.index)
    return clipped
def drop_unique(dataframe, count):
    """Return ``dataframe`` without its low-cardinality columns.

    A column is removed when its number of distinct values, as reported by
    ``nunique``, is less than or equal to ``count``. The input frame is not
    modified.

    Args:
        dataframe: DataFrame to filter.
        count: largest number of distinct values that still gets a column
            dropped.

    Returns:
        pandas.DataFrame: the frame with those columns removed.
    """
    distinct_per_column = dataframe.nunique()
    too_few_values = distinct_per_column <= count
    low_cardinality = distinct_per_column.index[too_few_values]
    return dataframe.drop(low_cardinality, axis=1)
def data_banlance(train_data0, train_label0, num1_count, num0_count):
    """Resample both classes with replacement to a chosen ratio.

    With ``n1`` rows labelled 1, the result holds ``n1 * num1_count`` rows
    drawn from class 1, followed by ``n1 * num0_count`` rows drawn from
    class 0. Both sample sizes are multiples of the class-1 count. Draws use
    NumPy's global random state: class 1 first, then class 0.

    Args:
        train_data0: array of samples, one row per sample.
        train_label0: array of 0/1 labels aligned with ``train_data0``.
        num1_count: multiplier for the number of class-1 draws.
        num0_count: multiplier for the number of class-0 draws.

    Returns:
        tuple: ``(data, labels)``, with the class-1 draws stacked above the
        class-0 draws.
    """
    is_positive = train_label0 == 1
    positive_rows = train_data0[is_positive]
    positive_labels = train_label0[is_positive]

    is_negative = train_label0 == 0
    negative_rows = train_data0[is_negative]
    negative_labels = train_label0[is_negative]

    n_positive = len(positive_rows)
    # Draw order matters for reproducibility under a fixed seed.
    positive_pick = np.random.randint(n_positive, size=n_positive * num1_count)
    negative_pick = np.random.randint(len(negative_rows),
                                      size=n_positive * num0_count)

    resampled_rows = np.vstack(
        (positive_rows[positive_pick], negative_rows[negative_pick])
    )
    resampled_labels = np.hstack(
        (positive_labels[positive_pick], negative_labels[negative_pick])
    )
    return resampled_rows, resampled_labels