4划分0_1标签Analysis

 1、用kmeans对标签进行分类

import pandas as pd
from sklearn.cluster import KMeans 
from matplotlib import pyplot as plt
import numpy as np

 2、读取数据,根据KMeans聚类边界计算阈值,并生成0-1标签

# Directory holding both the input wide table and the labelled output file.
data_dir = 'C:/Users/S/Desktop/其他案例数据/UF_RATIO(EAC1)/data/4CleanData--csv大宽表+0-1标签'
path = data_dir + '/csvAll.csv'


def find_threshold(values, labels):
    """Locate the boundary between clusters along a 1-D value axis.

    The values are sorted and the last position where the cluster label
    changes between two neighbouring values is taken as the boundary.
    The comparison is on "label differs", not on a specific ``1 -> 0``
    transition, so the result does not depend on which cluster the
    clustering algorithm happened to number 0 or 1.

    Args:
        values: 1-D (or column-shaped) sequence of numeric values.
        labels: Cluster label for each value, same length as ``values``.

    Returns:
        A tuple ``(lower, upper, threshold)`` where ``lower`` and ``upper``
        are the two neighbouring values on either side of the boundary and
        ``threshold`` is their midpoint.

    Raises:
        ValueError: If the inputs differ in length, or if every value
            carries the same label (no boundary exists).
    """
    values = np.asarray(values).ravel()
    labels = np.asarray(labels).ravel()
    if values.shape != labels.shape:
        raise ValueError('values and labels must have the same length')

    order = np.argsort(values, kind='stable')
    sorted_values = values[order]
    sorted_labels = labels[order]

    # Indices i where the label at i differs from the label at i + 1.
    # Comparing the two shifted slices never reads past the end of the data.
    changes = np.flatnonzero(sorted_labels[1:] != sorted_labels[:-1])
    if changes.size == 0:
        raise ValueError(
            'all samples share one cluster label; no threshold can be derived'
        )

    pos = changes[-1]
    lower = sorted_values[pos]
    upper = sorted_values[pos + 1]
    return lower, upper, (lower + upper) / 2


def add_threshold_labels(df, threshold, ratio_column='UF_RATIO',
                         label_column='label', loc=3):
    """Insert a 0/1 label column derived from a threshold, in place.

    Rows whose ``ratio_column`` value is strictly below ``threshold`` get
    label 0; all other rows get label 1.

    Args:
        df: DataFrame to modify; must contain ``ratio_column`` and must not
            already contain ``label_column``.
        threshold: Cut-off value.
        ratio_column: Name of the column compared against ``threshold``.
        label_column: Name of the new integer label column.
        loc: Column position at which the label column is inserted.

    Returns:
        The same DataFrame, with the int64 label column inserted.
    """
    labels = np.where(df[ratio_column] < threshold, 0, 1).astype('int64')
    df.insert(loc=loc, column=label_column, value=labels)
    return df


def main():
    """Cluster UF_RATIO into two groups, derive a threshold, and save labels."""
    df = pd.read_csv(path, encoding='utf8')
    ratio = df.loc[:, 'UF_RATIO']

    # KMeans expects a 2-D array of shape (n_samples, n_features).
    a = np.array(ratio).reshape(-1, 1)
    kmeans = KMeans(n_clusters=2, random_state=10).fit(a)

    lower, upper, threshold = find_threshold(ratio, kmeans.labels_)
    print('ratio1:', lower, ' ratio2:', upper)
    print('阈值:', threshold)

    plt.scatter(a, kmeans.labels_)

    add_threshold_labels(df, threshold)
    df.to_csv(data_dir + '/UF_RATIO-' + str(threshold) + '.csv',
              index=False, encoding='utf_8_sig')


if __name__ == '__main__':
    main()

你可能感兴趣的:(对多json文件进行数据分析,python)