先附上代码,学习笔记回头再补充
完整代码如下:
# -*- coding: utf-8 -*-
# Silence warnings if desired:
# import warnings
# warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.cluster import KMeans
from sklearn import preprocessing as prep
import matplotlib.pyplot as plt


class Cluster():
    """Pipeline: load data, clean outliers, filter collinear variables,
    scale, binarize, and run k-means clustering on a price segment."""

    def data_read(self, data_path, file_name, typeid):
        """
        Read the Excel source and split off the modelling columns.

        :param data_path: directory containing the file (must end with a separator)
        :param file_name: Excel file name
        :param typeid: list of price-segment ids to keep
        :return: (raw segment data, segment data without id/date columns)
        """
        data = pd.read_excel(data_path + file_name, index_col='pack_bar').dropna()  # drop incomplete rows
        data1_type = data[data['typeid'].isin(typeid)]  # keep only the requested price segment(s)
        # data1_type = data1_type[data1_type['ccom_id'].isin([11110001])]  # optionally restrict to one city
        # BUGFIX: positional axis ('drop([...], 1)') was removed in pandas 2.0 — use columns=
        data_type = data1_type.drop(columns=['typeid', 'month_double', 'ccom_id', 'net_month_double'])
        return data1_type, data_type

    def outlier_filtrate(self, data_type, method='std', fill='nan', threshold=1):
        """
        Outlier handling.

        :param data_type: input frame
        :param method: 'std' (mean +/- 2*std), 'quantile' (boxplot fences) or
                       'threshold' (fixed bound, growth-rate columns only)
        :param fill: 'nan' -> mark outliers as NaN (their rows are dropped at the end);
                     anything else -> clamp to the nearest regular value / bound
        :param threshold: fixed bound, only used when method == 'threshold'
        :return: new frame with outlier rows removed or clamped
        :raises ValueError: on an unknown method
        """
        colnames = data_type.columns
        # growth-rate columns are recognised by '_incr' appearing in the name (not at position 0)
        incr_cols = [c for c in colnames if c.find('_incr') > 0]
        data2_type = data_type.copy()
        for i in range(data2_type.shape[1]):
            colname = colnames[i]
            datai = data2_type.iloc[:, i]
            if method == 'std':
                xmean, xstd = datai.mean(), datai.std()
                up, dw = xmean + 2 * xstd, xmean - 2 * xstd
                # clamp to the largest / smallest value inside the fences
                fill_up = datai[datai < up].max()
                # BUGFIX: original used datai[datai < dw].min(), i.e. the smallest *outlier*
                fill_dw = datai[datai >= dw].min()
            elif method == 'quantile':
                q1, q3 = datai.quantile(0.25), datai.quantile(0.75)
                up, dw = q3 + 1.5 * (q3 - q1), q1 - 1.5 * (q3 - q1)
                fill_up = datai[datai < up].max()
                fill_dw = datai[datai >= dw].min()  # BUGFIX: same as in the 'std' branch
            elif method == 'threshold':
                if colname not in incr_cols:
                    continue  # only growth-rate columns are bounded in this mode
                up, dw = threshold, (-1.0) * threshold
                fill_up, fill_dw = up, dw  # clamp straight to the bound
            else:
                raise ValueError('unknown method: %s' % method)
            high, low = datai > up, datai < dw
            # BUGFIX: the original chained assignment (frame.iloc[:, i][mask] = v)
            # triggers SettingWithCopy and can silently fail on modern pandas
            if high.any():
                print('存在上限异常值')
                data2_type.loc[high, colname] = np.nan if fill == 'nan' else fill_up
            else:
                print('不存在上限异常值')
            if low.any():
                print('存在下限异常值')
                data2_type.loc[low, colname] = np.nan if fill == 'nan' else fill_dw
            else:
                print('不存在下限异常值')
        return data2_type.dropna()  # drop the rows that were marked NaN

    def corr_filtrate(self, data_type, thred_corr=0.4):
        """
        Drop one variable out of every highly correlated pair.

        :param data_type: input frame (last column is assumed to be the target and is never dropped)
        :param thred_corr: absolute correlation threshold
        :return: frame restricted to the surviving columns
        """
        corrX = data_type.corr()
        colnames = data_type.columns
        to_drop = []
        # the last row/column is skipped so the dependent variable is always kept
        for j in range(corrX.shape[1] - 1):
            for i in range(j + 1, corrX.shape[0] - 1):
                if abs(corrX.iloc[i, j]) >= thred_corr:
                    # of the pair, drop the one with the larger mean correlation
                    if np.mean(corrX.iloc[i, :]) < np.mean(corrX.iloc[:, j]):
                        to_drop.append(colnames[j])
                    else:
                        to_drop.append(colnames[i])
                    break
        kept = colnames.drop(list(set(to_drop)))
        return data_type[kept]

    def vif_filtrate(self, data2_type, thred_vif=4):
        """
        Iteratively drop variables whose variance-inflation factor is too high.

        :param data2_type: input frame (last column is the target and is kept)
        :param thred_vif: VIF threshold
        :return: frame with strongly collinear variables removed
        """
        def _vifs(frame):
            # one VIF per column (collinearity check)
            return [round(variance_inflation_factor(frame.values, i), 2)
                    for i in range(frame.shape[1])]

        data3_type = data2_type.copy()
        vif = _vifs(data3_type)
        while any(v >= thred_vif for v in vif):
            dropped = False
            colnames = data3_type.columns[:-1]
            for i in range(len(vif) - 1):  # never drop the last (dependent) column
                if vif[i] >= thred_vif:
                    data3_type = data3_type.drop(columns=colnames[i])
                    vif = _vifs(data3_type)
                    dropped = True
                    break
            if not dropped:
                # BUGFIX: only the target column exceeds the threshold — the
                # original loop would spin forever in this case
                break
        return data3_type

    def data_scale(self, data3_type, method='normalize'):
        """
        Scale / normalise the data.

        :param data3_type: input frame
        :param method: 'minmax' (0-1 scaling) | 'z-score' | 'normalize' (row-wise L2)
                       | 'maxabs' (scale by max absolute value, sign kept, range [-1, 1])
                       | 'robust' (IQR-based scaling)
        :return: scaled frame with the original index and columns
        :raises ValueError: on an unknown method (the original died with NameError)
        """
        scalers = {
            'minmax': lambda d: prep.minmax_scale(d, feature_range=(0, 1), axis=0, copy=True),
            'z-score': lambda d: prep.scale(d, axis=0, with_mean=True, with_std=True, copy=True),
            'normalize': lambda d: prep.normalize(d, norm='l2', axis=1),
            'maxabs': lambda d: prep.maxabs_scale(d, axis=0, copy=True),
            'robust': lambda d: prep.robust_scale(d, axis=0, with_centering=True, with_scaling=True, copy=True),
        }
        if method not in scalers:
            raise ValueError('unknown scaling method: %s' % method)
        scaled = scalers[method](data3_type)
        return pd.DataFrame(data=scaled, columns=data3_type.columns, index=data3_type.index)

    def data_factor(self, data4_type, replace='dependent', threshold=0.05, colnames=None):
        """
        Binarize columns against a threshold (value > threshold -> 1, else 0).

        :param data4_type: input frame
        :param replace: 'all' | 'dependent' (last column only) | 'colnames' (explicit list)
        :param threshold: binarization cut-off
        :param colnames: list of column names, only used when replace == 'colnames'
        :return: new frame
        """
        data5_type = data4_type.copy()
        if replace == 'all':
            # prep.binarize maps values <= threshold to 0, others to 1
            binary = prep.binarize(data4_type, threshold=threshold, copy=True)
            data5_type = pd.DataFrame(data=binary, columns=data5_type.columns, index=data5_type.index)
        elif replace == 'dependent':
            last = data5_type.columns[-1]
            # cast back to the column's original dtype, as the element-wise loop preserved it
            data5_type[last] = (data5_type[last] > threshold).astype(data5_type[last].dtype)
        elif replace == 'colnames':
            if len(colnames) > 1:
                temp = data5_type[colnames]
                binary = prep.binarize(temp, threshold=threshold, copy=True)
                data5_type[colnames] = pd.DataFrame(data=binary, columns=temp.columns, index=temp.index)
            else:
                # BUGFIX: the original wrote through data5_type[colnames].values,
                # which modifies a copy and leaves the frame unchanged
                col = colnames[0]
                data5_type[col] = (data5_type[col] > threshold).astype(data5_type[col].dtype)
        # One-hot encoding could follow here:
        # encoder = prep.OneHotEncoder()
        # X_OH = encoder.fit_transform(data3_type)
        # df = pd.DataFrame(X_OH.toarray())
        # show the distribution of the (binarized) last column
        print(data5_type.iloc[:, -1].value_counts())
        return data5_type

    def kmeans_cluster(self, data5_type):
        """
        Plot an elbow curve (k = 1..9) on a train split, then fit k-means with k=2.

        :param data5_type: preprocessed frame; the last column is treated as the
                           target and excluded from clustering
        :return: cluster centres of the final k=2 model
        """
        data_train, data_test = train_test_split(data5_type, test_size=0.2)  # random_state=1234
        col_names = data_train.columns
        X = data_train[col_names[:-1]].copy()  # copy so adding 'cluster' below is safe
        K = range(1, 10)
        meandistortion = []
        for k in K:
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            # mean distance to the closest centre (elbow criterion)
            meandistortion.append(
                sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
        plt.subplot(2, 1, 2)
        plt.plot(K, meandistortion, 'bx-')
        plt.xlabel('k')
        plt.ylabel(u'centers')
        plt.title(u'Choose Best k')
        plt.show()
        # NOTE(review): k is hard-coded to 2 regardless of the elbow plot — confirm intended
        kmeans = KMeans(n_clusters=2)
        kmeans.fit(X)
        X['cluster'] = kmeans.labels_
        X['cluster'].value_counts()
        centers = kmeans.cluster_centers_
        return centers

    def data_predict(self, data, colnames, estm):
        """
        Predict priority cities with a fitted regression/classification model.

        :param data: frame holding a 'year1' column plus the model features
        :param colnames: feature columns to feed the estimator
        :param estm: fitted model exposing .predict()
        :return: index of the records predicted as class '1' (growing cities)
        """
        data_new = data[data['year1'] == 2016]  # 2016 records only
        # BUGFIX: positional axis removed in pandas 2.0 — use columns=
        data_new2 = data_new.drop(columns=['year1', 'year', 'type_id'])  # drop non-feature columns
        X = data_new2[colnames]
        # data_g = self.data_group_byType(data_new2)
        predictY = estm.predict(X)
        result = pd.Series(index=X.index.tolist(), data=predictY.tolist())  # per-city growth prediction
        # NOTE(review): compares against the *string* '1'; if the estimator returns
        # integer labels this never matches — confirm the label dtype with the caller.
        incr_ccom = result[result == '1'].index
        return incr_ccom


if __name__ == '__main__':
    # file locations
    data_path = 'C:\\Users\\90539\\PycharmProjects\\data\\'
    file_name = 'data.xlsx'
    typeid = ['B']
    obj2 = Cluster()
    data, data1 = obj2.data_read(data_path, file_name, typeid)
    # data11 = obj2.data_group_byType(data1)
    data2 = obj2.outlier_filtrate(data1, method='threshold', fill='nan', threshold=1)
    data3 = obj2.data_scale(data2, method='minmax')
    data4 = obj2.data_factor(data3, replace='dependent', threshold=0.1)
    centers = obj2.kmeans_cluster(data4)