#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Bagging/ensemble experiment: preprocess Zhejiang smart-meter fault records,
encode the features, and compare XGBoost, decision tree, kNN, and voting models.
'''
import pandas as pd
import numpy as np
'''
Preprocess the raw data
'''
# dateparse = lambda dates: pd.to_datetime(dates, format='%Y-%m-%d')
# data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-new0620.csv',
#                    dtype={'"EQUIP_ID"': object, 'FAULT_TYPE': object, 'INST_DATE': object, 'DETECT_DATE': object,
#                           'FAULT_DATE': object, 'SYNC_ORG_NO': object, 'ORG_NO': object, 'ORG_NAME': object, 'SORT_CODE': object,
#                           'SPEC_CODE': object, 'COMM_MODE': object, 'ARRIVE_BATCH_NO': object, 'MANUFACTURER': object},
#                    date_parser=dateparse)
#
# data.drop('ORG_NAME', axis=1, inplace=True)
# # Drop duplicate and all-empty rows
# data = data.drop_duplicates()
# data = data.dropna(how='all')  # dropna returns a copy; reassign to keep the result
# print(data.info())
# '''
# 1. Date handling
# '''
# # Parse the date columns (keep only the date part, dropping any time component)
# data['FAULT_DATE1'] = pd.to_datetime(data['FAULT_DATE'].str.strip().str.split(' ').str[0])
# data['INST_DATE1'] = pd.to_datetime(data['INST_DATE'].str.strip().str.split(' ').str[0])
# data['DETECT_DATE1'] = pd.to_datetime(data['DETECT_DATE'].str.strip().str.split(' ').str[0], errors='coerce')
# # Extract the month (.dt.month handles NaT cleanly)
# data['FAULT_MONTH'] = data['FAULT_DATE1'].dt.month
# data['INST_MONTH'] = data['INST_DATE1'].dt.month
# data['DETECT_MONTH'] = data['DETECT_DATE1'].dt.month
# # Time in service: installation to fault
# print(data['FAULT_DATE1'].isnull().sum(), data['INST_DATE1'].isnull().sum())
# data['work_days'] = data['FAULT_DATE1'] - data['INST_DATE1']
# data['work_months'] = data['work_days'].dt.days // 30
# # Time in storage: detection to installation
# data['save_days'] = data['INST_DATE1'] - data['DETECT_DATE1']
# data['save_months'] = data['save_days'].dt.days // 30
# '''
# 2. Exploratory analysis / filtering
# '''
# import seaborn as sns
# import matplotlib.pyplot as plt
# # 2.1 FAULT_TYPE
# print(data['FAULT_TYPE'].isnull().sum())
# print(data['FAULT_TYPE'].describe())
# print(data['FAULT_TYPE'].value_counts())
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='FAULT_TYPE', data=data, ax=axis0)
# # Keep faults 0401-0411 (drop 0412 and the bare '04' code)
# data['FAULT_TYPE'] = data['FAULT_TYPE'].str.strip()
# data['FAULT_TYPE_2'] = data['FAULT_TYPE'].astype(str).str[0:2]
# data['FAULT_TYPE_4'] = data['FAULT_TYPE'].astype(str).str[0:4]
# data = data[data['FAULT_TYPE_2'] == '04']
# data = data[(data['FAULT_TYPE_4'] != '0412') & (data['FAULT_TYPE'] != '04')]
# print(data['FAULT_TYPE'].value_counts())
# print(data['FAULT_TYPE_4'].value_counts())
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='FAULT_TYPE_4', data=data, ax=axis0)
# # 2.2 SORT_CODE
# print(data['SORT_CODE'].isnull().sum())
# print(data['SORT_CODE'].describe())
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='SORT_CODE', data=data, ax=axis0)
# # Keep only SORT_CODE == '10' (smart meters)
# data['SORT_CODE'] = data['SORT_CODE'].str.strip()
# data = data[data['SORT_CODE'] == '10']
# # 2.3 SPEC_CODE (undecided)
# print(data['SPEC_CODE'].isnull().sum())
# print(data['SPEC_CODE'].describe())
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='SPEC_CODE', data=data, ax=axis0)
# # 2.4 COMM_MODE (undecided)
# print(data['COMM_MODE'].isnull().sum())
# print(data['COMM_MODE'].describe())
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='COMM_MODE', data=data, ax=axis0)
# # 2.5 ORG_NO
# print(data['ORG_NO'].isnull().sum())
# print(data['ORG_NO'].describe())
# data['ORG_NO1'] = data['ORG_NO'].astype(str).str[:5]
# data['ORG_NO1'].value_counts()
# # 2.6 ARRIVE_BATCH_NO (undecided)
# print(data['ARRIVE_BATCH_NO'].isnull().sum())
# data['ARRIVE_BATCH_NO'].value_counts()
# # 2.7 MANUFACTURER (undecided)
# print(data['MANUFACTURER'].isnull().sum())
# data['MANUFACTURER'].value_counts()
# # 2.8 Overall summary
# print(data.describe())
# '''
# 3. Drop columns no longer needed
# '''
# data.drop(['FAULT_DATE', 'SYNC_ORG_NO', 'INST_DATE', 'DETECT_DATE',
#            'work_days', 'save_days', 'FAULT_TYPE_2'],
#           axis=1, inplace=True)
# '''
# 4. Save the preprocessed data
# '''
# data.to_csv(r'E:\7 Python\data\20170616\zhejiang-new0620-reprocess.csv', index=False)
'''
Pre-modeling cleanup: drop redundant attributes, fill missing or negative durations
'''
# # 1. Drop redundant attributes
# data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-new0620-reprocess.csv')
# data = data.drop_duplicates()  # drop duplicate rows
# data = data.dropna()  # dropna returns a copy; reassign to keep the result
# print(data.info())
# print(data['COMM_MODE'].value_counts())
# data.drop(['"EQUIP_ID"', 'FAULT_TYPE', 'ORG_NO', 'SORT_CODE', 'COMM_MODE',
#            'ARRIVE_BATCH_NO', 'MANUFACTURER', 'FAULT_DATE1',
#            'INST_DATE1', 'DETECT_DATE1', 'INST_MONTH',
#            'DETECT_MONTH'], axis=1, inplace=True)
# print(data.info())
# # 2. Fill missing or negative values
# # work_months
# # Count negative values
# print(len(data['work_months'][data['work_months'] < 0]))
# # Set negatives to NaN (.loc avoids pandas' chained-assignment pitfall)
# data.loc[data['work_months'] < 0, 'work_months'] = np.nan
# # Re-count to confirm no negatives remain
# print(len(data['work_months'][data['work_months'] < 0]))
# # Count missing values
# count_nan_work_months = data['work_months'].isnull().sum()
# print(count_nan_work_months)
# # Mean and standard deviation
# work_months_mean = data['work_months'].mean()
# work_months_std = data['work_months'].std()
# # Fill: draw uniform random integers in [mean - std, mean + std), one per missing entry
# rand_1 = np.random.randint(work_months_mean - work_months_std, work_months_mean + work_months_std, size=count_nan_work_months)
# data.loc[data['work_months'].isnull(), 'work_months'] = rand_1
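# # Alternative (a sketch, assuming fractional month values matter downstream):
# # np.random.randint draws uniform integers, discarding fractions; a uniform
# # float draw over the same interval would keep them:
# # rand_1 = np.random.uniform(work_months_mean - work_months_std,
# #                            work_months_mean + work_months_std,
# #                            size=count_nan_work_months)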
# # save_months: same treatment
# print(len(data['save_months'][data['save_months'] < 0]))
# data.loc[data['save_months'] < 0, 'save_months'] = np.nan
# print(len(data['save_months'][data['save_months'] < 0]))
# count_nan_save_months = data['save_months'].isnull().sum()
# print(count_nan_save_months)
# save_months_mean = data['save_months'].mean()
# save_months_std = data['save_months'].std()
# # Lower bound first: mean - std, then mean + std
# rand_2 = np.random.randint(save_months_mean - save_months_std, save_months_mean + save_months_std, size=count_nan_save_months)
# data.loc[data['save_months'].isnull(), 'save_months'] = rand_2
# # Final checks
# print(len(data['work_months'][data['work_months'] < 0]))
# print(len(data['save_months'][data['save_months'] < 0]))
# print(data.isnull().sum().sum())
# # 3. Save the cleaned data
# data.to_csv(r'E:\7 Python\data\20170616\zhejiang-bagging-data.csv', index=False)
'''
bagging
'''
data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-bagging-data.csv')
print(data.info())
# The feature/target split is done after the variable conversion below, so the
# model sees the one-hot encoded matrix rather than raw object columns.
'''
Variable conversion
'''
# Categorical variables: cast to string, then one-hot encode
# FAULT_TYPE_4 (this will be the target)
print(data['FAULT_TYPE_4'].dtypes)
data['FAULT_TYPE_4'] = data['FAULT_TYPE_4'].astype(str)
print(data['FAULT_TYPE_4'].dtypes)
print(data['FAULT_TYPE_4'].value_counts())
print(pd.get_dummies(data['FAULT_TYPE_4'], prefix='FAULT_TYPE_4').head())
# SPEC_CODE
print(data['SPEC_CODE'].dtypes)
data['SPEC_CODE'] = data['SPEC_CODE'].astype(str)
print(data['SPEC_CODE'].dtypes)
print(data['SPEC_CODE'].value_counts())
print(pd.get_dummies(data['SPEC_CODE'], prefix='SPEC_CODE').head())
# FAULT_MONTH
print(data['FAULT_MONTH'].dtypes)
data['FAULT_MONTH'] = data['FAULT_MONTH'].astype(str)
print(data['FAULT_MONTH'].dtypes)
print(data['FAULT_MONTH'].value_counts())
print(pd.get_dummies(data['FAULT_MONTH'], prefix='FAULT_MONTH').head())
# ORG_NO1
print(data['ORG_NO1'].dtypes)
data['ORG_NO1'] = data['ORG_NO1'].astype(str)
print(data['ORG_NO1'].dtypes)
print(data['ORG_NO1'].value_counts())
print(pd.get_dummies(data['ORG_NO1'], prefix='ORG_NO1').head())
# One-hot encode the feature columns, keeping the target out of the feature matrix
data_y = data['FAULT_TYPE_4']
data_dummy = pd.get_dummies(data.drop(['FAULT_TYPE_4'], axis=1))
print(data_dummy.head())
# Numerical variables (work_months, save_months): missing values, then standardization
# Missing values were already filled upstream, so this should print all zeros
print(data_dummy.isnull().sum())
# print(data_dummy['save_months'].isnull().sum())
# save_months_mean = data_dummy['save_months'].mean()
# data_dummy['save_months'] = data_dummy['save_months'].fillna(save_months_mean)
# print(data_dummy.isnull().sum().sum())
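# A quick check (nothing new assumed): one-hot encoding widens the table
# considerably; compare the column counts before and after encoding.
print(data.shape, data_dummy.shape)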
# Standardize the numeric columns to mean 0 / std 1
numeric_cols = data.columns[data.dtypes != 'object']
print(numeric_cols)
numeric_col_means = data_dummy.loc[:, numeric_cols].mean()
numeric_col_std = data_dummy.loc[:, numeric_cols].std()
data_dummy.loc[:, numeric_cols] = (data_dummy.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std
data_X = data_dummy  # final feature matrix for modeling
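# A minimal sanity check: after standardization each numeric column should
# report mean ~0 and std ~1.
print(data_dummy.loc[:, numeric_cols].describe().loc[['mean', 'std']])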
'''
Model building
'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Train/test split
train, test, train_y, test_y = train_test_split(data_X, data_y, test_size=0.33, random_state=27)
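# A hedged variant (assumption: the fault classes are imbalanced, as the
# value_counts output above suggests): pass stratify=data_y so the class
# proportions match across the two splits.
# train, test, train_y, test_y = train_test_split(
#     data_X, data_y, test_size=0.33, random_state=27, stratify=data_y)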
# XGBoost
import xgboost as xgb
# dtrain = xgb.DMatrix(train, train_y)
# dtest = xgb.DMatrix(test)
xgb_clf = xgb.XGBClassifier(
    learning_rate=0.2,
    n_estimators=720,
    max_depth=9,
    colsample_bytree=0.8,
    subsample=0.9,
    objective='multi:softprob',
    min_child_weight=1,
    gamma=2,
    seed=27)
# The sklearn wrapper infers num_class from the training labels on its own.
xgb_clf.fit(train, train_y, eval_metric='merror')
xgb_pred = xgb_clf.predict(test)
print(classification_report(test_y, xgb_pred))
print(confusion_matrix(test_y, xgb_pred))
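# A sketch (standard XGBoost sklearn-API options, nothing project-specific):
# passing eval_set to fit() reports the merror metric on the held-out split
# after every boosting round, which helps spot over- or under-fitting.
# xgb_clf.fit(train, train_y, eval_metric='merror',
#             eval_set=[(test, test_y)], verbose=True)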
# # DecisionTree
# from sklearn.tree import DecisionTreeClassifier
# dt_clf = DecisionTreeClassifier()
# dt_clf.fit(train,train_y)
# print(dt_clf)
# dt_clf_pred = dt_clf.predict(test)
# print(classification_report(test_y, dt_clf_pred))
# print(confusion_matrix(test_y, dt_clf_pred))
# # knn
# from sklearn.neighbors import KNeighborsClassifier
# knn_clf = KNeighborsClassifier(n_neighbors=25)
# knn_clf.fit(train, train_y)
# knn_pred = knn_clf.predict(test)
# print(knn_pred)
# knn_pred_proba = knn_clf.predict_proba(test)
# print(knn_pred_proba)
# # Model report:
# # print(knn_clf.score(test, test_y))
# print(classification_report(test_y, knn_pred))
# print(confusion_matrix(test_y, knn_pred))
# # bagging
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# dt_clf = DecisionTreeClassifier()
# bagging_clf = BaggingClassifier(base_estimator=dt_clf,
#                                 n_estimators=10,
#                                 max_samples=1.0,
#                                 max_features=1.0,
#                                 bootstrap=True)
# bagging_clf.fit(train, train_y)
# bagging_pred = bagging_clf.predict(test)
# print(bagging_pred)
# print(classification_report(test_y, bagging_pred))
# print(confusion_matrix(test_y, bagging_pred))
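# # A sketch (oob_score is a standard BaggingClassifier option): out-of-bag
# # scoring estimates accuracy from the bootstrap leftovers, leaving the test
# # split untouched.
# # bagging_clf = BaggingClassifier(base_estimator=dt_clf, n_estimators=10,
# #                                 bootstrap=True, oob_score=True)
# # bagging_clf.fit(train, train_y)
# # print(bagging_clf.oob_score_)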
# voting
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
dt_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier(n_neighbors=25)
xgb_clf = xgb.XGBClassifier(learning_rate=0.2, n_estimators=720, max_depth=9, colsample_bytree=0.8, subsample=0.9,
                            objective='multi:softprob', min_child_weight=1, gamma=2, seed=27)
voting_clf = VotingClassifier(estimators=[('dt_clf', dt_clf), ('knn_clf', knn_clf), ('xgb_clf', xgb_clf)])
voting_clf.fit(train, train_y)
voting_pred = voting_clf.predict(test)
print(voting_pred)
print(classification_report(test_y, voting_pred))
print(confusion_matrix(test_y, voting_pred))
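# A sketch (standard sklearn API, nothing project-specific assumed): 5-fold
# cross-validation gives a more stable accuracy estimate than the single split
# above, at the cost of refitting the ensemble five times.
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(voting_clf, data_X, data_y, cv=5)
# print(scores.mean(), scores.std())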