第一步:给训练集打标签(故障码):
由于是SPN、FMI两个特征对应一个故障码,故采用下面语句来给训练集加标签(CODE);dataset为训练集,data_faultcode为故障码说明集;合并得到dataset_a;
# Label every training row with its fault code (CODE) by left-joining the
# fault-code description table on the (SPN, FMI) pair; rows with no matching
# code get 0 ("no fault"), then the labelled set is persisted.
dataset_a = dataset.merge(data_faultcode, how='left', on=['SPN', 'FMI'])
dataset_a['CODE'] = dataset_a['CODE'].fillna(0)
dataset_a.to_csv(path_fault_dataset, index=False)
第二步:结合题目和测试集来提取训练集中有用的特征形成新的数据集:
# Reload the labelled training set and keep only the features that also exist
# in the test set, plus the CODE label; rows with any missing value are dropped.
dataset_a = pd.read_csv(path_fault_dataset, low_memory=False, encoding='GBK')
cols = ['ACCRUNTIMES', 'AREA', 'CITY', 'COLLECTTIME', 'FAULTTIME', 'FRID',
        'LAT', 'LNG', 'PROVINCE', 'READFLAG', 'UNIQUENO', 'CODE']
# Chain the dropna instead of calling it with inplace=True on dataset_a[cols]:
# the original mutated a slice of dataset_a, which raises
# SettingWithCopyWarning and is not guaranteed to behave across pandas versions.
dataset_b = dataset_a[cols].dropna(axis=0, how='any')
dataset_b.to_csv(path_dataset_b, index=False)
第三步:对新数据集以及测试集的时间特征进行时间转时间戳操作:
# Convert the training set's COLLECTTIME strings into Unix timestamps.
# The original loop read rows positionally with iloc[i] but wrote back with
# label-based loc[i]; after the earlier dropna the index is no longer a
# contiguous 0..n-1 range, so those writes could hit the wrong rows or even
# create new NaN rows. Reset the index and transform the whole column at once.
dataset_b = dataset_b.reset_index(drop=True)
fmt = "%Y-%m-%d %H:%M:%S"
dataset_b['COLLECTTIME'] = dataset_b['COLLECTTIME'].map(
    lambda s: int(time.mktime(time.strptime(s, fmt))))
# map() already yields an integer column, so no extra to_numeric pass is
# needed. (FAULTTIME conversion was commented out upstream and stays disabled;
# that column is still stored as the original string.)
dataset_b.to_csv(path_dataset_c, index=False)
# Convert the test set's COLLECTTIME strings into Unix timestamps in one
# vectorized pass (same format as the training set). This replaces a per-row
# loop that mixed positional iloc[i] reads with label-based loc[i] writes,
# which is only safe when the index happens to be the default 0..n-1 range.
dataset_test['COLLECTTIME'] = dataset_test['COLLECTTIME'].map(
    lambda s: int(time.mktime(time.strptime(s, "%Y-%m-%d %H:%M:%S"))))
第四步:将训练集的特征和标签分开:
# Separate the training features (everything except the label) from the
# fault-code label column.
X = dataset_c.drop(columns=['CODE'])
Y = dataset_c[['CODE']]
对特征部分进行数据标准化操作:
# Standardize the features (zero mean, unit variance). Fitting once on the
# full batch is equivalent to the original single partial_fit call; the
# fitted scaler is kept so the test set can be transformed identically below.
scaler = preprocessing.StandardScaler()
dataset_d_X = pd.DataFrame(scaler.fit_transform(X))
dataset_d_Y = Y
测试集也进行同样标准化操作:
# Apply the scaler fitted on the training data to the same feature columns of
# the test set (no FAULTTIME/UNIQUENO/CODE in the test features).
feature_cols = ['ACCRUNTIMES', 'AREA', 'CITY', 'COLLECTTIME', 'FRID',
                'LAT', 'LNG', 'PROVINCE', 'READFLAG']
test_X = dataset_test[feature_cols]
dataset_test_X = pd.DataFrame(scaler.transform(test_X))
第五步:用朴素随机过采样处理训练集中的数据不平衡问题:
# Balance the class distribution by naive random over-sampling (duplicates
# minority-class rows until all classes match the majority count).
ros = RandomOverSampler(random_state=0)
# fit_sample() was renamed fit_resample() in imbalanced-learn 0.4 and removed
# in 0.6; fit_resample is the supported API.
train_X, train_Y = ros.fit_resample(dataset_d_X, np.array(dataset_d_Y).ravel())
X = pd.DataFrame(train_X)
Y = pd.DataFrame(train_Y)
X.to_csv(path_train_X, index=False)
Y.to_csv(path_train_Y, index=False)
第六步:准备好训练集X和Y以及测试集X,其中训练集Y有多个故障码,为方便模型训练以及最后的结果呈现,我们需要先将标签变为0,1,2,…,13。对于模型采用随机森林进行分类:
# Load the balanced training data and remap each raw fault code to a compact
# ordinal label 0..13 so the model handles small consecutive class ids.
train_X = pd.read_csv(path_train_X)
train_Y = pd.read_csv(path_train_Y).astype(int)
fault_codes = [0, 1122, 1141, 1145, 1168, 1182, 1206, 1209,
               1211, 1215, 1216, 1239, 1246, 1341]
y = train_Y.replace(fault_codes, list(range(len(fault_codes))))
test_X = pd.read_csv(path_dataset_test_X)
# This is a multi-class classification task over nominal fault codes, so use
# a classifier: the original RandomForestRegressor imposed a false ordering on
# the labels and required rounding its continuous predictions afterwards.
# (max_features="auto" is dropped — it was removed in scikit-learn 1.3.)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1,
                               random_state=50, min_samples_leaf=50)
model.fit(train_X, np.array(y).ravel())
# Predictions are already integer labels 0..13; the downstream rounding step
# is then a harmless no-op.
Pred = pd.DataFrame(model.predict(test_X))
Pred.to_csv(path_pred, index=False)
第七步:将测试集中预测完的结果进行四舍五入取整,同时再把取整后的标签转化为我们需要呈现的故障码:
# Round each prediction to the nearest ordinal label, then translate the
# labels back to the original fault codes expected in the submission.
B = pd.read_csv(path_pred)
labels = list(range(14))
codes = [0, 1122, 1141, 1145, 1168, 1182, 1206, 1209,
         1211, 1215, 1216, 1239, 1246, 1341]
df_1 = B.round(0).astype(int).replace(labels, codes)
为呈现最后提交的格式,把测试集序号和预测结果合并到一个csv中,对列名进行修改:
# Join the test-set ids (df_0) with the predicted fault codes (df_1) by row
# position and name the columns for the submission format.
result = pd.merge(df_0, df_1, left_index=True, right_index=True, how='outer')
# rename(index=str, ...) is deprecated (removed in pandas 1.0) and the
# "idx" -> "idx" mapping was a no-op; only the prediction column needs renaming.
result = result.rename(columns={"0": "Pred"})
result.to_csv(path_result, index=False)