羊老羊

三月：心跳信号分类预测_baseline_v1：输出结果优化

文章目录

- 1. 工具包导入
- 2. 数据导入
- 3. 数据预处理
- 5. 模型训练
  - 5.1 评估函数
  - 5.2 模型参数设置
  - 5.3 输出
baseline_v1_outputInt:score:502.5345

# 2021.05.02
# v1版：对输出结果进行优化，概率 > 0.9 时输出为 1
# 本文原创 望赞鼓励

仍存疑问：
lightgbm参数，运行机制没时间看

Datawhale 三月选题：心跳信号分类：材料网址

1. 工具包导入

import os
import gc
import math

import pandas as pd
import numpy as np

import lightgbm as lgb
#import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler


from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

2. 数据导入

# Load train.csv and testA.csv from the current working directory
train = pd.read_csv('train.csv')
test=pd.read_csv('testA.csv')
# Preview the first rows of the training table
train.head()

	id	heartbeat_signals	label
0	0	0.9912297987616655,0.9435330436439665,0.764677...	0.0
1	1	0.9714822034884503,0.9289687459588268,0.572932...	0.0
2	2	1.0,0.9591487564065292,0.7013782792997189,0.23...	2.0
3	3	0.9757952826275774,0.9340884687738161,0.659636...	0.0
4	4	0.0,0.055816398940721094,0.26129357194994196,0...	2.0

test.head()
train.info

了解到有3个columns: id , heartbeat_signals , label
并有 100,000 条训练数据

3. 数据预处理

压缩函数：提前准备好，有些竞赛内存有限制的时候用。


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    * Integer columns (signed or unsigned) are cast to the narrowest signed
      integer type whose range contains the column's min and max (bounds are
      inclusive); if none fits, the column is left as it is.
    * Float columns are cast to float16, then float32, if the value range
      fits, and to float64 otherwise.  These casts trade precision for memory.
    * ``object`` columns are converted to ``category``.
    * Columns of any other dtype (bool, datetime, existing categoricals,
      pandas extension dtypes) are left untouched, so the function can safely
      be applied more than once to the same frame.

    Memory usage before and after is printed.

    Args:
        df: pandas DataFrame to compress; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; min()/max() and
        # the range checks below are not meaningful for anything else.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Basic preprocessing: expand each comma-separated signal string into one
# numeric column per sample point (s_0, s_1, ...), next to id (and label).
train_list = [
    [record[0], *map(float, record[1].split(',')), record[2]]
    for record in train.values
]
train = pd.DataFrame(np.array(train_list))
# Every row is [id, signals..., label], so two entries are not signal columns.
train.columns = ['id', *(f's_{k}' for k in range(len(train_list[0]) - 2)), 'label']
train = reduce_mem_usage(train)

# The test table has no label column: every row is [id, signals...].
test_list = [
    [record[0], *map(float, record[1].split(','))]
    for record in test.values
]
test = pd.DataFrame(np.array(test_list))
test.columns = ['id', *(f's_{k}' for k in range(len(test_list[0]) - 1))]
test = reduce_mem_usage(test)

Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%

。。。。。。。。。。。。。。代码解析部分。。。。。。。。。。。

杨鸽分析 - 1: .values：值

for items in train.values:
  train_list.append([items[0]] + [float(i) for i in items[1].split(',')] + [items[2]])

我们看一下train.values的内容

train.values 返回一个二维数组：每一行对应一条样本（计算机本身并不理解各字段的含义），因此遍历时依次得到 [[第 0 行 values], [第 1 行 values], …, [第 99999 行 values]]

从 [第 0 行 values] 的内容可以看出每行有 3 个字段：items[0] 是 id（这里为 0），items[2] 是最后的 label（这里为 0.0），中间的 items[1] 是用逗号连接的信号字符串，用 .split(',') 拆分成一个个数值

.
https://www.runoob.com/python/att-dictionary-items.html

[0 行values] 内容如下：

[0 ‘0.9912297987616655,0.9435330436439665,0.7646772997256593,0.6185708990212999,0.3796321642826237,0.19082233510621885,0.040237131594430715,0.02599520771717858,0.03170886048677242,0.06552357497104398,0.12553088566683082,0.14674736762087337,0.16765635354203254,0.19337353075154495,0.22613482558418235,0.2211427948707646,0.23606736350657742,0.2211427948707646,0.2211427948707646,0.21110661221417562,0.20858662883955462,0.19337353075154495,0.19592021355822875,0.1984624088145674,0.18570638844539308,0.19592020417425474,0.18314160533045887,0.19337353075154495,0.19082233510621885,0.20858662883955462,0.2211427948707646,0.2508391000672623,0.2606035735248363,0.27753397418529446,0.2942679945470305,0.3037438606924122,0.3364276621747203,0.3479233126336631,0.38410562561692113,0.3863371817756788,0.4084648300418338,0.4106590521686313,0.42592580507675887,0.42592580507675887,0.4291763526701312,0.4324195928902589,0.42809365036122277,0.42809365036122277,0.4128499421288813,0.41503751002775946,0.39744344526503395,0.40186210145863127,0.38856529150062463,0.4040663811296295,0.3952290154247498,0.39854937055983863,0.40186210145863127,0.41722173372658117,0.43457767776945677,0.4302582430505069,0.4324195928902589,0.41503751002775946,0.43457767776945677,0.42592580507675887,0.39744344526503395,0.3570543268368737,0.3456314920490284,0.334117492319521,0.3178419565867778,0.3201783146071371,0.3178419565867778,0.334117492319521,0.3248396760909577,0.334117492319521,0.3248396760909577,0.3364276621747203,0.32716473826850506,0.331803634364786,0.3364276621747203,0.32716473826850506,0.334117492319521,0.334117492319521,0.3456314920490284,0.34333602495581456,0.35477697711039125,0.334117492319521,0.334117492319521,0.4062672980405093,0.4957985125670452,0.5800967406106492,0.6925715000677044,0.8284346202194008,0.9465607483396898,1.0,0.9779737148370964,0.8528463111218912,0.6925715000677044,0.518357117251287,0.26546113853132874,0.18057224500953173,0.09037444062714899,0.0,0.011611290892655711,0.014499569681678932,0.076621
28178823865,0.11749356439958623,0.15462379539892293,0.18314160533045887,0.21613343067782795,0.20858662883955462,0.21236496446001718,0.21613343067782795,0.20858662883955462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0’
0.0]

train = pd.DataFrame(np.array(train_list))

杨鸽分析 - 2: pd.DataFrame()函数解析

DataFrame是Python中Pandas库中的一种数据结构，它类似excel，是一种二维表

https://blog.csdn.net/tefuirnever/article/details/93708964

train.head()
train.info

test.head()
test.info

。。。。。。。。。。。。。。解析结束。。。。。。。。。。。

# DataFrame.drop does not modify the original frame; it returns a new
# DataFrame holding the data that remains after the removal.
# drop removes rows by default, so the columns to remove are named explicitly.
y_train = train['label']
x_train = train.drop(columns=['id', 'label'])
x_test = test.drop(columns=['id'])

x_train.info

x_test.info

y_train

0        0.0
1        0.0
2        2.0
3        0.0
4        2.0
        ... 
99995    0.0
99996    2.0
99997    3.0
99998    2.0
99999    0.0
Name: label, Length: 100000, dtype: float16

5. 模型训练

5.1 评估函数

评测公式（损失函数）： $abs\text{-}sum=\sum_{j=1}^{n}\sum_{i=1}^{4}\left|y_{i}-a_{i}\right|$

例如，某心跳信号类别为1，通过编码转成[0,1,0,0]，预测不同心跳信号概率为[0.1,0.7,0.1,0.1]，那么这个信号预测结果的abs-sum为 $\left|0.1-0\right|+\left|0.7-1\right|+\left|0.1-0\right|+\left|0.1-0\right|=0.6$

def abs_sum(y_pre,y_tru):
    """Return the abs-sum loss: the sum of absolute element-wise differences.

    Both inputs are converted to float64 arrays before subtracting, so
    low-precision inputs (e.g. float16) neither overflow nor lose precision
    while summing.  Inputs of any dimensionality are accepted, including
    1-D and empty arrays.

    Args:
        y_pre: array-like of the first operand (e.g. predicted probabilities).
        y_tru: array-like of the second operand (e.g. one-hot targets); must
            be broadcastable against ``y_pre``.

    Returns:
        The total ``sum(|y_pre - y_tru|)`` as a NumPy float64 scalar.
    """
    y_pre = np.asarray(y_pre, dtype=np.float64)
    y_tru = np.asarray(y_tru, dtype=np.float64)
    # ndarray.sum() reduces over every axis at once, whatever the rank.
    return np.abs(y_pre - y_tru).sum()

# Demo of an all-zero array with one row per test sample and 4 columns.
# NOTE(review): this rebinds the global name `test`, which until here referred
# to the preprocessed test DataFrame; x_test was already derived from it.
test = np.zeros((x_test.shape[0],4))
print(test)
test

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 ...
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]





array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

5.2 模型参数设置

def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train a 4-class LightGBM model with 5-fold CV and average its test predictions.

    For every fold a booster is trained on the training split with early
    stopping on the validation split, scored with ``abs_sum`` on the
    validation split, and used to predict ``test_x``.  The per-fold test
    probabilities are summed and finally divided by the number of folds.

    Args:
        clf: the ``lightgbm`` module (``clf.Dataset`` and ``clf.train`` are used).
        train_x: training features; must support positional ``.iloc`` indexing.
        train_y: training labels holding the class ids 0..3.
        test_x: test features to predict.
        clf_name: model tag used in the printed summary; only ``"lgb"`` is
            implemented.

    Returns:
        ndarray of shape ``(test_x.shape[0], 4)`` with the fold-averaged class
        probabilities.

    Raises:
        ValueError: if ``clf_name`` is not ``"lgb"``.
    """
    if clf_name != "lgb":
        # Without this guard an unknown tag only failed later with a NameError.
        raise ValueError("unsupported clf_name: %r (only 'lgb' is implemented)" % (clf_name,))

    folds = 5
    seed = 2021
    n_classes = 4

    # K-fold cross validation.  shuffle=True shuffles the samples before
    # splitting; a fixed random_state makes that shuffle, and therefore the
    # whole experiment, reproducible from run to run.
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    test = np.zeros((test_x.shape[0], n_classes))

    cv_scores = []

    # kf.split yields row positions, not index labels, so the labels are
    # indexed positionally through a plain array.
    labels = np.asarray(train_y)

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):

        print('************************************ {} ************************************'.format(str(i+1)))

        # Four pieces per fold: training features/labels and validation features/labels.
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        params = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class': n_classes,
            'num_leaves': 2 ** 5,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
            'learning_rate': 0.1,
            'seed': seed,
            # Only one thread-count alias is set: when 'nthread' was given as
            # well, LightGBM warned and used this value anyway.
            'n_jobs': 24,
            'verbose': -1,
        }

        # NOTE(review): newer LightGBM releases reportedly replace the
        # verbose_eval / early_stopping_rounds keywords with callbacks --
        # confirm against the installed version before upgrading.
        model = clf.train(params,
                          train_set=train_matrix,
                          valid_sets=valid_matrix,
                          num_boost_round=2000,
                          verbose_eval=100,
                          early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        # reshape(-1, 1): one label per row, row count inferred.
        val_y = np.array(val_y).reshape(-1, 1)
        print("val_y++++++:")
        print(val_y)
        # One-hot encode with a fixed width of n_classes so the target matrix
        # always matches val_pred, even if a fold happens to miss a class.
        val_y = np.eye(n_classes)[val_y.ravel().astype(int)]
        print("val_y++++++:")
        print(val_y)
        print("val_y++++++:")
        print('预测的概率矩阵为：')
        print(test_pred)
        test += test_pred
        score = abs_sum(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))

    # np.std returns the standard deviation of the array elements, e.g. for
    # a = np.array([[1, 2], [3, 4]]):
    #   np.std(a)          -> 1.118033988749895  (all elements)
    #   np.std(a, axis=0)  -> [1. 1.]            (per column)
    #   np.std(a, axis=1)  -> [0.5 0.5]          (per row)
    print("%s_score_std:" % clf_name, np.std(cv_scores))

    # Average the accumulated per-fold test predictions.
    test = test / kf.n_splits

    return test

def lgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the lightgbm module and return its test predictions."""
    return cv_model(lgb, x_train, y_train, x_test, "lgb")


# Train with cross validation and keep the fold-averaged test probabilities.
lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0525735
[200]	valid_0's multi_logloss: 0.0422444
[300]	valid_0's multi_logloss: 0.0407076
[400]	valid_0's multi_logloss: 0.0420398
Early stopping, best iteration is:
[289]	valid_0's multi_logloss: 0.0405457
val_y++++++:
[[2.]
 [0.]
 [2.]
 ...
 [0.]
 [2.]
 [2.]]
val_y++++++:
[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
val_y++++++:
预测的概率矩阵为：
[[9.99969791e-01 2.85197261e-05 1.00341946e-06 6.85357631e-07]
 [7.93287264e-05 7.69060914e-04 9.99151590e-01 2.00810971e-08]
 [5.75356884e-07 5.04051497e-08 3.15322414e-07 9.99999059e-01]
 ...
 [6.79267940e-02 4.30206297e-04 9.31640185e-01 2.81516302e-06]
 [9.99960477e-01 3.94098074e-05 8.34030725e-08 2.94638661e-08]
 [9.88705846e-01 2.14081630e-03 6.67418381e-03 2.47915423e-03]]
[607.0736049372185]
************************************ 2 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0566626
[200]	valid_0's multi_logloss: 0.0450852
[300]	valid_0's multi_logloss: 0.044078
[400]	valid_0's multi_logloss: 0.0455546
Early stopping, best iteration is:
[275]	valid_0's multi_logloss: 0.0437793
val_y++++++:
[[2.]
 [3.]
 [3.]
 ...
 [0.]
 [0.]
 [0.]]
val_y++++++:
[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
val_y++++++:
预测的概率矩阵为：
[[9.99991401e-01 7.69109547e-06 6.65504756e-07 2.42084688e-07]
 [5.72380482e-05 1.32812809e-03 9.98614607e-01 2.66534396e-08]
 [2.82123411e-06 4.13195205e-07 1.34026965e-06 9.99995425e-01]
 ...
 [6.96398024e-02 6.52459907e-04 9.29685742e-01 2.19960932e-05]
 [9.99972366e-01 2.75069005e-05 7.68142933e-08 5.07415018e-08]
 [9.67263676e-01 7.26154408e-03 2.41533542e-02 1.32142531e-03]]
[607.0736049372185, 623.4313863731124]
************************************ 3 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0498722
[200]	valid_0's multi_logloss: 0.038028
[300]	valid_0's multi_logloss: 0.0358066
[400]	valid_0's multi_logloss: 0.0361478
[500]	valid_0's multi_logloss: 0.0379597
Early stopping, best iteration is:
[340]	valid_0's multi_logloss: 0.0354344
val_y++++++:
[[0.]
 [2.]
 [0.]
 ...
 [2.]
 [3.]
 [0.]]
val_y++++++:
[[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]
val_y++++++:
预测的概率矩阵为：
[[9.99972032e-01 2.62406774e-05 1.17282152e-06 5.54230651e-07]
 [1.05242811e-05 6.50215805e-05 9.99924453e-01 6.93812546e-10]
 [1.93240868e-06 1.10384984e-07 3.76773426e-07 9.99997580e-01]
 ...
 [1.34894410e-02 3.84569683e-05 9.86471555e-01 5.46564350e-07]
 [9.99987431e-01 1.25532882e-05 1.03902298e-08 5.46727770e-09]
 [9.78722948e-01 1.06329839e-02 6.94192038e-03 3.70214810e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535]
************************************ 4 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0564768
[200]	valid_0's multi_logloss: 0.0448698
[300]	valid_0's multi_logloss: 0.0446719
[400]	valid_0's multi_logloss: 0.0470399
Early stopping, best iteration is:
[250]	valid_0's multi_logloss: 0.0438853
val_y++++++:
[[0.]
 [0.]
 [0.]
 ...
 [3.]
 [0.]
 [1.]]
val_y++++++:
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]
val_y++++++:
预测的概率矩阵为：
[[9.99979692e-01 1.70821979e-05 1.27048476e-06 1.95571841e-06]
 [5.66207785e-05 4.02275314e-04 9.99541086e-01 1.82828519e-08]
 [2.62267451e-06 3.58613522e-07 4.78645006e-06 9.99992232e-01]
 ...
 [4.56636552e-02 5.69497433e-04 9.53758468e-01 8.37980573e-06]
 [9.99896785e-01 1.02796802e-04 2.46636563e-07 1.72061021e-07]
 [8.70911669e-01 1.73790185e-02 1.04478175e-01 7.23113697e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266]
************************************ 5 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0506398
[200]	valid_0's multi_logloss: 0.0396422
[300]	valid_0's multi_logloss: 0.0381065
[400]	valid_0's multi_logloss: 0.0390162
[500]	valid_0's multi_logloss: 0.0414986
Early stopping, best iteration is:
[324]	valid_0's multi_logloss: 0.0379497
val_y++++++:
[[2.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
val_y++++++:
[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
val_y++++++:
预测的概率矩阵为：
[[9.99993352e-01 6.02902202e-06 1.13002685e-07 5.06277302e-07]
 [1.03959552e-05 5.03778956e-04 9.99485820e-01 5.07638601e-09]
 [1.92568065e-07 5.07155306e-08 4.94690856e-08 9.99999707e-01]
 ...
 [8.83103121e-03 2.51969353e-05 9.91142776e-01 9.96143937e-07]
 [9.99984791e-01 1.51997858e-05 5.62426491e-09 3.80450197e-09]
 [9.86084001e-01 8.75968498e-04 1.09742304e-02 2.06580027e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266, 539.2160054696064]
lgb_scotrainre_list: [607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266, 539.2160054696064]
lgb_score_mean: 587.6463107214719
lgb_score_std: 55.944536405714565

5.3 输出

# Copy the four predicted class-probability columns into the sample
# submission table and save it as the baseline submission.
result = pd.read_csv('sample_submit.csv')
temp = pd.DataFrame(lgb_test)
for class_id in range(4):
    result['label_%d' % class_id] = temp[class_id]
result.to_csv('submit_baseline.csv', index=False)

# Read the saved file back for inspection and further post-processing.
submit_data = pd.read_csv('submit_baseline.csv')

submit_data

	id	label_0	label_1	label_2	label_3
0	100000	0.999981	1.711254e-05	8.450466e-07	7.887337e-07
1	100001	0.000043	6.136530e-04	9.993435e-01	1.415752e-08
2	100002	0.000002	1.966629e-07	1.373657e-06	9.999968e-01
3	100003	0.999970	1.909713e-05	1.097002e-05	3.576703e-08
4	100004	0.999983	1.769712e-06	1.482817e-05	1.966254e-07
...	...	...	...	...	...
19995	119995	0.998096	3.060176e-04	1.085313e-04	1.489757e-03
19996	119996	0.999846	1.436305e-04	1.074898e-05	8.837766e-08
19997	119997	0.041110	3.431635e-04	9.585397e-01	6.946754e-06
19998	119998	0.999960	3.949332e-05	8.457368e-08	5.230763e-08
19999	119999	0.958338	7.658066e-03	3.064437e-02	3.359933e-03

20000 rows × 5 columns

# NOTE(review): at this point submit_data still holds the raw probabilities read
# from submit_baseline.csv; the 0.9 thresholding is only applied further down.
submit_data.to_csv('submit_baseline_v1.csv',index=False)

baseline_v1_outputInt:score:502.5345

# v1 post-processing: when the largest class probability of a row exceeds the
# threshold, snap that row to hard 0/1 values (1 for the class above the
# threshold, 0 for the rest); less confident rows keep their raw probabilities.
# Done column-wise on the four probability columns (positions 1..4, after
# `id`) instead of a per-row iterrows loop with mixed label/position indexing.
threshold = 0.9
prob_cols = submit_data.columns[1:5]
probs = submit_data[prob_cols]
confident = probs.max(axis=1) > threshold
submit_data.loc[confident, prob_cols] = (
    (probs.loc[confident] > threshold).astype(float).to_numpy()
)

# Save the post-processed table: the earlier write of this file happened
# before the thresholding, so it did not contain the 0/1 values.
submit_data.to_csv('submit_baseline_v1.csv', index=False)
submit_data

	id	label_0	label_1	label_2	label_3
0	100000	1.0	0.0	0.0	0.0
1	100001	0.0	0.0	1.0	0.0
2	100002	0.0	0.0	0.0	1.0
3	100003	1.0	0.0	0.0	0.0
4	100004	1.0	0.0	0.0	0.0
...	...	...	...	...	...
19995	119995	1.0	0.0	0.0	0.0
19996	119996	1.0	0.0	0.0	0.0
19997	119997	0.0	0.0	1.0	0.0
19998	119998	1.0	0.0	0.0	0.0
19999	119999	1.0	0.0	0.0	0.0