# 2021.05.02
# v1: post-process the output so that probabilities > 0.9 are mapped to 1
# Original write-up; likes appreciated
Open questions:
I haven't yet had time to study LightGBM's parameters and how it works internally.
Datawhale March topic: heartbeat signal classification (materials link)
import os
import gc
import math
import pandas as pd
import numpy as np
import lightgbm as lgb
#import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('train.csv')
test = pd.read_csv('testA.csv')
train.head()
|   | id | heartbeat_signals | label |
|---|---|---|---|
| 0 | 0 | 0.9912297987616655,0.9435330436439665,0.764677... | 0.0 |
| 1 | 1 | 0.9714822034884503,0.9289687459588268,0.572932... | 0.0 |
| 2 | 2 | 1.0,0.9591487564065292,0.7013782792997189,0.23... | 2.0 |
| 3 | 3 | 0.9757952826275774,0.9340884687738161,0.659636... | 0.0 |
| 4 | 4 | 0.0,0.055816398940721094,0.26129357194994196,0... | 2.0 |
test.head()
train.info()
We can see there are 3 columns: id, heartbeat_signals, and label,
with 100,000 rows of training data.
Memory-reduction function: prepared in advance; handy when a competition imposes memory limits.
def reduce_mem_usage(df):
start_mem = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    # Downcast each numeric column to the smallest dtype that can hold its value range.
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else:
df[col] = df[col].astype('category')
end_mem = df.memory_usage().sum() / 1024**2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
return df
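For intuition about the range checks above, np.iinfo and np.finfo expose the limits each candidate dtype can hold. A minimal illustration (not part of the baseline):

# The dtype limits that reduce_mem_usage compares against.
print(np.iinfo(np.int8).min, np.iinfo(np.int8).max)    # -128 127
print(np.iinfo(np.int16).min, np.iinfo(np.int16).max)  # -32768 32767
print(np.finfo(np.float16).max)                        # 65504.0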
# Simple preprocessing
train_list = []
for items in train.values:
train_list.append([items[0]] + [float(i) for i in items[1].split(',')] + [items[2]])
train = pd.DataFrame(np.array(train_list))
train.columns = ['id'] + ['s_'+str(i) for i in range(len(train_list[0])-2)] + ['label']
train = reduce_mem_usage(train)
test_list = []
for items in test.values:
test_list.append([items[0]] + [float(i) for i in items[1].split(',')])
test = pd.DataFrame(np.array(test_list))
test.columns = ['id'] + ['s_'+str(i) for i in range(len(test_list[0])-1)]
test = reduce_mem_usage(test)
Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%
.............. Code walkthrough ..............
for items in train.values:
train_list.append([items[0]] + [float(i) for i in items[1].split(',')] + [items[2]])
- Let's look at what train.values contains.
- The computer attaches no meaning to it: each row is an array, so it reads [[row 0 values], [row 1 values], ... [row 99999 values]].
- From row 0's values we can see the 3 fields: items[0] is the id (here 0), items[2] is the trailing label (0.0), and the middle item items[1] is the signal string, which .split(',') breaks apart at every comma.
https://www.runoob.com/python/att-dictionary-items.html
[0
 '0.9912297987616655,0.9435330436439665,0.7646772997256593,0.6185708990212999,0.3796321642826237,...,0.21236496446001718,0.21613343067782795,0.20858662883955462,0.0,0.0,...,0.0'
 0.0]
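As an aside, the same expansion can be done without an explicit Python loop. A sketch (assuming train still holds the raw heartbeat_signals string column; equivalent result up to dtypes):

# Vectorized alternative to the row loop above (not used in this baseline).
signals = train['heartbeat_signals'].str.split(',', expand=True).astype(np.float32)
signals.columns = ['s_' + str(i) for i in range(signals.shape[1])]
train_wide = pd.concat([train[['id']], signals, train[['label']]], axis=1)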
train = pd.DataFrame(np.array(train_list))
A DataFrame is a data structure in Python's pandas library; it is like an Excel sheet: a two-dimensional table.
https://blog.csdn.net/tefuirnever/article/details/93708964
train.head()
train.info()
test.head()
test.info()
.............. End of walkthrough ..............
# A smarter way to delete a row or column is drop: it doesn't change the data in the original df,
# but returns a new DataFrame holding the data after the deletion.
# In other words, it opens a new table.
# drop removes rows by default; to remove a column, pass axis=1.
x_train = train.drop(['id', 'label'], axis=1)
y_train = train['label']
x_test = test.drop(['id'], axis=1)
x_train.info()
x_test.info()
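A tiny demonstration (toy frame, hypothetical column names) that drop returns a new object and leaves the original untouched:

# drop does not mutate in place unless inplace=True is passed.
tmp = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
dropped = tmp.drop(['b'], axis=1)
print(dropped.columns.tolist(), tmp.columns.tolist())  # ['a'] ['a', 'b']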
y_train
0 0.0
1 0.0
2 2.0
3 0.0
4 2.0
...
99995 0.0
99996 2.0
99997 3.0
99998 2.0
99999 0.0
Name: label, Length: 100000, dtype: float16
Evaluation metric (loss function):

$$\text{abs-sum}=\sum_{j=1}^{n}\sum_{i=1}^{4}\left|y_i-a_i\right|$$
For example, if a heartbeat signal belongs to class 1, one-hot encoded as [0,1,0,0], and the predicted class probabilities are [0.1,0.7,0.1,0.1], then this signal's abs-sum is

$$\text{abs-sum}=\left|0.1-0\right|+\left|0.7-1\right|+\left|0.1-0\right|+\left|0.1-0\right|=0.6$$
def abs_sum(y_pre, y_tru):
    # Sum of absolute differences across all samples and all 4 classes.
    y_pre = np.array(y_pre)
    y_tru = np.array(y_tru)
    loss = sum(sum(abs(y_pre - y_tru)))
    return loss
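A quick sanity check of abs_sum against the worked example above (values taken from the example, not from the data):

# Class 1, one-hot encoded, vs. the predicted probabilities from the example.
y_true = np.array([[0, 1, 0, 0]])
y_pred = np.array([[0.1, 0.7, 0.1, 0.1]])
print(abs_sum(y_pred, y_true))  # ≈ 0.6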
test = np.zeros((x_test.shape[0], 4))  # placeholder like the one cv_model fills (note: this rebinds the name `test`)
print(test)
test
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
...
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
...,
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
def cv_model(clf, train_x, train_y, test_x, clf_name):
folds = 5
seed = 2021
    # k-fold cross-validation with KFold
    # n_splits: number of folds
    # shuffle: shuffle the data before splitting; together the folds cover the entire training set
    # random_state: fixes the random state. For processes that are inherently random, controlling
    # the randomness is what lets us reproduce the same results; without it, every run would differ.
    # A loose analogy: grabbing sand from a container. Each grab depends on many factors
    # (force, one hand or two, wet or dry hands, ...; call them A={a,b,c,d,e,f,...}).
    # Fixing random_state pins those factors to one specific combination, so every grab is identical.
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
test = np.zeros((test_x.shape[0],4))
cv_scores = []
    # sparse=False makes the encoder return a dense array directly, so no toarray() is needed
    # (the default, sparse=True, returns a sparse matrix)
onehot_encoder = OneHotEncoder(sparse=False)
    # train_index and valid_index are arrays of integer positions, similar to pointers.
    # kf.split(train_x, train_y) yields matching index arrays, so the same positions select
    # rows from both train_x and train_y, keeping features and labels aligned.
for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
print('************************************ {} ************************************'.format(str(i+1)))
        # .iloc selects rows by position; i runs over [0, folds).
        # This pulls out four pieces: training x, training y, validation x, validation y.
trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
if clf_name == "lgb":
train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)
params = {
'boosting_type': 'gbdt',
'objective': 'multiclass',
'num_class': 4,
'num_leaves': 2 ** 5,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 4,
'learning_rate': 0.1,
'seed': seed,
                'nthread': 28,  # overridden by n_jobs (see the LightGBM warning in the logs)
                'n_jobs': 24,
'verbose': -1,
}
model = clf.train(params,
train_set=train_matrix,
valid_sets=valid_matrix,
num_boost_round=2000,
verbose_eval=100,
early_stopping_rounds=200)
val_pred = model.predict(val_x, num_iteration=model.best_iteration)
test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        # .reshape(-1, 1): one column; -1 lets numpy infer the number of rows
val_y=np.array(val_y).reshape(-1, 1)
print("val_y++++++:")
print(val_y)
val_y = onehot_encoder.fit_transform(val_y)
print("val_y++++++:")
print(val_y)
print("val_y++++++:")
        print('Predicted probability matrix:')
print(test_pred)
test += test_pred
score=abs_sum(val_y, val_pred)
cv_scores.append(score)
print(cv_scores)
print("%s_scotrainre_list:" % clf_name, cv_scores)
print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    # np.std returns the standard deviation of the array elements, e.g.:
    # a = np.array([[1, 2], [3, 4]])
    # np.std(a)          # overall std: 1.118033988749895
    # np.std(a, axis=0)  # per-column std: [1. 1.] (column means are 2 and 3,
    #                    # so e.g. ((1-2)^2 + (3-2)^2) / 2 = variance of column 1)
    # np.std(a, axis=1)  # per-row std: [0.5 0.5]
print("%s_score_std:" % clf_name, np.std(cv_scores))
    test = test / kf.n_splits  # average the accumulated predictions over the folds
return test
def lgb_model(x_train, y_train, x_test):
lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
return lgb_test
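A minimal, self-contained sketch (toy indices, not the competition data) of why fixing random_state matters: the same seed reproduces the same splits on every run.

# KFold with a fixed random_state yields identical folds across runs.
demo_kf = KFold(n_splits=3, shuffle=True, random_state=2021)
for fold, (tr_idx, va_idx) in enumerate(demo_kf.split(np.arange(6))):
    print(fold, tr_idx, va_idx)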
lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0525735
[200] valid_0's multi_logloss: 0.0422444
[300] valid_0's multi_logloss: 0.0407076
[400] valid_0's multi_logloss: 0.0420398
Early stopping, best iteration is:
[289] valid_0's multi_logloss: 0.0405457
val_y++++++:
[[2.]
[0.]
[2.]
...
[0.]
[2.]
[2.]]
val_y++++++:
[[0. 0. 1. 0.]
[1. 0. 0. 0.]
[0. 0. 1. 0.]
...
[1. 0. 0. 0.]
[0. 0. 1. 0.]
[0. 0. 1. 0.]]
val_y++++++:
Predicted probability matrix:
[[9.99969791e-01 2.85197261e-05 1.00341946e-06 6.85357631e-07]
[7.93287264e-05 7.69060914e-04 9.99151590e-01 2.00810971e-08]
[5.75356884e-07 5.04051497e-08 3.15322414e-07 9.99999059e-01]
...
[6.79267940e-02 4.30206297e-04 9.31640185e-01 2.81516302e-06]
[9.99960477e-01 3.94098074e-05 8.34030725e-08 2.94638661e-08]
[9.88705846e-01 2.14081630e-03 6.67418381e-03 2.47915423e-03]]
[607.0736049372185]
************************************ 2 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0566626
[200] valid_0's multi_logloss: 0.0450852
[300] valid_0's multi_logloss: 0.044078
[400] valid_0's multi_logloss: 0.0455546
Early stopping, best iteration is:
[275] valid_0's multi_logloss: 0.0437793
val_y++++++:
[[2.]
[3.]
[3.]
...
[0.]
[0.]
[0.]]
val_y++++++:
[[0. 0. 1. 0.]
[0. 0. 0. 1.]
[0. 0. 0. 1.]
...
[1. 0. 0. 0.]
[1. 0. 0. 0.]
[1. 0. 0. 0.]]
val_y++++++:
Predicted probability matrix:
[[9.99991401e-01 7.69109547e-06 6.65504756e-07 2.42084688e-07]
[5.72380482e-05 1.32812809e-03 9.98614607e-01 2.66534396e-08]
[2.82123411e-06 4.13195205e-07 1.34026965e-06 9.99995425e-01]
...
[6.96398024e-02 6.52459907e-04 9.29685742e-01 2.19960932e-05]
[9.99972366e-01 2.75069005e-05 7.68142933e-08 5.07415018e-08]
[9.67263676e-01 7.26154408e-03 2.41533542e-02 1.32142531e-03]]
[607.0736049372185, 623.4313863731124]
************************************ 3 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0498722
[200] valid_0's multi_logloss: 0.038028
[300] valid_0's multi_logloss: 0.0358066
[400] valid_0's multi_logloss: 0.0361478
[500] valid_0's multi_logloss: 0.0379597
Early stopping, best iteration is:
[340] valid_0's multi_logloss: 0.0354344
val_y++++++:
[[0.]
[2.]
[0.]
...
[2.]
[3.]
[0.]]
val_y++++++:
[[1. 0. 0. 0.]
[0. 0. 1. 0.]
[1. 0. 0. 0.]
...
[0. 0. 1. 0.]
[0. 0. 0. 1.]
[1. 0. 0. 0.]]
val_y++++++:
Predicted probability matrix:
[[9.99972032e-01 2.62406774e-05 1.17282152e-06 5.54230651e-07]
[1.05242811e-05 6.50215805e-05 9.99924453e-01 6.93812546e-10]
[1.93240868e-06 1.10384984e-07 3.76773426e-07 9.99997580e-01]
...
[1.34894410e-02 3.84569683e-05 9.86471555e-01 5.46564350e-07]
[9.99987431e-01 1.25532882e-05 1.03902298e-08 5.46727770e-09]
[9.78722948e-01 1.06329839e-02 6.94192038e-03 3.70214810e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535]
************************************ 4 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0564768
[200] valid_0's multi_logloss: 0.0448698
[300] valid_0's multi_logloss: 0.0446719
[400] valid_0's multi_logloss: 0.0470399
Early stopping, best iteration is:
[250] valid_0's multi_logloss: 0.0438853
val_y++++++:
[[0.]
[0.]
[0.]
...
[3.]
[0.]
[1.]]
val_y++++++:
[[1. 0. 0. 0.]
[1. 0. 0. 0.]
[1. 0. 0. 0.]
...
[0. 0. 0. 1.]
[1. 0. 0. 0.]
[0. 1. 0. 0.]]
val_y++++++:
Predicted probability matrix:
[[9.99979692e-01 1.70821979e-05 1.27048476e-06 1.95571841e-06]
[5.66207785e-05 4.02275314e-04 9.99541086e-01 1.82828519e-08]
[2.62267451e-06 3.58613522e-07 4.78645006e-06 9.99992232e-01]
...
[4.56636552e-02 5.69497433e-04 9.53758468e-01 8.37980573e-06]
[9.99896785e-01 1.02796802e-04 2.46636563e-07 1.72061021e-07]
[8.70911669e-01 1.73790185e-02 1.04478175e-01 7.23113697e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266]
************************************ 5 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0506398
[200] valid_0's multi_logloss: 0.0396422
[300] valid_0's multi_logloss: 0.0381065
[400] valid_0's multi_logloss: 0.0390162
[500] valid_0's multi_logloss: 0.0414986
Early stopping, best iteration is:
[324] valid_0's multi_logloss: 0.0379497
val_y++++++:
[[2.]
[0.]
[0.]
...
[0.]
[0.]
[0.]]
val_y++++++:
[[0. 0. 1. 0.]
[1. 0. 0. 0.]
[1. 0. 0. 0.]
...
[1. 0. 0. 0.]
[1. 0. 0. 0.]
[1. 0. 0. 0.]]
val_y++++++:
Predicted probability matrix:
[[9.99993352e-01 6.02902202e-06 1.13002685e-07 5.06277302e-07]
[1.03959552e-05 5.03778956e-04 9.99485820e-01 5.07638601e-09]
[1.92568065e-07 5.07155306e-08 4.94690856e-08 9.99999707e-01]
...
[8.83103121e-03 2.51969353e-05 9.91142776e-01 9.96143937e-07]
[9.99984791e-01 1.51997858e-05 5.62426491e-09 3.80450197e-09]
[9.86084001e-01 8.75968498e-04 1.09742304e-02 2.06580027e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266, 539.2160054696064]
lgb_score_list: [607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266, 539.2160054696064]
lgb_score_mean: 587.6463107214719
lgb_score_std: 55.944536405714565
temp = pd.DataFrame(lgb_test)
result = pd.read_csv('sample_submit.csv')
result['label_0'] = temp[0]
result['label_1'] = temp[1]
result['label_2'] = temp[2]
result['label_3'] = temp[3]
result.to_csv('submit_baseline.csv', index=False)
submit_data = pd.read_csv('submit_baseline.csv')
submit_data
|   | id | label_0 | label_1 | label_2 | label_3 |
|---|---|---|---|---|---|
| 0 | 100000 | 0.999981 | 1.711254e-05 | 8.450466e-07 | 7.887337e-07 |
| 1 | 100001 | 0.000043 | 6.136530e-04 | 9.993435e-01 | 1.415752e-08 |
| 2 | 100002 | 0.000002 | 1.966629e-07 | 1.373657e-06 | 9.999968e-01 |
| 3 | 100003 | 0.999970 | 1.909713e-05 | 1.097002e-05 | 3.576703e-08 |
| 4 | 100004 | 0.999983 | 1.769712e-06 | 1.482817e-05 | 1.966254e-07 |
| ... | ... | ... | ... | ... | ... |
| 19995 | 119995 | 0.998096 | 3.060176e-04 | 1.085313e-04 | 1.489757e-03 |
| 19996 | 119996 | 0.999846 | 1.436305e-04 | 1.074898e-05 | 8.837766e-08 |
| 19997 | 119997 | 0.041110 | 3.431635e-04 | 9.585397e-01 | 6.946754e-06 |
| 19998 | 119998 | 0.999960 | 3.949332e-05 | 8.457368e-08 | 5.230763e-08 |
| 19999 | 119999 | 0.958338 | 7.658066e-03 | 3.064437e-02 | 3.359933e-03 |

20000 rows × 5 columns
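A small sanity check on the table above (the four probabilities of a multiclass model should sum to roughly 1 per row); a sketch, not part of the original baseline:

# Each row's four probabilities should sum to approximately 1.
row_sums = submit_data[['label_0', 'label_1', 'label_2', 'label_3']].sum(axis=1)
print(row_sums.min(), row_sums.max())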
# v1 post-processing: if a row's top probability exceeds 0.9, binarize the row
# (entries > 0.9 become 1, the rest 0); rows with no probability above 0.9 are left unchanged.
for index, row in submit_data.iterrows():
    row_max = max(list(row)[1:])  # max over the four label columns (position 0 is id)
    if row_max > 0.9:
        for i in range(1, 5):     # positions 1..4 are label_0..label_3
            if row[i] > 0.9:
                submit_data.iloc[index, i] = 1
            else:
                submit_data.iloc[index, i] = 0
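The same v1 post-processing can be written as a vectorized sketch (same 0.9 cutoff, equivalent to the loop above):

# Binarize confident rows in one shot instead of iterating.
prob_cols = ['label_0', 'label_1', 'label_2', 'label_3']
confident = submit_data[prob_cols].max(axis=1) > 0.9
submit_data.loc[confident, prob_cols] = (submit_data.loc[confident, prob_cols] > 0.9).astype(float)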
submit_data
|   | id | label_0 | label_1 | label_2 | label_3 |
|---|---|---|---|---|---|
| 0 | 100000 | 1.0 | 0.0 | 0.0 | 0.0 |
| 1 | 100001 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 100002 | 0.0 | 0.0 | 0.0 | 1.0 |
| 3 | 100003 | 1.0 | 0.0 | 0.0 | 0.0 |
| 4 | 100004 | 1.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... |
| 19995 | 119995 | 1.0 | 0.0 | 0.0 | 0.0 |
| 19996 | 119996 | 1.0 | 0.0 | 0.0 | 0.0 |
| 19997 | 119997 | 0.0 | 0.0 | 1.0 | 0.0 |
| 19998 | 119998 | 1.0 | 0.0 | 0.0 | 0.0 |
| 19999 | 119999 | 1.0 | 0.0 | 0.0 | 0.0 |

20000 rows × 5 columns
submit_data.to_csv('submit_baseline_v1.csv',index=False)