题目文件以及第四题、第五题的全部实现文件均已放在我的资源中。
任务1.1(1)首先,使用pandas导入数据并查看其详细信息,发现job、marital、education、default、housing、loan等字段存在缺失值;其次检查重复值:整行比较不存在重复,但单独查看‘user_id’发现存在56个重复值,使用drop_duplicates()删除这些重复值所在的行;然后使用dropna()删除缺失值所在的行;最后将处理好的数据保存到“result1_1.xlsx”中。
import pandas as pd
import numpy as np
import re
# Load the short-term customer data from CSV.
short_data1=pd.read_csv('short-customer-data.csv')
# Bare expression: a notebook renders the DataFrame when this is the last line of a cell.
short_data1
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | housemaid | married | postgraduate | no | no | no | telephone | may | mon | 261 | nonexistent | no |
1 | BA2200002 | 57 | services | married | high school | NaN | no | no | telephone | may | mon | 149 | nonexistent | no |
2 | BA2200077 | 37 | services | married | high school | no | yes | no | telephone | may | mon | 226 | nonexistent | no |
3 | BA2200004 | 40 | admin. | married | postgraduate | no | no | no | telephone | may | mon | 151 | nonexistent | no |
4 | BA2200005 | 56 | services | married | high school | no | no | yes | telephone | may | mon | 307 | nonexistent | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
41171 | BA2241172 | 73 | retired | married | junior college | no | yes | no | cellular | nov | fri | 334 | nonexistent | yes |
41172 | BA2241173 | 46 | blue-collar | married | junior college | no | no | no | cellular | nov | fri | 383 | nonexistent | no |
41173 | BA2241174 | 56 | retired | married | undergraduate | no | yes | no | cellular | nov | fri | 189 | nonexistent | no |
41174 | BA2241175 | 44 | technician | married | junior college | no | no | no | cellular | nov | fri | 442 | nonexistent | yes |
41175 | BA2241176 | 74 | retired | married | junior college | no | yes | no | cellular | nov | fri | 239 | failure | no |
41176 rows × 14 columns
# Print column names, non-null counts and dtypes to spot columns with missing values.
short_data1.info()
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 41176 non-null object
1 age 41176 non-null int64
2 job 40846 non-null object
3 marital 41096 non-null object
4 education 39446 non-null object
5 default 32580 non-null object
6 housing 40186 non-null object
7 loan 40186 non-null object
8 contact 41176 non-null object
9 month 41176 non-null object
10 day_of_week 41176 non-null object
11 duration 41176 non-null int64
12 poutcome 41176 non-null object
13 y 41176 non-null object
dtypes: int64(2), object(12)
memory usage: 4.4+ MB
# Count rows that are exact duplicates across all columns.
short_data1.duplicated().sum()
0
# Count repeated customer IDs: every occurrence of a 'user_id' after its first is a duplicate.
short_data1.duplicated(subset=['user_id']).sum()
56
# Keep only the first row seen for each 'user_id'; later repeats are dropped.
short_data1 = short_data1.drop_duplicates(subset='user_id', keep='first')
short_data1
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | housemaid | married | postgraduate | no | no | no | telephone | may | mon | 261 | nonexistent | no |
1 | BA2200002 | 57 | services | married | high school | NaN | no | no | telephone | may | mon | 149 | nonexistent | no |
2 | BA2200077 | 37 | services | married | high school | no | yes | no | telephone | may | mon | 226 | nonexistent | no |
3 | BA2200004 | 40 | admin. | married | postgraduate | no | no | no | telephone | may | mon | 151 | nonexistent | no |
4 | BA2200005 | 56 | services | married | high school | no | no | yes | telephone | may | mon | 307 | nonexistent | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
41170 | BA2241171 | 29 | unemployed | single | postgraduate | no | yes | no | cellular | nov | fri | 112 | success | no |
41172 | BA2241173 | 46 | blue-collar | married | junior college | no | no | no | cellular | nov | fri | 383 | nonexistent | no |
41173 | BA2241174 | 56 | retired | married | undergraduate | no | yes | no | cellular | nov | fri | 189 | nonexistent | no |
41174 | BA2241175 | 44 | technician | married | junior college | no | no | no | cellular | nov | fri | 442 | nonexistent | yes |
41175 | BA2241176 | 74 | retired | married | junior college | no | yes | no | cellular | nov | fri | 239 | failure | no |
41120 rows × 14 columns
# Sanity check: no repeated 'user_id' should remain (expect 0).
short_data1.duplicated(subset=['user_id']).sum()
0
# Per-column count of missing values.
short_data1.isna().sum()
user_id 0
age 0
job 328
marital 80
education 1726
default 8578
housing 988
loan 988
contact 0
month 0
day_of_week 0
duration 0
poutcome 0
y 0
dtype: int64
# Drop every row that has a missing value in any column.
short_data1 = short_data1.dropna(axis=0, how='any')
short_data1
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | housemaid | married | postgraduate | no | no | no | telephone | may | mon | 261 | nonexistent | no |
2 | BA2200077 | 37 | services | married | high school | no | yes | no | telephone | may | mon | 226 | nonexistent | no |
3 | BA2200004 | 40 | admin. | married | postgraduate | no | no | no | telephone | may | mon | 151 | nonexistent | no |
4 | BA2200005 | 56 | services | married | high school | no | no | yes | telephone | may | mon | 307 | nonexistent | no |
6 | BA2200007 | 59 | admin. | married | junior college | no | no | no | telephone | may | mon | 139 | nonexistent | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
41170 | BA2241171 | 29 | unemployed | single | postgraduate | no | yes | no | cellular | nov | fri | 112 | success | no |
41172 | BA2241173 | 46 | blue-collar | married | junior college | no | no | no | cellular | nov | fri | 383 | nonexistent | no |
41173 | BA2241174 | 56 | retired | married | undergraduate | no | yes | no | cellular | nov | fri | 189 | nonexistent | no |
41174 | BA2241175 | 44 | technician | married | junior college | no | no | no | cellular | nov | fri | 442 | nonexistent | yes |
41175 | BA2241176 | 74 | retired | married | junior college | no | yes | no | cellular | nov | fri | 239 | failure | no |
30444 rows × 14 columns
# Sanity check: every column should now report zero missing values.
short_data1.isna().sum()
user_id 0
age 0
job 0
marital 0
education 0
default 0
housing 0
loan 0
contact 0
month 0
day_of_week 0
duration 0
poutcome 0
y 0
dtype: int64
# Write the cleaned short-term data to Excel, omitting the index column.
short_data1.to_excel(excel_writer='result1_1.xlsx', index=False)
short_data1
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | housemaid | married | postgraduate | no | no | no | telephone | may | mon | 261 | nonexistent | no |
2 | BA2200077 | 37 | services | married | high school | no | yes | no | telephone | may | mon | 226 | nonexistent | no |
3 | BA2200004 | 40 | admin. | married | postgraduate | no | no | no | telephone | may | mon | 151 | nonexistent | no |
4 | BA2200005 | 56 | services | married | high school | no | no | yes | telephone | may | mon | 307 | nonexistent | no |
6 | BA2200007 | 59 | admin. | married | junior college | no | no | no | telephone | may | mon | 139 | nonexistent | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
41170 | BA2241171 | 29 | unemployed | single | postgraduate | no | yes | no | cellular | nov | fri | 112 | success | no |
41172 | BA2241173 | 46 | blue-collar | married | junior college | no | no | no | cellular | nov | fri | 383 | nonexistent | no |
41173 | BA2241174 | 56 | retired | married | undergraduate | no | yes | no | cellular | nov | fri | 189 | nonexistent | no |
41174 | BA2241175 | 44 | technician | married | junior college | no | no | no | cellular | nov | fri | 442 | nonexistent | yes |
41175 | BA2241176 | 74 | retired | married | junior college | no | yes | no | cellular | nov | fri | 239 | failure | no |
30444 rows × 14 columns
任务1.1.2
import pandas as pd
import numpy as np
import re
# Load the long-term customer training data and summarise its columns.
long_data1=pd.read_csv('long-customer-train.csv')
long_data1.info()
RangeIndex: 9300 entries, 0 to 9299
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerId 9300 non-null int64
1 CreditScore 9300 non-null int64
2 Gender 9300 non-null int64
3 Age 9300 non-null object
4 Tenure 9300 non-null int64
5 Balance 9300 non-null float64
6 NumOfProducts 9300 non-null int64
7 HasCrCard 9300 non-null int64
8 IsActiveMember 9300 non-null int64
9 EstimatedSalary 9300 non-null float64
10 Exited 9300 non-null int64
dtypes: float64(2), int64(8), object(1)
memory usage: 799.3+ KB
# Show the rows whose 'Age' is the placeholder string '-'.
long_data1.loc[long_data1['Age'].eq('-')]
CustomerId | CreditScore | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|
464 | 15570485 | 558 | 0 | - | 4 | 161766.87 | 1 | 0 | 0 | 92378.54 | 0 |
566 | 15573452 | 663 | 0 | - | 7 | 115930.87 | 1 | 1 | 0 | 19862.78 | 0 |
651 | 15576000 | 765 | 0 | - | 6 | 138033.55 | 1 | 1 | 1 | 67972.45 | 0 |
696 | 15577064 | 592 | 0 | - | 2 | 104702.65 | 2 | 1 | 0 | 107948.72 | 0 |
796 | 15580068 | 526 | 0 | - | 5 | 0.00 | 2 | 1 | 1 | 105618.14 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8176 | 15784161 | 583 | 0 | - | 8 | 102945.01 | 1 | 0 | 0 | 52861.89 | 0 |
8270 | 15787035 | 602 | 1 | - | 8 | 0.00 | 2 | 1 | 1 | 152843.53 | 0 |
8591 | 15795881 | 776 | 0 | - | 8 | 106365.29 | 1 | 1 | 1 | 148527.56 | 0 |
8776 | 15801062 | 557 | 1 | - | 4 | 0.00 | 2 | 0 | 1 | 105433.53 | 0 |
8794 | 15801417 | 657 | 0 | - | 4 | 82500.28 | 1 | 1 | 1 | 115260.72 | 0 |
78 rows × 11 columns
长期数据中的客户年龄“Age”列存在取值为-1、0和“-”的异常值,需要删除这些异常值所在的行。
这里先将这些异常值替换为空值(NaN);注意下面的代码还额外把取值为“1”的记录当作异常值处理。
# Mark the anomalous placeholder strings as missing so they can be removed with dropna().
# NOTE(review): '1' is blanked here too although the task text only lists -1, 0 and '-' — confirm this is intended.
abnormal_values = ['-1', '0', '-', '1']
long_data2 = long_data1.replace(to_replace=abnormal_values, value=np.nan)
long_data2.info()
RangeIndex: 9300 entries, 0 to 9299
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerId 9300 non-null int64
1 CreditScore 9300 non-null int64
2 Gender 9300 non-null int64
3 Age 9180 non-null object
4 Tenure 9300 non-null int64
5 Balance 9300 non-null float64
6 NumOfProducts 9300 non-null int64
7 HasCrCard 9300 non-null int64
8 IsActiveMember 9300 non-null int64
9 EstimatedSalary 9300 non-null float64
10 Exited 9300 non-null int64
dtypes: float64(2), int64(8), object(1)
memory usage: 799.3+ KB
然后用dropna()删除这些空值所在的行。
# Drop the rows whose anomalous values were blanked to NaN. The explicit .copy() makes
# long_data3 an independent DataFrame, so assigning to its columns afterwards does not
# trigger pandas' SettingWithCopyWarning.
long_data3 = long_data2.dropna().copy()
long_data3.info()
Int64Index: 9180 entries, 0 to 9299
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerId 9180 non-null int64
1 CreditScore 9180 non-null int64
2 Gender 9180 non-null int64
3 Age 9180 non-null object
4 Tenure 9180 non-null int64
5 Balance 9180 non-null float64
6 NumOfProducts 9180 non-null int64
7 HasCrCard 9180 non-null int64
8 IsActiveMember 9180 non-null int64
9 EstimatedSalary 9180 non-null float64
10 Exited 9180 non-null int64
dtypes: float64(2), int64(8), object(1)
memory usage: 860.6+ KB
# long_data3['age_new']=long_data3.Age.str.extract(r"(\d+)")
# #long_data3['new_Age']=long_data3.Age.str.replace('岁','')
# #long_data3[long_data3.age_new.str.contains(" ")]
# long_data3['age_new']=long_data3['age_new'].astype(int)
# long_data4=long_data3.drop(columns=['Age'])
# long_data4.rename(columns={'age_new':'Age'},inplace=True)
# long_data4.info()
“Age”列存在空格和“岁”等异常字符,需要删除这些异常字符但保留年龄数值,并将处理后的数值存于“Age”列。
这里通过正则表达式从含有异常字符的字符串中提取出年龄数字,保存回“Age”列,数据类型为int。
# Pull the first run of digits out of each 'Age' string (dropping spaces, the '岁' suffix,
# etc.) and store it as an integer.
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
age_digits = long_data3['Age'].str.extract(r"(\d+)", expand=False)
# assign() builds a new DataFrame instead of writing into long_data3 in place, which
# avoids the SettingWithCopyWarning that in-place column assignment raised here.
long_data3 = long_data3.assign(Age=age_digits.astype(int))
C:\Users\31214\AppData\Local\Temp\ipykernel_10860\3539377572.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
long_data3['Age']=long_data3.Age.str.extract(r"(\d+)")
C:\Users\31214\AppData\Local\Temp\ipykernel_10860\3539377572.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
long_data3['Age']=long_data3['Age'].astype(int)
# Save the cleaned long-term data to Excel without the index, then summarise it.
long_data3.to_excel(excel_writer='result1_2.xlsx', index=False)
long_data3.info()
Int64Index: 9180 entries, 0 to 9299
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerId 9180 non-null int64
1 CreditScore 9180 non-null int64
2 Gender 9180 non-null int64
3 Age 9180 non-null int32
4 Tenure 9180 non-null int64
5 Balance 9180 non-null float64
6 NumOfProducts 9180 non-null int64
7 HasCrCard 9180 non-null int64
8 IsActiveMember 9180 non-null int64
9 EstimatedSalary 9180 non-null float64
10 Exited 9180 non-null int64
dtypes: float64(2), int32(1), int64(8)
memory usage: 824.8 KB
# Display the cleaned long-term data.
long_data3
CustomerId | CreditScore | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 15553251 | 713 | 1 | 52 | 0 | 185891.54 | 1 | 1 | 1 | 46369.57 | 1 |
1 | 15553256 | 619 | 1 | 41 | 8 | 0.00 | 3 | 1 | 1 | 79866.73 | 1 |
2 | 15553283 | 603 | 1 | 42 | 8 | 91611.12 | 1 | 0 | 0 | 144675.30 | 1 |
3 | 15553308 | 589 | 1 | 61 | 1 | 0.00 | 1 | 1 | 0 | 61108.56 | 1 |
4 | 15553387 | 687 | 1 | 39 | 2 | 0.00 | 3 | 0 | 0 | 188150.60 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9295 | 15815628 | 711 | 1 | 37 | 8 | 113899.92 | 1 | 0 | 0 | 80215.20 | 0 |
9296 | 15815645 | 481 | 0 | 37 | 8 | 152303.66 | 2 | 1 | 1 | 175082.20 | 0 |
9297 | 15815656 | 541 | 1 | 39 | 9 | 100116.67 | 1 | 1 | 1 | 199808.10 | 1 |
9298 | 15815660 | 758 | 1 | 34 | 1 | 154139.45 | 1 | 1 | 1 | 60728.89 | 0 |
9299 | 15815690 | 614 | 1 | 40 | 3 | 113348.50 | 1 | 1 | 1 | 77789.01 | 0 |
9180 rows × 11 columns
任务1.2
import pandas as pd
# Reload the cleaned short-term data that was written to result1_1.xlsx in task 1.1.
short_data=pd.read_excel('result1_1.xlsx')
short_data
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | housemaid | married | postgraduate | no | no | no | telephone | may | mon | 261 | nonexistent | no |
1 | BA2200077 | 37 | services | married | high school | no | yes | no | telephone | may | mon | 226 | nonexistent | no |
2 | BA2200004 | 40 | admin. | married | postgraduate | no | no | no | telephone | may | mon | 151 | nonexistent | no |
3 | BA2200005 | 56 | services | married | high school | no | no | yes | telephone | may | mon | 307 | nonexistent | no |
4 | BA2200007 | 59 | admin. | married | junior college | no | no | no | telephone | may | mon | 139 | nonexistent | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
30439 | BA2241171 | 29 | unemployed | single | postgraduate | no | yes | no | cellular | nov | fri | 112 | success | no |
30440 | BA2241173 | 46 | blue-collar | married | junior college | no | no | no | cellular | nov | fri | 383 | nonexistent | no |
30441 | BA2241174 | 56 | retired | married | undergraduate | no | yes | no | cellular | nov | fri | 189 | nonexistent | no |
30442 | BA2241175 | 44 | technician | married | junior college | no | no | no | cellular | nov | fri | 442 | nonexistent | yes |
30443 | BA2241176 | 74 | retired | married | junior college | no | yes | no | cellular | nov | fri | 239 | failure | no |
30444 rows × 14 columns
对短期数据中的字符型数据进行特征编码,例如将信用违约情况{‘否’,‘是’}编码为{0,1}。
也就是说,必须把文字型数据转换为数值型数据。
这里采用sklearn.preprocessing.LabelEncoder对字符型取值进行编码。
短期客户数据中,使用LabelEncoder进行特征编码的字段包括{job,marital,education,default,housing,loan,contact,poutcome,y}。
from sklearn.preprocessing import LabelEncoder
# Work on a copy so the encoding steps below do not modify short_data.
short_data6=short_data.copy()
short_data6
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | housemaid | married | postgraduate | no | no | no | telephone | may | mon | 261 | nonexistent | no |
1 | BA2200077 | 37 | services | married | high school | no | yes | no | telephone | may | mon | 226 | nonexistent | no |
2 | BA2200004 | 40 | admin. | married | postgraduate | no | no | no | telephone | may | mon | 151 | nonexistent | no |
3 | BA2200005 | 56 | services | married | high school | no | no | yes | telephone | may | mon | 307 | nonexistent | no |
4 | BA2200007 | 59 | admin. | married | junior college | no | no | no | telephone | may | mon | 139 | nonexistent | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
30439 | BA2241171 | 29 | unemployed | single | postgraduate | no | yes | no | cellular | nov | fri | 112 | success | no |
30440 | BA2241173 | 46 | blue-collar | married | junior college | no | no | no | cellular | nov | fri | 383 | nonexistent | no |
30441 | BA2241174 | 56 | retired | married | undergraduate | no | yes | no | cellular | nov | fri | 189 | nonexistent | no |
30442 | BA2241175 | 44 | technician | married | junior college | no | no | no | cellular | nov | fri | 442 | nonexistent | yes |
30443 | BA2241176 | 74 | retired | married | junior college | no | yes | no | cellular | nov | fri | 239 | failure | no |
30444 rows × 14 columns
# Select the nine categorical columns to encode. LabelEncoder takes labels rather than
# a feature matrix, so one-dimensional input is accepted.
t1, t2, t3, t4, t5, t6, t7, t8, t9 = (
    short_data6.loc[:, column_name]
    for column_name in (
        'job', 'marital', 'education', 'default', 'housing',
        'loan', 'contact', 'poutcome', 'y',
    )
)
# Encode 'job': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le1 = LabelEncoder().fit(t1)
label1 = le1.transform(t1)
short_data6.loc[:, "job"] = label1
short_data6['job'].unique()
array([ 3, 7, 0, 9, 1, 10, 5, 2, 4, 8, 6])
# Encode 'marital': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le2 = LabelEncoder().fit(t2)
label2 = le2.transform(t2)
short_data6.loc[:, "marital"] = label2
short_data6['marital'].unique()
array([1, 2, 0])
# Encode 'education': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le3 = LabelEncoder().fit(t3)
label3 = le3.transform(t3)
short_data6.loc[:, "education"] = label3
short_data6['education'].unique()
array([3, 0, 2, 4, 1])
# Encode 'default': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le4 = LabelEncoder().fit(t4)
label4 = le4.transform(t4)
short_data6.loc[:, "default"] = label4
short_data6['default'].unique()
array([0, 1])
# Encode 'housing': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le5 = LabelEncoder().fit(t5)
label5 = le5.transform(t5)
short_data6.loc[:, "housing"] = label5
short_data6['housing'].unique()
array([0, 1])
# Encode 'loan': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le6 = LabelEncoder().fit(t6)
label6 = le6.transform(t6)
short_data6.loc[:, "loan"] = label6
short_data6['loan'].unique()
array([0, 1])
# Encode 'contact': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le7 = LabelEncoder().fit(t7)
label7 = le7.transform(t7)
short_data6.loc[:, "contact"] = label7
short_data6['contact'].unique()
array([1, 0])
# Encode 'poutcome': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le8 = LabelEncoder().fit(t8)
label8 = le8.transform(t8)
short_data6.loc[:, "poutcome"] = label8
short_data6['poutcome'].unique()
array([1, 0, 2])
# Encode the target 'y': fit a LabelEncoder on the column, then overwrite it with the integer codes.
le9 = LabelEncoder().fit(t9)
label9 = le9.transform(t9)
short_data6.loc[:, "y"] = label9
short_data6['y'].unique()
array([0, 1])
# Snapshot the label-encoded frame before month and day_of_week are encoded.
short_data7=short_data6.copy()
short_data7
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | 3 | 1 | 3 | 0 | 0 | 0 | 1 | may | mon | 261 | 1 | 0 |
1 | BA2200077 | 37 | 7 | 1 | 0 | 0 | 1 | 0 | 1 | may | mon | 226 | 1 | 0 |
2 | BA2200004 | 40 | 0 | 1 | 3 | 0 | 0 | 0 | 1 | may | mon | 151 | 1 | 0 |
3 | BA2200005 | 56 | 7 | 1 | 0 | 0 | 0 | 1 | 1 | may | mon | 307 | 1 | 0 |
4 | BA2200007 | 59 | 0 | 1 | 2 | 0 | 0 | 0 | 1 | may | mon | 139 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
30439 | BA2241171 | 29 | 10 | 2 | 3 | 0 | 1 | 0 | 0 | nov | fri | 112 | 2 | 0 |
30440 | BA2241173 | 46 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | nov | fri | 383 | 1 | 0 |
30441 | BA2241174 | 56 | 5 | 1 | 4 | 0 | 1 | 0 | 0 | nov | fri | 189 | 1 | 0 |
30442 | BA2241175 | 44 | 9 | 1 | 2 | 0 | 0 | 0 | 0 | nov | fri | 442 | 1 | 1 |
30443 | BA2241176 | 74 | 5 | 1 | 2 | 0 | 1 | 0 | 0 | nov | fri | 239 | 0 | 0 |
30444 rows × 14 columns
#short_data7.loc[short_data7['month']=='mar']['month'].unique()
# List the distinct month values present in the data.
short_data6['month'].unique()
array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'mar', 'apr',
'sep'], dtype=object)
上次拜访客户的月份(month)和星期(day_of_week)是有先后顺序的类别,LabelEncoder按取值的排序结果(字母序)编号会打乱这种顺序,因此这两列不使用LabelEncoder,而是手动赋值,例如:一月→1,星期一→1。
# Encode month abbreviations as integers 1-12 (jan=1 ... dec=12) so that calendar order
# is preserved. The mapping is applied to the 'month' column only, and it produces
# integer codes (not the strings '1'..'12'), matching how day_of_week is encoded.
# A value that is not a known abbreviation becomes NaN instead of passing through silently.
month_codes = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
short_data8 = short_data7.assign(month=short_data7['month'].map(month_codes))
short_data8
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | 3 | 1 | 3 | 0 | 0 | 0 | 1 | 5 | mon | 261 | 1 | 0 |
1 | BA2200077 | 37 | 7 | 1 | 0 | 0 | 1 | 0 | 1 | 5 | mon | 226 | 1 | 0 |
2 | BA2200004 | 40 | 0 | 1 | 3 | 0 | 0 | 0 | 1 | 5 | mon | 151 | 1 | 0 |
3 | BA2200005 | 56 | 7 | 1 | 0 | 0 | 0 | 1 | 1 | 5 | mon | 307 | 1 | 0 |
4 | BA2200007 | 59 | 0 | 1 | 2 | 0 | 0 | 0 | 1 | 5 | mon | 139 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
30439 | BA2241171 | 29 | 10 | 2 | 3 | 0 | 1 | 0 | 0 | 11 | fri | 112 | 2 | 0 |
30440 | BA2241173 | 46 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 11 | fri | 383 | 1 | 0 |
30441 | BA2241174 | 56 | 5 | 1 | 4 | 0 | 1 | 0 | 0 | 11 | fri | 189 | 1 | 0 |
30442 | BA2241175 | 44 | 9 | 1 | 2 | 0 | 0 | 0 | 0 | 11 | fri | 442 | 1 | 1 |
30443 | BA2241176 | 74 | 5 | 1 | 2 | 0 | 1 | 0 | 0 | 11 | fri | 239 | 0 | 0 |
30444 rows × 14 columns
# List the distinct day_of_week values present in the data.
short_data8['day_of_week'].unique()
array(['mon', 'tue', 'wed', 'thu', 'fri'], dtype=object)
# Encode weekday abbreviations as integers 1-5 (mon=1 ... fri=5). The mapping is applied
# to the 'day_of_week' column only, so a matching string in any other column cannot be
# rewritten by accident. A value that is not a known abbreviation becomes NaN.
weekday_codes = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5}
short_data9 = short_data8.assign(
    day_of_week=short_data8['day_of_week'].map(weekday_codes)
)
short_data9
user_id | age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BA2200001 | 56 | 3 | 1 | 3 | 0 | 0 | 0 | 1 | 5 | 1 | 261 | 1 | 0 |
1 | BA2200077 | 37 | 7 | 1 | 0 | 0 | 1 | 0 | 1 | 5 | 1 | 226 | 1 | 0 |
2 | BA2200004 | 40 | 0 | 1 | 3 | 0 | 0 | 0 | 1 | 5 | 1 | 151 | 1 | 0 |
3 | BA2200005 | 56 | 7 | 1 | 0 | 0 | 0 | 1 | 1 | 5 | 1 | 307 | 1 | 0 |
4 | BA2200007 | 59 | 0 | 1 | 2 | 0 | 0 | 0 | 1 | 5 | 1 | 139 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
30439 | BA2241171 | 29 | 10 | 2 | 3 | 0 | 1 | 0 | 0 | 11 | 5 | 112 | 2 | 0 |
30440 | BA2241173 | 46 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 11 | 5 | 383 | 1 | 0 |
30441 | BA2241174 | 56 | 5 | 1 | 4 | 0 | 1 | 0 | 0 | 11 | 5 | 189 | 1 | 0 |
30442 | BA2241175 | 44 | 9 | 1 | 2 | 0 | 0 | 0 | 0 | 11 | 5 | 442 | 1 | 1 |
30443 | BA2241176 | 74 | 5 | 1 | 2 | 0 | 1 | 0 | 0 | 11 | 5 | 239 | 0 | 0 |
30444 rows × 14 columns
# Export the fully encoded data without the index column. 'openpyxl' is an Excel writer
# engine, not a text encoding, so it is passed via engine=. The encoding= keyword of
# to_excel was deprecated in pandas 1.5 and removed in pandas 2.0, where it raises TypeError.
short_data9.to_excel("result1_3.xlsx", engine='openpyxl', index=False)