这是我们本次算法实践进阶数据的下载地址 https://pan.baidu.com/s/1wO9qJRjnrm8uhaSP67K0lw
说明:这份数据集是金融数据(非原始数据,已经处理过了),我们要做的是预测贷款用户是否会逾期。表格中 “status” 是结果标签:0表示未逾期,1表示逾期。
数据类型转换和缺失值处理(尝试不同的填充看效果)以及其他你能借鉴的数据探索。
import pandas as pd
# Load the raw table and drop the columns the analysis treats as irrelevant.
data_all = (
    pd.read_csv('data.csv')
    .drop(columns=['custid', 'trade_no', 'bank_card_no', 'id_name'])
)
# Remove constant columns from the feature frame: a column holding a single
# distinct value cannot help separate the two classes.  (The original comment
# read "remove duplicate data", but the code drops single-valued columns, not
# duplicated rows.)
X = data_all.drop(labels='status', axis=1)
# len(unique()) counts NaN as a value, so a column that is all-NaN is also
# reported as constant.
L = [col for col in X if len(X[col].unique()) == 1]
X = X.drop(columns=L)
# NOTE(review): only X is reduced here; data_all keeps the constant columns
# and the later steps in this script read data_all, not X -- confirm whether
# the columns in L should be dropped from data_all as well.
# Report the number of missing values in every column.
print(data_all.isna().sum())
Unnamed: 0 0
low_volume_percent 2
middle_volume_percent 2
take_amount_in_later_12_month_highest 0
trans_amount_increase_rate_lately 3
trans_activity_month 2
trans_activity_day 2
transd_mcc 2
trans_days_interval_filter 8
trans_days_interval 2
regional_mobility 2
student_feature 2998
repayment_capability 0
is_high_user 0
number_of_trans_from_2011 2
first_transaction_time 2
historical_trans_amount 0
historical_trans_day 2
rank_trad_1_month 2
trans_amount_3_month 0
avg_consume_less_12_valid_month 2
abs 0
top_trans_count_last_1_month 2
avg_price_last_12_month 0
avg_price_top_last_12_valid_month 104
reg_preference_for_trad 2
trans_top_time_last_1_month 8
trans_top_time_last_6_month 8
consume_top_time_last_1_month 8
consume_top_time_last_6_month 8
...
loans_credibility_behavior 297
loans_count 297
loans_settle_count 297
loans_overdue_count 297
loans_org_count_behavior 297
consfin_org_count_behavior 297
loans_cash_count 297
latest_one_month_loan 297
latest_three_month_loan 297
latest_six_month_loan 297
history_suc_fee 297
history_fail_fee 297
latest_one_month_suc 297
latest_one_month_fail 297
loans_long_time 297
loans_latest_time 297
loans_credit_limit 297
loans_credibility_limit 297
loans_org_count_current 297
loans_product_count 297
loans_max_limit 297
loans_avg_limit 297
consfin_credit_limit 297
consfin_credibility 297
consfin_org_count_current 297
consfin_product_count 297
consfin_max_limit 297
consfin_avg_limit 297
latest_query_day 304
loans_latest_day 297
Length: 86, dtype: int64
# Candidate missing-value strategies to compare.  fillna()/dropna() return a
# new frame instead of modifying data_all, so each result is bound to its own
# name; the original calls discarded the results and therefore had no effect.
# data_all itself is deliberately left unchanged so the steps below see the
# same data as before.

# Strategy 1: fill every missing value with 0.
data_fill_zero = data_all.fillna(0)

# Strategy 2: fill with the column mean.  numeric_only=True is required
# because text columns (e.g. 'reg_preference_for_trad') are still present at
# this point and pandas >= 2.0 raises TypeError when averaging them.
data_fill_mean = data_all.fillna(data_all.mean(numeric_only=True))

# Strategy 3: drop every row that contains at least one missing value.
data_dropna = data_all.dropna()
# Encode the Chinese category labels as integers.  Labels that are not keys
# of the mapping come out of Series.map as NaN.
region_codes = {'境外': 0, '一线城市': 1, '二线城市': 2, '三线城市': 3}
data_all['reg_preference_for_trad'] = (
    data_all['reg_preference_for_trad'].map(region_codes)
)
# Break each timestamp column into separate year / month / day features,
# then discard the raw timestamp columns.
for time_col in ('latest_query_time', 'loans_latest_time'):
    data_all[time_col] = pd.to_datetime(data_all[time_col])
    parts = data_all[time_col].dt
    data_all[time_col + '_year'] = parts.year
    data_all[time_col + '_month'] = parts.month
    data_all[time_col + '_day'] = parts.day
data_all.drop(columns=['latest_query_time', 'loans_latest_time'], inplace=True)
# Fill missing date parts with the most frequent value (mode) of each column.
# (Missing values are commonly handled in one of three ways: delete, impute,
# or ignore; this step imputes.)
for date_col in (
    'latest_query_time_year',
    'latest_query_time_month',
    'latest_query_time_day',
    'loans_latest_time_year',
    'loans_latest_time_month',
    'loans_latest_time_day',
):
    # Series.mode() returns a Series (one entry per tied mode), not a scalar.
    # Passing that Series to fillna aligns on the index, so only the rows
    # labelled 0..k-1 could ever be filled.  Take the first mode as a scalar.
    modes = data_all[date_col].mode()
    if modes.empty:
        # Column is entirely NaN: there is no mode to impute with.
        continue
    # Assign back instead of calling fillna(inplace=True) on a column
    # selection, which may operate on a temporary copy.
    data_all[date_col] = data_all[date_col].fillna(modes.iloc[0])
from sklearn.model_selection import train_test_split
# Build the feature matrix and label vector as NumPy arrays.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
# replacement.
x = data_all.drop(columns=["status"]).to_numpy()
# Selecting the single column as a Series yields a 1-D array directly, which
# is what the previous as_matrix() + ravel() combination produced.
y = data_all["status"].to_numpy()
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=2018
)