简单的违约特征探索

本文章数据截取自一家银行的真实客户与交易数据;涉及客户主记录、帐号、交易、业务和信用卡数据;下载地址:https://pan.baidu.com/s/1mjg8OGS

Note:

本文同时用了dplyr包(处理速度很快)和sqldf包(sql语句写起来比较方便,但是速度没有dplyr快)的方法进行数据整理

导入数据

library(tidyverse)
library(sqldf)
accounts <- read_csv("accounts.csv")
clients <- read_csv("clients.csv")
disp <- read_csv("disp.csv")
loans <- read_csv("loans.csv")
trans <- read_csv("trans.csv")

……………………………………………………

使用“loans”数据,根据“status”变量生成违约标识变量(bad_good),
其中当等于status=A时,#取值为bad_good=0,
其中当等于status in (B,D)时,取值为bad_good=1,
等于status=C时,bad_good为缺失值。*/
A代表合同终止,没问题;B代表合同终止,贷款没有支付;
C代表合同处于执行期,至今正常;D代表合同处于执行期,
欠债状态。

loans <- mutate(loans,bad_good=ifelse(status %in% c("B","D"),1,ifelse(status =="A",0,NA_integer_)))

********************************************************

是否违约与借款人的年龄是否有关系?
借款人的年龄信息在clients表中,所以要将两个表连接起来

#dplyr方法
dpl_data <- left_join(loans,disp,by=c("account_id")
          ) %>% left_join(y=clients,by=c("client_id")
                          ) %>% filter(type=="所有者") %>% select(-one_of(c("disp_id","type")))
#SQL方法
sql_data <- sqldf("select  a.*,c.sex,c.birth_date,c.district_id,
             c.client_id  from  loans as a
       left join disp as b on a.account_id=b.account_id
       left join clients as c on b.client_id=c.client_id
       where b.type='所有者'")
setequal(dpl_data,sql_data)
dpl_data <- mutate(dpl_data,age=floor((as.Date(dpl_data$date)-as.Date(dpl_data$birth_date))/365))
aggregate(dpl_data$age,by=list(dpl_data$bad_good),mean)
 Group.1         x
1       0 36.60591 
2       1 37.55263 

从图中可以简单看出年龄在不同好坏状态下的分布相差不大,相关性应该不大,不过还要进行假设检验才能得出结论

ggplot(dpl_data,mapping = aes(x=as.factor(bad_good),y=age))+geom_boxplot()
image.png

*****************************************************************

是否违约与借款人的资产否有关系?
因为账户余额在“trans”表中,所以要将loans表和trans表连接

dpl <- left_join(loans,trans,by="account_id",suffix = c("_x", "_y")) %>% arrange(account_id,date_y) %>% select(
  -one_of(c("trans_id","type","operation","amount_y","k_symbol","bank","account" ))
) %>% rename(date=date_x,amount=amount_x)
data2<-sqldf("select  a.*,b.balance,b.date as t_date  
              from  loans as a
              left join trans as b on a.account_id=b.account_id
              order by a.account_id,t_date")

dpl$date2 <- as.Date(dpl$date)
dpl$date2_y <- as.Date(dpl$date_y)
dpl$balance2 <- as.numeric(substr(gsub(",","",dpl$balance),2,nchar(dpl$balance)))

dpl <- select(dpl,-one_of("date","date_y","balance"))

只需要贷款前一年内的账户余额

dpl2 <- filter(dpl,date2>date2_y & date2<=date2_y+365)

dpl4<-select(dpl2,account_id,status,amount,balance2) %>% 
  group_by(account_id,amount,status) %>% 
  summarize(avg_balance=mean(balance2,na.rm = T),stdev_balance2=sd(balance2,na.rm = T))%>%
  arrange(account_id)
data4<-sqldf("select  a.account_id,a.status,a.amount,
              avg(balance2) as avg_balance,
             stdev(balance2) as stdev_balance2
             from dpl2 as a
             group by a.account_id
             order by a.account_id")
dpl4$bad_good<-ifelse(dpl4$status=="B" 
                       | dpl4$status=="D",1,
                       ifelse(dpl4$status=="A",0,NA))
dpl4$bad_good <- as.factor(dpl4$bad_good)

资产高低和违约的可能性是否有关系?
从图可以看出资产低的客人违约的可能性较高

with(dpl4,{
  aggregate(avg_balance,by=list(bad_good),mean)
})
Group.1        x
1       0 42499.40
2       1 34642.31
ggplot(dpl4,mapping = aes(x=bad_good,y=avg_balance))+geom_boxplot()
image.png

贷款数额高的客人违约的可能性较高

with(dpl4,{
  tapply(amount,bad_good,mean)
})
0         1 
 91641.46 205002.00 
ggplot(dpl4,mapping = aes(x=bad_good,y=amount))+geom_boxplot()
image.png

贷款额度超出资产越高的客人违约的可能性较高

dpl4 <- mutate(dpl4,rate=amount/avg_balance)
with(dpl4,{tapply(rate,bad_good,mean)})
ggplot(dpl4,mapping = aes(x=bad_good,y=rate))+geom_boxplot()
image.png

资产波动越高的客人违约的可能性较高,不过不明显

with(dpl4,{tapply(stdev_balance2,bad_good,mean)})
ggplot(dpl4,mapping = aes(x=bad_good,y=stdev_balance2))+geom_boxplot()
image.png

你可能感兴趣的:(简单的违约特征探索)