1、读入数据

path <- "C:\\Users\\Admin\\Documents\\R\\ks\\"
file1 <- "CommViolPredUnnormalizedData.csv"
file2 <- "newdata.csv"

df1 <- read.csv(paste0(path, file1), header = T, stringsAsFactors = F)
pre <- read.csv(paste0(path, file2), header = T, stringsAsFactors = F)

2、导入tidyverse包

library(pacman)
p_load(tidyverse)

3、查看数据名称，并任选两个变量做出散点图

df1 <- tbl_df(df1)

# 查看数据名称
name <- names(df1)
name

##   [1] "communityname"         "state"                 "countyCode"           
##   [4] "communityCode"         "fold"                  "population"           
##   [7] "householdsize"         "racepctblack"          "racePctWhite"         
##  [10] "racePctAsian"          "racePctHisp"           "agePct12t21"          
##  [13] "agePct12t29"           "agePct16t24"           "agePct65up"           
##  [16] "numbUrban"             "pctUrban"              "medIncome"            
##  [19] "pctWWage"              "pctWFarmSelf"          "pctWInvInc"           
##  [22] "pctWSocSec"            "pctWPubAsst"           "pctWRetire"           
##  [25] "medFamInc"             "perCapInc"             "whitePerCap"          
##  [28] "blackPerCap"           "indianPerCap"          "AsianPerCap"          
##  [31] "OtherPerCap"           "HispPerCap"            "NumUnderPov"          
##  [34] "PctPopUnderPov"        "PctLess9thGrade"       "PctNotHSGrad"         
##  [37] "PctBSorMore"           "PctUnemployed"         "PctEmploy"            
##  [40] "PctEmplManu"           "PctEmplProfServ"       "PctOccupManu"         
##  [43] "PctOccupMgmtProf"      "MalePctDivorce"        "MalePctNevMarr"       
##  [46] "FemalePctDiv"          "TotalPctDiv"           "PersPerFam"           
##  [49] "PctFam2Par"            "PctKids2Par"           "PctYoungKids2Par"     
##  [52] "PctTeen2Par"           "PctWorkMomYoungKids"   "PctWorkMom"           
##  [55] "NumKidsBornNeverMar"   "PctKidsBornNeverMar"   "NumImmig"             
##  [58] "PctImmigRecent"        "PctImmigRec5"          "PctImmigRec8"         
##  [61] "PctImmigRec10"         "PctRecentImmig"        "PctRecImmig5"         
##  [64] "PctRecImmig8"          "PctRecImmig10"         "PctSpeakEnglOnly"     
##  [67] "PctNotSpeakEnglWell"   "PctLargHouseFam"       "PctLargHouseOccup"    
##  [70] "PersPerOccupHous"      "PersPerOwnOccHous"     "PersPerRentOccHous"   
##  [73] "PctPersOwnOccup"       "PctPersDenseHous"      "PctHousLess3BR"       
##  [76] "MedNumBR"              "HousVacant"            "PctHousOccup"         
##  [79] "PctHousOwnOcc"         "PctVacantBoarded"      "PctVacMore6Mos"       
##  [82] "MedYrHousBuilt"        "PctHousNoPhone"        "PctWOFullPlumb"       
##  [85] "OwnOccLowQuart"        "OwnOccMedVal"          "OwnOccHiQuart"        
##  [88] "OwnOccQrange"          "RentLowQ"              "RentMedian"           
##  [91] "RentHighQ"             "RentQrange"            "MedRent"              
##  [94] "MedRentPctHousInc"     "MedOwnCostPctInc"      "MedOwnCostPctIncNoMtg"
##  [97] "NumInShelters"         "NumStreet"             "PctForeignBorn"       
## [100] "PctBornSameState"      "PctSameHouse85"        "PctSameCity85"        
## [103] "PctSameState85"        "LemasSwornFT"          "LemasSwFTPerPop"      
## [106] "LemasSwFTFieldOps"     "LemasSwFTFieldPerPop"  "LemasTotalReq"        
## [109] "LemasTotReqPerPop"     "PolicReqPerOffic"      "PolicPerPop"          
## [112] "RacialMatchCommPol"    "PctPolicWhite"         "PctPolicBlack"        
## [115] "PctPolicHisp"          "PctPolicAsian"         "PctPolicMinor"        
## [118] "OfficAssgnDrugUnits"   "NumKindsDrugsSeiz"     "PolicAveOTWorked"     
## [121] "LandArea"              "PopDens"               "PctUsePubTrans"       
## [124] "PolicCars"             "PolicOperBudg"         "LemasPctPolicOnPatr"  
## [127] "LemasGangUnitDeploy"   "LemasPctOfficDrugUn"   "PolicBudgPerPop"      
## [130] "murders"               "murdPerPop"            "rapes"                
## [133] "rapesPerPop"           "robberies"             "robbbPerPop"          
## [136] "assaults"              "assaultPerPop"         "burglaries"           
## [139] "burglPerPop"           "larcenies"             "larcPerPop"           
## [142] "autoTheft"             "autoTheftPerPop"       "arsons"               
## [145] "arsonsPerPop"          "ViolentCrimesPerPop"   "nonViolPerPop"

# 查看数据类型
str(df1)

## Classes 'tbl_df', 'tbl' and 'data.frame':    2214 obs. of  147 variables:
##  $ communityname        : chr  "BerkeleyHeightstownship" "Marpletownship" "Tigardcity" "Gloversvillecity" ...
##  $ state                : chr  "NJ" "PA" "OR" "NY" ...
##  $ countyCode           : chr  "39" "45" "?" "35" ...
##  $ communityCode        : chr  "5320" "47616" "?" "29443" ...
##  $ fold                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population           : int  11980 23123 29344 16656 11245 140494 28700 59459 74111 103590 ...
##  $ householdsize        : num  3.1 2.82 2.43 2.4 2.76 2.45 2.6 2.45 2.46 2.62 ...
##  $ racepctblack         : num  1.37 0.8 0.74 1.7 0.53 ...
##  $ racePctWhite         : num  91.8 95.6 94.3 97.3 89.2 ...
##  $ racePctAsian         : num  6.5 3.44 3.43 0.5 1.17 0.9 1.47 0.4 1.25 0.92 ...
##  $ racePctHisp          : num  1.88 0.85 2.35 0.7 0.52 ...
##  $ agePct12t21          : num  12.5 11 11.4 12.6 24.5 ...
##  $ agePct12t29          : num  21.4 21.3 25.9 25.2 40.5 ...
##  $ agePct16t24          : num  10.9 10.5 11 12.2 28.7 ...
##  $ agePct65up           : num  11.3 17.2 10.3 17.6 12.6 ...
##  $ numbUrban            : int  11980 23123 29344 0 0 140494 28700 59449 74115 103590 ...
##  $ pctUrban             : num  100 100 100 0 0 100 100 100 100 100 ...
##  $ medIncome            : int  75122 47917 35669 20580 17390 21577 42805 23221 25326 17852 ...
##  $ pctWWage             : num  89.2 79 82 68.2 69.3 ...
##  $ pctWFarmSelf         : num  1.55 1.11 1.15 0.24 0.55 1 0.39 0.67 2.93 0.86 ...
##  $ pctWInvInc           : num  70.2 64.1 55.7 39 42.8 ...
##  $ pctWSocSec           : num  23.6 35.5 22.2 39.5 32.2 ...
##  $ pctWPubAsst          : num  1.03 2.75 2.94 11.71 11.21 ...
##  $ pctWRetire           : num  18.4 22.9 14.6 18.3 14.4 ...
##  $ medFamInc            : int  79584 55323 42112 26501 24018 27705 50394 28901 34269 24058 ...
##  $ perCapInc            : int  29711 20148 16946 10810 8483 11878 18193 12161 13554 10195 ...
##  $ whitePerCap          : int  30233 20191 17103 10909 9009 12029 18276 12599 13727 12126 ...
##  $ blackPerCap          : int  13600 18137 16644 9984 887 7382 17342 9820 8852 5715 ...
##  $ indianPerCap         : int  5725 0 21606 4941 4425 10264 21482 6634 5344 11313 ...
##  $ AsianPerCap          : int  27101 20074 15528 3541 3352 10753 12639 8802 8011 5770 ...
##  $ OtherPerCap          : chr  "5115" "5250" "5954" "2451" ...
##  $ HispPerCap           : int  22838 12222 8405 4391 1328 8104 22594 6187 5174 6984 ...
##  $ NumUnderPov          : int  227 885 1389 2831 2855 23223 1126 10320 9603 27767 ...
##  $ PctPopUnderPov       : num  1.96 3.98 4.75 17.23 29.99 ...
##  $ PctLess9thGrade      : num  5.81 5.61 2.8 11.05 12.15 ...
##  $ PctNotHSGrad         : num  9.9 13.72 9.09 33.68 23.06 ...
##  $ PctBSorMore          : num  48.2 29.9 30.1 10.8 25.3 ...
##  $ PctUnemployed        : num  2.7 2.43 4.01 9.86 9.08 5.72 4.85 8.19 4.18 8.39 ...
##  $ PctEmploy            : num  64.5 62 69.8 54.7 52.4 ...
##  $ PctEmplManu          : num  14.65 12.26 15.95 31.22 6.89 ...
##  $ PctEmplProfServ      : num  28.8 29.3 21.5 27.4 36.5 ...
##  $ PctOccupManu         : num  5.49 6.39 8.79 26.76 10.94 ...
##  $ PctOccupMgmtProf     : num  50.7 37.6 32.5 22.7 27.8 ...
##  $ MalePctDivorce       : num  3.67 4.23 10.1 10.98 7.51 ...
##  $ MalePctNevMarr       : num  26.4 28 25.8 28.1 50.7 ...
##  $ FemalePctDiv         : num  5.22 6.45 14.76 14.47 11.64 ...
##  $ TotalPctDiv          : num  4.47 5.42 12.55 12.91 9.73 ...
##  $ PersPerFam           : num  3.22 3.11 2.95 2.98 2.98 2.89 3.14 2.95 3 3.11 ...
##  $ PctFam2Par           : num  91.4 86.9 78.5 64 58.6 ...
##  $ PctKids2Par          : num  90.2 85.3 78.8 62.4 55.2 ...
##  $ PctYoungKids2Par     : num  95.8 96.8 92.4 65.4 66.5 ...
##  $ PctTeen2Par          : num  95.8 86.5 75.7 67.4 79.2 ...
##  $ PctWorkMomYoungKids  : num  44.6 51.1 66.1 59.6 61.2 ...
##  $ PctWorkMom           : num  58.9 62.4 74.2 70.3 68.9 ...
##  $ NumKidsBornNeverMar  : int  31 43 164 561 402 1511 263 2368 751 3537 ...
##  $ PctKidsBornNeverMar  : num  0.36 0.24 0.88 3.84 4.7 1.58 1.18 4.66 1.64 4.71 ...
##  $ NumImmig             : int  1277 1920 1468 339 196 2091 2637 517 1474 4793 ...
##  $ PctImmigRecent       : num  8.69 5.21 16.42 13.86 46.94 ...
##  $ PctImmigRec5         : num  13 8.65 23.98 13.86 56.12 ...
##  $ PctImmigRec8         : num  21 13.3 32.1 15.3 67.9 ...
##  $ PctImmigRec10        : num  30.9 22.5 35.6 15.3 69.9 ...
##  $ PctRecentImmig       : num  0.93 0.43 0.82 0.28 0.82 0.32 1.05 0.11 0.47 0.72 ...
##  $ PctRecImmig5         : num  1.39 0.72 1.2 0.28 0.98 0.45 1.49 0.2 0.67 1.07 ...
##  $ PctRecImmig8         : num  2.24 1.11 1.61 0.31 1.18 0.57 2.2 0.25 0.93 1.63 ...
##  $ PctRecImmig10        : num  3.3 1.87 1.78 0.31 1.22 0.68 2.55 0.29 1.07 2.31 ...
##  $ PctSpeakEnglOnly     : num  85.7 87.8 93.1 95 94.6 ...
##  $ PctNotSpeakEnglWell  : num  1.37 1.81 1.14 0.56 0.39 0.6 0.6 0.28 0.43 2.51 ...
##  $ PctLargHouseFam      : num  4.81 4.25 2.97 3.93 5.23 3.08 5.08 3.85 2.59 6.7 ...
##  $ PctLargHouseOccup    : num  4.17 3.34 2.05 2.56 3.11 1.92 3.46 2.55 1.54 4.1 ...
##  $ PersPerOccupHous     : num  2.99 2.7 2.42 2.37 2.35 2.28 2.55 2.36 2.32 2.45 ...
##  $ PersPerOwnOccHous    : num  3 2.83 2.69 2.51 2.55 2.37 2.89 2.42 2.77 2.47 ...
##  $ PersPerRentOccHous   : num  2.84 1.96 2.06 2.2 2.12 2.16 2.09 2.27 1.91 2.44 ...
##  $ PctPersOwnOccup      : num  91.5 89 64.2 58.2 58.1 ...
##  $ PctPersDenseHous     : num  0.39 1.01 2.03 1.21 2.94 2.11 1.47 1.9 1.67 6.14 ...
##  $ PctHousLess3BR       : num  11.1 23.6 47.5 45.7 55.6 ...
##  $ MedNumBR             : int  3 3 3 3 2 2 3 2 2 2 ...
##  $ HousVacant           : int  64 240 544 669 333 5119 566 2051 1562 5606 ...
##  $ PctHousOccup         : num  98.4 97.2 95.7 91.2 92.5 ...
##  $ PctHousOwnOcc        : num  91 84.9 57.8 54.9 53.6 ...
##  $ PctVacantBoarded     : num  3.12 0 0.92 2.54 3.9 2.09 1.41 6.39 0.45 5.64 ...
##  $ PctVacMore6Mos       : num  37.5 18.33 7.54 57.85 42.64 ...
##  $ MedYrHousBuilt       : int  1959 1958 1976 1939 1958 1966 1956 1954 1971 1960 ...
##  $ PctHousNoPhone       : num  0 0.31 1.55 7 7.45 ...
##  $ PctWOFullPlumb       : num  0.28 0.14 0.12 0.87 0.82 0.31 0.28 0.49 0.19 0.33 ...
##  $ OwnOccLowQuart       : int  215900 136300 74700 36400 30600 37700 155100 26300 54500 28600 ...
##  $ OwnOccMedVal         : int  262600 164200 90400 49600 43200 53900 179000 37000 70300 43100 ...
##  $ OwnOccHiQuart        : int  326900 199900 112000 66500 59500 73100 215500 52400 93700 67400 ...
##  $ OwnOccQrange         : int  111000 63600 37300 30100 28900 35400 60400 26100 39200 38800 ...
##  $ RentLowQ             : int  685 467 370 195 202 215 463 186 241 192 ...
##  $ RentMedian           : int  1001 560 428 250 283 280 669 253 321 281 ...
##  $ RentHighQ            : int  1001 672 520 309 362 349 824 325 387 369 ...
##  $ RentQrange           : int  316 205 150 114 160 134 361 139 146 177 ...
##  $ MedRent              : int  1001 627 484 333 332 340 736 338 355 353 ...
##  $ MedRentPctHousInc    : num  23.8 27.6 24.1 28.7 32.2 26.4 24.4 26.3 25.2 29.6 ...
##  $ MedOwnCostPctInc     : num  21.1 20.7 21.7 20.6 23.2 17.3 20.8 15.1 20.7 19.4 ...
##  $ MedOwnCostPctIncNoMtg: num  14 12.5 11.6 14.5 12.9 11.7 12.5 12.2 12.8 13 ...
##  $ NumInShelters        : int  11 0 16 0 2 327 0 21 125 43 ...
##  $ NumStreet            : int  0 0 0 0 0 4 0 0 15 4 ...
##  $ PctForeignBorn       : num  10.66 8.3 5 2.04 1.74 ...
##   [list output truncated]

# 选取NumUnderPov和population画散点图
ggplot(df1,aes(log10(NumUnderPov),log10(population))) +
  geom_point() +
  geom_smooth(method = "lm",se=T) +
  theme_bw() +
  labs(title = "NumUnderPov与population散点图") +
  theme(plot.title = element_text(hjust = 0.5))

散点图

从整体上来看，NumUnderPov与population之间成正相关关系。

4、找出countyCode和communityCode的缺失值，并删除相应行

nrow(df1)

## [1] 2214

df2 <- df1 %>% filter(countyCode != "?" & communityCode != "?")
nrow(df2)

## [1] 991

5、找出population中的极端值，并滤掉相应的行

# 画出population的箱线图
ggplot(df2,aes(y=population)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8) +
  coord_flip() +
  labs(title = "population箱线图") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

箱线图

通过箱线图可以看到明显的极端值，因为数据的极差较大，箱线图无法在数据中准确定位极端值。

# 查看数据的最小值和最大值
rng <- range(df2$population)
rng

## [1]   10034 2783726

# 输出异常值列表
out <- boxplot.stats(df2$population)$out

# 过滤掉极端值所在的行
df3 <- filter(df2, population < min(out))

6、以medIncome的自然对数作为因变量，PctUnemployed为自变量回归

fit <- lm(log(medIncome) ~ PctUnemployed, df3)
summary(fit)

## 
## Call:
## lm(formula = log(medIncome) ~ PctUnemployed, data = df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8150 -0.1843  0.0223  0.1788  0.9100 
## 
## Coefficients:
##               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   10.97136    0.01995     550 <0.0000000000000002 ***
## PctUnemployed -0.08616    0.00331     -26 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.26 on 886 degrees of freedom
## Multiple R-squared:  0.433,  Adjusted R-squared:  0.432 
## F-statistic:  676 on 1 and 886 DF,  p-value: <0.0000000000000002

回归方程为：log(medIncome) = -0.08616 * PctUnemployed + 10.97136
自变量系数P值<2e-16<0.05，所以是在0.05水平上显著。

7、创建新变量highRobPerPop，并选择合适的变量回归

7.1 缺失值处理

# 删除'communityname'，'state'，'countyCode'，'communityCode'变量
df4 <- df3[, -(1:4)]

# 将其他字符型变量转换为数值型
df4 <- df4 %>% purrr::map_if(is.character, as.numeric) %>% tbl_df()

# 计算总缺失值数量
sum(is.na(df4))

## [1] 18861

字符型转换为数值型后，存在18861个缺失值，需要先对缺失值进行处理。
检查缺失值的分布：

p_load(mice)
# 返回数据集中每个变量中缺失值的统计表格
mc <- md.pattern(df4, plot = F)
dim(mc)

## [1]  10 144

# 只查看最终统计行，并转换为数据框
mc <- mc[10, ]
sum.na <- tibble(var = names(mc), sum.na = mc)

# 按缺失值数量排序查看相关变量
n <- filter(sum.na, sum.na > 0) %>% arrange(-sum.na)
print(n)

## # A tibble: 33 x 2
##    var                    sum.na
##                       
##  1 ""                      18861
##  2 "LemasSwornFT"            832
##  3 "LemasSwFTPerPop"         832
##  4 "LemasSwFTFieldOps"       832
##  5 "LemasSwFTFieldPerPop"    832
##  6 "LemasTotalReq"           832
##  7 "LemasTotReqPerPop"       832
##  8 "PolicReqPerOffic"        832
##  9 "PolicPerPop"             832
## 10 "RacialMatchCommPol"      832
## # ... with 23 more rows

# 缺失值可视化
ggplot(n[-1,],aes(sum.na,var)) +
  geom_bar(stat = "identity") +
  labs(x="缺失值数量",y="") +
  theme_bw()

缺失值数量

删除包含100个以上缺失值的变量。

nms <- filter(sum.na, sum.na > 100)
df4 <- df4[is.na(match(names(df4), nms$var))]

创建新变量highRobPerPop，robbbPerPop值高于平均值的为1，低于平均值的为0，作为回归的响应变量。

med <- mean(df4$robbbPerPop)
df5 <- df4 %>% mutate(highRobPerPop = as.factor(if_else(robbbPerPop > med, 1, 0))) %>% 
    select(-robbbPerPop)

# 查看相应变量的数据分布
table(df5$highRobPerPop)

## 
##   0   1 
## 656 232

使用均值填充剩下的缺失值：

imputed.data <- mice(df5,
                     # 估算数据集的个数
                     m = 1,
                     # 估算缺失值的迭代次数
                     maxit = 10,
                     # 使用预测平均值匹配法
                     method = "pmm",
                     # 随机数种子
                     seed = 123)

# 使用估算后的数据集
df5 <- complete(imputed.data, 1)
sum(is.na(df5))

## [1] 0

7.2 特征选择

p_load(Boruta)
set.seed(123)
fea.sec <- Boruta(highRobPerPop ~ .,data = df5,
                  # 默认100，最大随机森林运行次数。如果保留待定属性，可以考虑增加此参数
                  maxRuns = 100,
                  # 0代表没有追踪。1表示报告属性决策。 2表示1加上额外报告每个迭代。默认值为0
                  doTrace=2,
                  # TRUE (默认值) 则存储重要性运行的完整历史记
                  holdHistory = T)
fea.sec

## Boruta performed 99 iterations in 4.2 mins.
##  56 attributes confirmed important: arsons, arsonsPerPop,
## assaultPerPop, assaults, autoTheft and 51 more;
##  33 attributes confirmed unimportant: agePct12t21, agePct12t29,
## agePct16t24, agePct65up, AsianPerCap and 28 more;
##  28 tentative attributes left: HousVacant, medFamInc, medIncome,
## MedOwnCostPctInc, MedOwnCostPctIncNoMtg and 23 more;

在我的笔记本上运行了4.2 mins，99次迭代，56个变量确认了重要性，33个确认了不重要，还有28个变量的重要性待定。
再次确认待定的变量：

final.boruta <- TentativeRoughFix(fea.sec)
final.boruta

## Boruta performed 99 iterations in 4.2 mins.
## Tentatives roughfixed over the last 99 iterations.
##  63 attributes confirmed important: arsons, arsonsPerPop,
## assaultPerPop, assaults, autoTheft and 58 more;
##  54 attributes confirmed unimportant: agePct12t21, agePct12t29,
## agePct16t24, agePct65up, AsianPerCap and 49 more;

最终有63个变量确认了重要性，而另外54个确认不重要。
选择确认的变量：

fea.names <- getSelectedAttributes(final.boruta, withTentative = F)
fea.names

##  [1] "population"          "racepctblack"        "racePctWhite"       
##  [4] "racePctHisp"         "numbUrban"           "pctUrban"           
##  [7] "pctWWage"            "pctWFarmSelf"        "pctWInvInc"         
## [10] "pctWPubAsst"         "blackPerCap"         "NumUnderPov"        
## [13] "PctPopUnderPov"      "PctLess9thGrade"     "PctNotHSGrad"       
## [16] "PctBSorMore"         "PctOccupMgmtProf"    "MalePctDivorce"     
## [19] "MalePctNevMarr"      "FemalePctDiv"        "TotalPctDiv"        
## [22] "PctFam2Par"          "PctKids2Par"         "PctYoungKids2Par"   
## [25] "PctTeen2Par"         "NumKidsBornNeverMar" "PctKidsBornNeverMar"
## [28] "NumImmig"            "PctRecImmig5"        "PctRecImmig8"       
## [31] "PctRecImmig10"       "PctLargHouseFam"     "PctLargHouseOccup"  
## [34] "PctPersOwnOccup"     "PctPersDenseHous"    "PctHousLess3BR"     
## [37] "PctHousOwnOcc"       "PctVacantBoarded"    "PctHousNoPhone"     
## [40] "OwnOccMedVal"        "OwnOccHiQuart"       "RentMedian"         
## [43] "MedRent"             "MedOwnCostPctInc"    "PctForeignBorn"     
## [46] "LandArea"            "PopDens"             "PctUsePubTrans"     
## [49] "LemasPctOfficDrugUn" "murders"             "murdPerPop"         
## [52] "robberies"           "assaults"            "assaultPerPop"      
## [55] "burglaries"          "burglPerPop"         "larcenies"          
## [58] "larcPerPop"          "autoTheft"           "autoTheftPerPop"    
## [61] "arsons"              "arsonsPerPop"        "nonViolPerPop"

# 创建最终结果的数据框
df6 <- df5[, fea.names]
dim(df6)

## [1] 888  63

7.3 数据标准化，并加入响应变量

df7 <- scale(df6) %>% as_tibble() %>% mutate(highRobPerPop = df5$highRobPerPop)

7.4 将数据集拆分为训练集和测试集

set.seed(0)

ind <- sample(2, nrow(df7), replace = T, prob = c(0.7, 0.3))
train <- df7[ind == 1, ]
test <- df7[ind == 2, ]

dim(train)

## [1] 613  64

dim(test)

## [1] 275  64

7.5 构建随机森林模型

p_load(mlr)
task <- makeClassifTask(id = "df7", data = train, target = "highRobPerPop")

# 创建一个重抽样对象来为随机森林模型调整树的数量
rdesc <- makeResampleDesc("Subsample", iters = 3)

# 建立一个树的网格来调整树的数量
param <- makeParamSet(makeDiscreteParam("ntree", values = c(750, 770, 790, 810, 
    1750, 1770, 1790, 1810, 1830)))

# 创建控制对象，建立数值网格
ctrl <- makeTuneControlGrid()

# 调出最优树数量和相应的样本外误差
tuning <- tuneParams("classif.randomForest", task = task, resampling = rdesc, par.set = param, 
    control = ctrl)
tuning$x

## $ntree
## [1] 790

tuning$y

## mmce.test.mean 
##          0.093

得到最优树的数量为790。使用最优树数量训练模型。

rf <- setHyperPars(makeLearner("classif.randomForest", predict.type = "prob", par.vals = tuning$x))

# 训练模型
fit.rf <- train(rf, task)

# 查看训练集上的混淆矩阵
fit.rf$learner.model

## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 790) 
##                Type of random forest: classification
##                      Number of trees: 790
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 7.7%
## Confusion matrix:
##     0   1 class.error
## 0 428  17       0.038
## 1  30 138       0.179

可以看到模型在训练集上的袋外估计错误率为7.7%。

7.6 模型在测试集上的准确率

# 看看在测试集上的表现
pred.rf <- predict(fit.rf, newdata = test)
calculateConfusionMatrix(pred.rf)

##         predicted
## true       0  1 -err.-
##   0      202  9      9
##   1       12 52     12
##   -err.-  12  9     21

performance(pred.rf, measures = list(mmce, acc, auc))

##  mmce   acc   auc 
## 0.076 0.924 0.970

模型在测试集上的表现与在训练集上差不多。

7.7 预测结果

pre <- pre[, fea.names]
pred.pre <- predict(fit.rf, type = "class", newdata = pre)
pred.pre

## Prediction: 1 observations
## predict.type: prob
## threshold: 0=0.50,1=0.50
## time: 0.00
##   prob.0 prob.1 response
## 1   0.27   0.73        1

输出结果为1（response=1）。即该条记录highRobPerPop=1，该地区robbbPerPop高于平均值。

8、抽取样本

df.coef <- tibble()

df8 <- select(df3, c("medIncome", "PctUnemployed"))

# 确定有无缺失值
sum(is.na(df8))

## [1] 0

for (i in 1:3000) {
    smp <- df8[sample(nrow(df8), 20, replace = T), ]
    fit <- lm(log(medIncome) ~ PctUnemployed, smp)
    df.c <- coef(fit)
    df.coef <- bind_rows(df.coef, df.c)
}

names(df.coef) <- c("incept", "coef")
head(df.coef)

## # A tibble: 6 x 2
##   incept    coef
##       
## 1   10.9 -0.0752
## 2   10.8 -0.0667
## 3   10.9 -0.0751
## 4   11.1 -0.0970
## 5   10.8 -0.0587
## 6   11.0 -0.0884

ggplot(df.coef, aes(coef), binwidth = 1) + geom_histogram(bins = 100, fill = "dodgerblue") + 
    geom_vline(xintercept = mean(coef, na.rm = T), col = "red", size = 1, linetype = 2) + 
    theme_bw() + theme(plot.title = element_text(hjust = 0.5))

系数项分布直方图

系数值大体呈正态分布，理论基础：中心极限定理。

48-R语言考试题