作者 Selva Prabhakaran
# 给数据集插入离群值
cars1 <- cars[1:30, ] # 原始数据
cars_outliers <- data.frame(speed = c(19, 19, 20, 20, 20),
dist = c(190, 186, 210, 220, 218)) # 引入离群值
cars2 <- rbind(cars1, cars_outliers) # 包含李全职的数据
# 绘制包含离群值的数据建模结果
par(mfrow = c(1, 2))
plot(cars2$speed, cars2$dist, xlim = c(0, 28), ylim=c(0, 230),
main = "With Outliers", xlab = "speed", ylab = "dist",
pch = "*", col = "red", cex = 2)
abline(lm(dist ~ speed, data = cars2), col = "blue", lwd = 3, lty = 2)
# 绘制原始数据建模加过,留意回归线斜率的变化
plot(cars1$speed, cars1$dist, xlim = c(0, 28), ylim = c(0, 230),
main = "Outliers removed
A much better fit!",
xlab = "speed", ylab = "dist", pch = "*", col = "red", cex = 2)
abline(lm(dist ~ speed, data = cars1), col = "blue", lwd = 3, lty = 2)
url <- "http://rstatistics.net/wp-content/uploads/2015/09/ozone.csv"
# 备用数据源: https://raw.githubusercontent.com/selva86/datasets/master/ozone.csv
inputData <- read.csv(url) # 导入数据
outlier_values <- boxplot.stats(inputData$pressure_height)$out # outlier values.
boxplot(inputData$pressure_height, main="Pressure Height", boxwex=0.1)
mtext(paste("Outliers: ", paste(outlier_values, collapse=", ")), cex=0.6)
2. 双变量检测法
url <- "http://rstatistics.net/wp-content/uploads/2015/09/ozone.csv"
ozone <- read.csv(url)
# Month和Day_of_Week是分类变量
boxplot(ozone_reading ~ Month, data=ozone,
main="Ozone reading across months") # 有明确的模式
boxplot(ozone_reading ~ Day_of_week, data=ozone,
main="Ozone reading for days of week")
箱线图如下:
boxplot(ozone_reading ~ pressure_height, data=ozone,
main="Boxplot for Pressure height (continuos var)
vs Ozone")
boxplot(ozone_reading ~ cut(pressure_height,
pretty(inputData$pressure_height)),
data=ozone, main="Boxplot for Pressure height (categorial) vs Ozone", cex.axis=0.5)
结果如下
mod <- lm(ozone_reading ~ ., data=ozone)
cooksd <- cooks.distance(mod)
影响评估
plot(cooksd, pch="*", cex=2, main="Influential Obs
by Cooks distance") # 绘制Cook距离
abline(h = 4*mean(cooksd, na.rm=T), col="red")
text(x=1:length(cooksd)+1, y=cooksd,
labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red") # 添加标签
influential 4*mean(cooksd, na.rm=T))]) # 有影响力的观测值行标
head(ozone[influential, ]) # 列出这些观测
#> Month Day_of_month Day_of_week ozone_reading pressure_height Wind_speed Humidity
#> 19 1 19 1 4.07 5680 5 73
#> 23 1 23 5 4.90 5700 5 59
#> 58 2 27 5 22.89 5740 3 47
#> 133 5 12 3 33.04 5880 3 80
#> 135 5 14 5 31.15 5850 4 76
#> 149 5 28 5 4.82 5750 3 76
#> Temperature_Sandburg Temperature_ElMonte Inversion_base_height Pressure_gradient
#> 19 52 56.48 393 -68
#> 23 69 51.08 3044 18
#> 58 53 58.82 885 -4
#> 133 80 73.04 436 0
#> 135 78 71.24 1181 50
#> 149 65 51.08 3644 86
#> Inversion_temperature Visibility
#> 19 69.80 10
#> 23 52.88 150
#> 58 67.10 80
#> 133 86.36 40
#> 135 79.88 17
#> 149 59.36 70
让我们看看前6个观测来看看为什么这些观测富有影响力吧。
第58, 133, 135行的ozone_reading值非常大
第23, 135, 149行的Inversion_bzase_height值非常大
第19行有非常低的Pressure_gradient
car::outlierTest(mod)
#> No Studentized residuals with Bonferonni p Largest |rstudent|:
#> rstudent unadjusted p-value Bonferonni p
#> 243 3.045756 0.0026525 0.53845
0utliners包
set.seed(1234)
y=rnorm(100)
outlier(y)
#> [1] 2.548991
outlier(y,opposite=TRUE)
#> [1] -2.345698
dim(y) <- c(20,5) # convert it to a matrix
outlier(y)
#> [1] 2.415835 1.102298 1.647817 2.548991 2.121117
outlier(y,opposite=TRUE)
#> [1] -2.345698 -2.180040 -1.806031 -1.390701 -1.372302
scores()函数有两大功能。一是计算规范化得分,诸如z得分,t得分,chisq得分等。它还可以基于上述的得分值,返回那些得分在相应分布百分位数之外的观测值。
set.seed(1234)
x = rnorm(10)
scores(x) # z得分 => (x-mean)/sd
scores(x, type="chisq") # chisq得分 => (x - mean(x))^2/var(x)
#> [1] 0.68458034 0.44007451 2.17210689 3.88421971 0.66539631 . . .
scores(x, type="t") # t得分
scores(x, type="chisq", prob=0.9) # 是否超过chisq分布的0.9分位数
#> [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
scores(x, type="chisq", prob=0.95) # 0.95分位数
scores(x, type="z", prob=0.95) # 基于z得分判定
scores(x, type="t", prob=0.95) #
离群值处理
x <- ozone$pressure_height
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
caps <- quantile(x, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
x[x < (qnt[1] - H)] (qnt[2] + H)] <- caps[2]
译者注:该方法和数据预处理中的缩尾(winsorize)处理基本一致,和数理统计中的m统计量思想也类似。
—————————————
往期精彩:
最担心的事情终于发生了,APP已经可以一键“脱掉”你的衣服了
华为延期,三星下架,讲讲折叠屏为什么这么难
李彦宏被泼水,是“多数人的暴力”还是“群众的宣泄”