分析San Francisco的犯罪案件的模式。数据来自:
https://data.sfgov.org/Public-Safety/SFPD-Incidents-Previous-Three-Months/tmnf-yvry?
希望回答以下问题:
我们用dplyr来整理数据,ggplot2以及ggmap来进行数据可视化
1
|
require(dplyr)
|
1
|
require(ggplot2)
|
1
|
## Loading
required
package: ggplot2
|
1
|
require(ggmap)
|
1
|
## Loading
required
package: ggmap
|
1
|
require(ggthemes)
|
1
|
## Loading
required
package: ggthemes
|
先设置工作目录,读入数据。然后呢把日期格式化,同时呢我们把时间按照小时来分,不考虑分钟。
1
2
3
4
5
6
7
8
9
10
11
|
setwd(
"d:/project/datascience/teamleada")
#read in the data
crime=read.csv(
"./SFPD_Incidents_-_Previous_Three_Months.csv")
# format the data
crime$Location=
NULL
crime$IncidntNum=
NULL
crime$Date=as.Date(crime$Date,format=
"%m/%d/%Y")
crime$Hour=as.factor(substr(as.character(crime$Time),
1,
2))
# we will use dplyr package to do the work
crime.df=tbl_df(crime)
crime.df
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
## Source: local data frame [30,760 x 11]
##
## Category Descript DayOfWeek
## 1 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO Sunday
## 2 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Sunday
## 3 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Sunday
## 4 DRUG/NARCOTIC POSSESSION OF METH-AMPHETAMINE Sunday
## 5 DRUG/NARCOTIC POSSESSION OF COCAINE Sunday
## 6 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Sunday
## 7 WARRANTS WARRANT ARREST Sunday
## 8 VEHICLE THEFT STOLEN AUTOMOBILE Sunday
## 9 LARCENY/THEFT PETTY THEFT OF PROPERTY Sunday
## 10 ROBBERY ROBBERY OF A CHAIN STORE WITH BODILY FORCE Sunday
## .. ... ... ...
## Variables not shown: Date (date), Time (fctr), PdDistrict (fctr),
## Resolution (fctr), Address (fctr), X (dbl), Y (dbl), Hour (fctr)
|
这里主要有两个,因为在画直方图的时候,我们希望按count大小来排序,而不是按照数据中的factor变量的level来排序,所以写了一个辅助函数来对factor的lvel重新排序
1
2
3
4
|
order.level=
function(level.var,count) {
level.var=factor(level.var,levels =
levels(level.var)[order(count,decreasing=
T)])
}
|
另一个函数就是控制多个图形绘制时的排版,这个直接使用的R cookbook提供了函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
#
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <-
function(
..., plotlist=
NULL, file, cols=
1, layout=
NULL) {
require(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(
...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(
1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==
1) {
print(plots[[
1]])
}
else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i
in
1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind =
TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
|
这里我们先只取VEHICLE THEFT进行分析,之后按照案件发生的区域分组统计
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
crime.df.vehicle=filter(crime.df,Category==
"VEHICLE THEFT")
crime.vehicle.district=crime.df.vehicle %>%
group_by(PdDistrict)%>%
summarise(count=n())
# PdDistrict变量按照count进行排序
crime.vehicle.district$PdDistrict=order.level(crime.vehicle.district$PdDistrict,
crime.vehicle.district$count)
p1=qplot(crime.vehicle.district$PdDistrict,data=crime.vehicle.district,
weight=crime.vehicle.district$count,geom=
"histogram",
xlab=
"District",ylab=
"Vehicle Theft Count")
# 经纬度
crime.df$lon=crime.df$X
crime.df$lat=crime.df$Y
# 获得San Francisco地图
SFMap <- qmap(
'San Francisco', zoom =
13, color =
'bw', legend =
'topleft')
|
1
2
3
4
|
#
# Map from URL : http:
//maps.googleapis.com/maps/api/staticmap?center=San+Francisco&zoom=13&size=%20640x640&scale=%202&maptype=terrain&sensor=false
#
# Google Maps API Terms of Service : http:
//developers.google.com/maps/terms
#
# Information from URL : http:
//maps.googleapis.com/maps/api/geocode/json?address=San+Francisco&sensor=false
#
# Google Maps API Terms of Service : http:
//developers.google.com/maps/terms
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# 绘制密度图
p2=SFMap+
geom_density2d(data=crime.df.vehicle,aes(group=
1))+
stat_density2d(data=crime.df.vehicle,aes(group=
1,fill=..level..,
alpha=..level..),
size=
0.01,bins=
16,geom=
'polygon')+
scale_fill_gradient(low=
"green",high=
"red")+
scale_alpha(range=c(
0.00,
0.25),guide=
FALSE)+
theme(legend.position=
"none",axis.title=element_blank(),
text=element_text(size=
12))+
ggtitle(
"Vehicle Theft")
multiplot(p1, p2, cols=
1)
|
1
|
## Loading
required
package: grid
|
1
|
## Error:
object
'lon'
not found
|
显然Ingleside,Mission,Bayview去案件更多,密度图也反映了相同信息
先按照每个区计算案件数目
1
2
3
4
5
6
|
crime.by.district=crime.df %>% group_by(PdDistrict)%>%summarise(count=n())
crime.by.district$PdDistrict=order.level(crime.by.district$PdDistrict,
crime.by.district$count)
p1=qplot(crime.by.district$PdDistrict,data=crime.by.district,
weight=crime.by.district$count,geom=
"histogram",
xlab=
"District",ylab=
"# of Crimes",main=
"# of Crimes by District")
|
根据每天来统计
1
2
3
4
5
6
|
crime.by.day=crime.df %>% group_by(DayOfWeek)%>%summarise(count=n())
crime.by.day$DayOfWeek=order.level(crime.by.day$DayOfWeek,
crime.by.day$count)
p2=qplot(crime.by.day$DayOfWeek,data=crime.by.day,
weight=crime.by.day$count,geom=
"histogram",
xlab=
"Day of Week",ylab=
"# of Crimes",main=
"# of Crimes by Day")
|
按照犯罪时间统计
1
2
3
4
5
6
7
8
|
crime.by.time=crime.df %>% group_by(Hour)%>%summarise(count=n())
crime.by.time$Hour=order.level(crime.by.time$Hour,
crime.by.time$count)
p3=qplot(crime.by.time$Hour,data=crime.by.time,
weight=crime.by.time$count,geom=
"histogram",
xlab=
"Time",ylab=
"# of Crimes",main=
"# of Crimes by Time")
multiplot(p1, p2,p3 ,cols=
2)
|
Richmond和Park区域是最安全的地方,案件什么少于了200.而案件发生的日期显然没有太明显规律,周一到周日基本都在4000左右。相反案件发生时间有高峰期和低点,18:00pm~19:00pm(2087),17:00~18:00pm(2022),19:00pm~20:00pm(1838)是典型的高峰,而04:00am~06:00am(<400)则是低点.
1
2
3
4
5
6
7
8
9
10
11
12
13
|
theft.filter=grep(
"THEFT",as.character(crime.df$Category))
crime.by.theft=crime.df[theft.filter,]
# get the crime by district and category
crime.by.group=crime.by.theft %>% group_by(PdDistrict,Category)%>%summarise(count=n())%>%arrange(Category,desc(count))
# sum the count by crime category
crime.by.number=crime.by.theft%>%group_by(Category)%>%summarise(count=n())%>%arrange(desc(count))
ggplot(crime.by.group,
aes(x=Category,fill=PdDistrict,weight=count))+
geom_histogram(position=
"fill",col=gray(
0.2))+
ylab(label=
"Ratio by District")+
theme_tufte()
|
larcen/theft 在Southern和 Central 区域更普遍,而vehicle theft则在Ingleside,Misson 和 Tenderloin区域更普遍。