仍然使用facebook用户数据
ggplot(aes(x=gender,y=age),
data=subset(pf,!is.na(gender)))+ geom_boxplot() +
stat_summary(fun.y=mean,geom= 'point',shape= 4)
ggplot(aes(x = age,y= friend_count),
data = subset(pf,!is.na(gender)))+
geom_line(aes(color= gender),stat= 'summary',fun.y=median)
suppressMessages(library(dplyr))
suppressMessages(library(dplyr))
pf.fc_by_age_gender <- pf %>%
filter(!is.na(gender)) %>%
group_by(age,gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n=n()) %>%
ungroup() %>%
arrange(age)
head(pf.fc_by_age_gender)
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)
ggplot(aes(x = age, y = median_friend_count),
data = pf.fc_by_age_gender) +
geom_line(aes(color = gender))
install.packages("tidyr")
library(tidyr)
spread(subset(pf.fc_by_age_gender,
select = c('gender', 'age', 'median_friend_count')),
gender, median_friend_count)
install.packages("reshape2")
library(reshape2)
pf.fc_by_age_gender.wide <- subset(pf.fc_by_age_gender[c('age', 'gender', 'median_friend_count')], !is.na(gender)) %>%
spread(gender, median_friend_count) %>%
mutate(ratio = male / female)
head(pf.fc_by_age_gender.wide)
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender,
age ~ gender,
value.var = 'median_friend_count')
head(pf.fc_by_age_gender.wide)
> head(pf.fc_by_age_gender.wide)
age female male
1 13 148.0 55.0
2 14 224.0 92.5
3 15 276.0 106.5
4 16 258.5 136.0
5 17 245.5 125.0
ggplot(aes(x=age,y= female / male),
data =pf.fc_by_age_gender.wide) +
geom_line() +
geom_hline(yintercept = 1,alpha = 0.3,linetype=2)
pf$year_joined <- floor(2014 - pf$tenure/365)
summary(pf$year_joined)
table(pf$year_joined)
summary(pf$year_joined)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
2005 2012 2012 2012 2013 2014 2
> table(pf$year_joined)
2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
9 15 581 1507 4557 5448 9860 33366 43588 70
pf$year_joined.bucket <- cut(pf$year_joined,
c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket,useNA = 'ifany')
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = median)
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean) +
geom_line(stat='summary',fun.y= mean ,linetype=2)
将中位数化为虚线
总结好友率
with(subset(pf,tenure >=1),summary(friend_count / tenure))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
ggplot(aes(x=tenure,y=friendships_initiated /tenure),
data=subset(pf,tenure >=1)) +
geom_line(aes(color=year_joined.bucket))
ggplot(aes(x=tenure,y=friendships_initiated /tenure),
data=subset(pf,tenure >=1)) +
geom_line(aes(color=year_joined.bucket),
stat='summary',
fun.y=mean)
x = tenure
替换为 x = 7 * round(tenure / 7)
ggplot(aes(x= 7 * round(tenure / 7),y=friendships_initiated /tenure),
data=subset(pf,tenure >=1)) +
geom_line(aes(color=year_joined.bucket),
stat='summary',
fun.y=mean)
ggplot(aes(x= 30 * round(tenure / 30),y=friendships_initiated /tenure),
data=subset(pf,tenure >=1)) +
geom_line(aes(color=year_joined.bucket),
stat='summary',
fun.y=mean)
下面分析酸奶数据集
https://s3.amazonaws.com/udacity-hosted-downloads/ud651/yogurt.csv
首先选择id转换为因子变量
yo <- read.csv('yogurt.csv')
str(yo)
$ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
yo$id <- factor(yo$id)
str(yo)
$ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
qplot(data= yo ,x = price, fill = I ('#F79420'))
qplot(data= yo ,x = price, fill = I ('#F79420'),binwidth=10)
summary(yo)
unique(yo$price)
length(unique(yo$price))
summary(yo)
obs id time strawberry
Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
(Other):2061
blueberry pina.colada plain mixed.berry
Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
price
Min. :20.00
1st Qu.:50.00
Median :65.04
Mean :59.25
3rd Qu.:68.96
Max. :68.96
> length(unique(yo$price))
[1] 20
> unique(yo$price)
[1] 58.96 65.04 48.96 68.96 39.04 24.96 50.00 45.04 33.04 44.00 33.36 55.04
[13] 62.00 20.00 49.60 49.52 33.28 63.04 33.20 33.52
创建购买数的新变量,这里第一个方法使用了传递函数
yo <- transform(yo,all.purchases = strawberry+
blueberry+pina.colada+plain+mixed.berry)
或者
yo$all.purchases <- yo$strawberry+
yo$blueberry+yo$pina.colada+yo$plain+yo$mixed.berry
summary(yo$all.purchases)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 2.000 1.971 2.000 21.000
qplot(x = all.purchases,data = yo, binwidth = 1,
fill = I('#099DD9'))
ggplot(aes(x = time, y = price),data = yo) +
geom_jitter(alpha = 1/4, shape = 21, fill = I('#F79420'))
x %in% y
返回一个长度与 x 相同的逻辑(布尔)向量,该向量指出 x 中的每一个条目是否都出现在 y 中。也就是说,对于 x 中的每一个条目,该向量都会检查这一条目是否也出现在 y 中。
这样,我们就能将数据子集化,从而获得样本中住户的所有购买时机了。然后,我们通过样本 ID 创建价格与时间的散点图和分面。
在绘制散点时,使用 pch 或 shape 参数来指定符号
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)
ggplot(aes(x = time, y = price),
data = subset(yo, id %in% sample.ids))+
facet_wrap( ~ id ) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1 )
install.packages('GGally')
来安装包,以创建此特定的散点图矩阵。
如果图形需要很长时间才能呈现,或者如果你想查看散点图矩阵的其中一部分,你只需检查少量的变量。你可以使用以下代码或选择较少的变量。我们建议将性别(第 6 个变量)包含在内! pf_subset <- pf[ , c(2:7)]
你可能还会发现:变量标签是在散点图矩阵的外边缘上,而非对角线上。如果你希望标签在对角线上,你可以在 ggpairs 命令中设置 axisLabels = 'internal'
参数。
install.packages('GGally')
library(GGally)
theme_set(theme_minimal(20))
set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
ggpairs(pf_subset[sample.int(nrow(pf_subset),1000), ])
下面使用基因数据集
https://s3.amazonaws.com/udacity-hosted-downloads/ud651/nci.tsv
library(reshape2)
nci <- read.table('nci.tsv')
colnames(nci) <- c(1:64)
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c('gene', 'case', 'value')
head(nci.long.samp)
ggplot (aes(y = gene, x = case, fill = value),
data = nci.long.samp ) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c('blue',
'red'))(100))