首先,这是我日常逛 twitter 时看到的,然后我又当了一回搬运工(emmm,这个系列都没有询问作者意见,后面如果有意外就删了)。
emmm, 后面要好好学习一下 igraph 和 ggraph 的骚操作,虽然图出来了,但是可视化那部分我暂时还是没看懂。所以这里就只是学习了数据清洗操作。我的妈诶,再次强烈推荐 《 R for data science 》,你看这里面哪一个操作是脱离这本书的函数操作。
- 书籍在线版:R for Data Science
- 习题链接:R for Data Science: Exercise Solutions
参考链接:
https://github.com/spren9er/tidytuesday/blob/master/tidytuesday_201946_cran_packages.r
# Global knitr chunk option: echo = TRUE for every chunk of this document.
knitr::opts_chunk$set(echo = TRUE)
加载包
library(tidyverse)
library(igraph)
library(ggraph)
读取数据
# Base URL of the TidyTuesday 2019-11-12 data folder on GitHub.
path <- paste0(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/',
  'master/data/2019/2019-11-12/'
)

# Read the CSV straight from the URL; `path` is used just like a local
# directory here. Downloading the file first and reading the local copy
# would work as well.
data <- path %>%
  paste0('loc_cran_packages.csv') %>%
  read_csv()

# Column specification reported by read_csv() for `data`:
# Parsed with column specification:
# cols(
#   file = col_double(),
#   language = col_character(),
#   blank = col_double(),
#   comment = col_double(),
#   code = col_double(),
#   pkg_name = col_character(),
#   version = col_character()
# )

# Show the first rows of the data.
head(data)
# A tibble: 6 x 7
file language blank comment code pkg_name version
1 2 R 96 353 365 A3 1.0.0
2 1 HTML 347 5 2661 aaSEA 1.0.0
3 23 R 63 325 676 aaSEA 1.0.0
4 3 HTML 307 9 1275 abbyyR 0.5.5
5 30 R 224 636 587 abbyyR 0.5.5
6 5 Markdown 246 0 418 abbyyR 0.5.5
数据清洗
- Tiobe index 链接
# Popular programming languages taken from the Tiobe Index (Nov. 2019).
popular_languages <- c(
  'Java', 'C', 'Python', 'C++', 'C#', 'Visual Basic', 'JavaScript',
  'PHP', 'SQL', 'Ruby', 'Objective C++', 'Assembly', 'R'
)

# How many of the largest packages are kept for the plot.
number_of_pkgs <- 300

# Colour assigned to each language; the order of this list also defines the
# factor level order used later on.
top_language_colors <- list(
  'Assembly'   = '#efb306',
  'C'          = '#eb990c',
  'C++'        = '#e8351e',
  'JavaScript' = '#852f88',
  'Java'       = '#cd023d',
  'R'          = '#7db954',
  'Python'     = '#0f8096',
  'Ruby'       = '#4e54ac',
  'SQL'        = '#17a769',
  'All'        = '#000000'
)

# Flatten the list into an unnamed character vector of colour codes ...
colors <- unlist(top_language_colors, use.names = FALSE)
# ... and keep the matching names as the level order.
levels <- names(top_language_colors)
- 结合使用 filter()、group_by()、summarize()、arrange()、select()、mutate() 等函数来处理数据
# The `number_of_pkgs` packages with the most lines of code written in the
# popular languages, one row per package (pkg_name, total_code).
top_packages <- data %>%
  # keep only rows whose language is listed in `popular_languages`
  filter(language %in% popular_languages) %>%
  # sum the lines of code of each package
  group_by(pkg_name) %>%
  summarise(total_code = sum(code)) %>%
  # largest packages first, then keep the leading `number_of_pkgs` rows
  arrange(desc(total_code)) %>%
  head(n = number_of_pkgs) %>%
  # keep the package name and its total only
  select(pkg_name, total_code)
# Per-language rows of the selected packages, each carrying the package-wide
# `total_code` computed in `top_packages`.
top_languages_per_pkg <- data %>%
  # restrict to the selected packages and to the popular languages
  filter(
    language %in% popular_languages &
      pkg_name %in% top_packages$pkg_name
  ) %>%
  # left_join() keeps every row of the left table and appends the
  # `total_code` column of the matching package
  left_join(top_packages, by = 'pkg_name') %>%
  # turn `language` into a factor ordered as in `levels`; values that are
  # not listed in `levels` become NA
  mutate(language = factor(language, levels = levels)) %>%
  # sort by language first, then by package name
  arrange(language, pkg_name)
# Total lines of code per language across the selected packages.
top_languages <- top_languages_per_pkg %>%
  # one group per language, summing `code` within each group
  group_by(language) %>%
  summarise(total_code = sum(code)) %>%
  # drop any remaining grouping and sort by language
  ungroup() %>%
  arrange(language)
# Level-1 vertices: one row per `to` value of `edges1` (the package names),
# coloured by the `from` value of the row with the largest `total_code`.
# NOTE(review): `edges1` is not defined in the visible part of this file; it
# has to exist (with columns from, to, total_code) before this runs.
vertices1 <- edges1 %>%
  # within each package keep the row(s) with the highest total_code
  group_by(to) %>%
  top_n(1, total_code) %>%
  ungroup() %>%
  # build the vertex columns; the language becomes the colour factor
  transmute(
    node = to,
    label = to,
    code = total_code,
    color = factor(from, levels = levels)
  ) %>%
  # append the package total: total_code summed over all of its edges
  left_join(
    edges1 %>% count(to, wt = total_code, name = 'total_code'),
    by = c('node' = 'to')
  ) %>%
  mutate(level = 1) %>%
  arrange(color, node)
# Add an `alpha` flag to every edge of `edges1`: it takes the `level` value of
# the matching `vertices1` row (1) when the edge's (to, from) pair equals that
# vertex's (node, color), and 0 when there is no match.
edges1 <- left_join(
  edges1,
  edges1 %>%
    left_join(vertices1, by = c('to' = 'node', 'from' = 'color')) %>%
    transmute(from, to, alpha = level) %>%
    replace_na(list(alpha = 0)),
  # Explicit keys instead of the implicit natural join: no "Joining, by ..."
  # message, and the join cannot silently pick up other shared columns.
  # Assumes `edges1` has no `alpha` column yet.
  by = c('from', 'to')
)

# All edges of the graph.
# NOTE(review): `edges2` is not defined in the visible part of this file.
edges <- bind_rows(edges1, edges2)
# Level-2 vertices: one row per `to` value of `edges2`, used as node, label
# and colour at once.
# NOTE(review): `edges2` is not defined in the visible part of this file.
vertices2 <- edges2 %>%
  # sort first, then reshape into the vertex columns
  arrange(to) %>%
  transmute(
    node = to,
    label = to,
    code = total_code,
    color = to,
    total_code,
    level = 2
  )
# Level-3 vertex: a single row with an empty label, no colour and no code.
vertices3 <- tibble(
  node = 'All', label = '', code = 0, color = NA, total_code = 0, level = 3
)

# Stack all vertex levels and derive `radius` from total_code; `radius` is
# what the plot maps to the point size.
vertices <- bind_rows(vertices1, vertices2, vertices3) %>%
  mutate(radius = total_code^1.8)
绘图
# Draw the circular graph.
# NOTE(review): `layout` is not defined in the visible part of this file; it
# has to be created before this call.
# NOTE(review): the 'Oswald' font family must be available to the graphics
# device, otherwise a fallback font is likely used - confirm.
ggraph(layout, circular = TRUE) +
  # edges: colour taken from node1.color, transparency from the edge's
  # `alpha` flag (mapped to 0.15 / 1 by the alpha scale below)
  geom_edge_diagonal(
    aes(edge_color = node1.color, edge_alpha = as.factor(alpha)),
    edge_width = 0.3, show.legend = FALSE
  ) +
  # vertices: point size driven by `radius`
  geom_node_point(
    aes(size = radius, color = color),
    alpha = 0.6, show.legend = FALSE
  ) +
  # labels of all nodes that are not languages, pushed slightly outwards and
  # rotated by an angle derived from node_angle(); presumably the modulo
  # keeps the text from being drawn upside down - verify
  geom_node_text(
    aes(
      x = 1.0175 * x,
      y = 1.0175 * y,
      label = label,
      angle = -((-node_angle(x, y) + 90) %% 180) + 90,
      filter = !(label %in% top_languages$language)
    ),
    size = 2, hjust = 'outward', family = 'Oswald'
  ) +
  # language names, drawn larger at the node position
  geom_node_text(
    aes(
      x = x,
      y = y,
      label = label,
      filter = label %in% top_languages$language
    ),
    size = 6, hjust = 0.5, family = 'Oswald',
    point.padding = NA, repel = TRUE
  ) +
  # lines of code per language, placed just below the language name;
  # values above 1000 get a thousands separator
  geom_node_text(
    aes(
      x = x,
      y = y - 0.045,
      label = ifelse(
        total_code > 1000,
        format(total_code, big.mark = ','),
        total_code
      ),
      filter = label %in% top_languages$language
    ),
    size = 3, hjust = 0.5, family = 'Oswald',
    point.padding = NA, repel = TRUE
  ) +
  # `guide = FALSE` is deprecated in ggplot2 (>= 3.3.4); 'none' is the
  # supported way to suppress a legend
  scale_edge_color_manual(values = colors, guide = 'none') +
  scale_color_manual(values = colors, guide = 'none') +
  scale_size_area(max_size = 150, guide = 'none') +
  scale_edge_alpha_manual(values = c(0.15, 1), guide = 'none') +
  coord_fixed() +
  # NOTE(review): the subtitle mentions 16 languages while
  # `popular_languages` in this file lists 13 - confirm which is intended
  labs(
    title = 'LOC of Popular Programming Languages in 300 CRAN Packages',
    subtitle = 'considered are largest CRAN packages written in one (or more) of top 16 programming languages from Tiobe Index (Nov. 2019)',
    caption = '#tidytuesday 46|2019 spren9er'
  ) +
  theme_void() +
  theme(
    text = element_text(family = 'Oswald'),
    legend.position = c(0.645, 0.51),
    plot.title = element_text(
      face = 'bold', hjust = 0.5, size = 20, margin = margin(t = 45, b = 3)
    ),
    plot.subtitle = element_text(
      face = 'plain', hjust = 0.5, size = 13, margin = margin(t = 5, b = 3)
    ),
    plot.caption = element_text(
      face = 'plain', color = '#dedede', size = 8, hjust = 1,
      margin = margin(b = 20)
    )
  )
# Write the most recently displayed plot to a local PNG file
# (12 x 12.5 at 300 dpi). The target is an absolute path on the author's
# machine; adjust it before running elsewhere.
ggsave(
  filename = 'E://swrj/R3.6/Tidytuesday/tidytuesday_201946_cran_packages.png',
  plot = last_plot(),
  width = 12,
  height = 12.5,
  dpi = 300
)
- 出图效果