项目开始于: 10Sep2020 最后更新于: 14Sep2020
目的:
为完成2020年“NCIS医疗质量控制数据收集系统”数据上传,计算“2019年三级、二级综合医院:医疗质量管理控制情况调查表”某些指标而建立
前提条件
您需要安装
- R, 必须(具体版本或可参考本文最后的信息)
- Rstudio, 或许不必要,我未测试
- 设置环境变量 --> 可选,非必须,参考 https://mp.weixin.qq.com/s/O1Lkfql058avzBCqL_Wdsw
操作
如果您的机器已设置环境变量,双击 “_run.bat”即可运行,等待黑框框自动消失,在"res.txt"查看结果,注意该文件生成的时间应当是当前系统时间
如果您的机器未设置环境变量,双击 “quality_data_analysis_RJL_10Sep2020.Rproj”,运行do_part0_run.R后,等待运行结果,在"res.txt"查看结果,注意该文件生成的时间应当是当前系统时间
文件夹和文件说明(文件夹树见在后面)
- "res.txt"中查看结果
- “变量含义对照表.txt”中记录了本次分析中所使用到的变量名及其意义,来自于“HQMS数据对接接口标准.pdf”文档
- 因为提供的数据是分月的,在“data_source”文件夹中,进行和合并(添加行),“mydata_after_stacking.RData”为合并后的数据(您可以转化后用其他软件做其他的分析)
- 以".R"结束的代码为真正的分析过程代码,应当很明了,分了不同的版块
- “do_part0_run.R”中可以设置运行哪些版块,有时候我们并不需要计算所有指标
- 我成功运行的平台,信息如下
R version 4.0.2 (2020-06-22)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Matrix products: default
locale:
[1] LC_COLLATE=Chinese (Simplified)_People's Republic of China.936
[2] LC_CTYPE=Chinese (Simplified)_People's Republic of China.936
[3] LC_MONETARY=Chinese (Simplified)_People's Republic of China.936
[4] LC_NUMERIC=C
[5] LC_TIME=Chinese (Simplified)_People's Republic of China.936
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.1 purrr_0.3.4
[5] readr_1.3.1 tidyr_1.1.1 tibble_3.0.3 ggplot2_3.3.2
[9] tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] Rcpp_1.0.5 cellranger_1.1.0 pillar_1.4.6 compiler_4.0.2
[5] dbplyr_1.4.4 tools_4.0.2 jsonlite_1.7.0 lubridate_1.7.9
[9] lifecycle_0.2.0 gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.7
[13] reprex_0.3.0 cli_2.0.2 DBI_1.1.0 rstudioapi_0.11
[17] haven_2.3.1 xfun_0.16 withr_2.2.0 xml2_1.3.2
[21] httr_1.4.2 fs_1.5.0 generics_0.0.2 vctrs_0.3.2
[25] hms_0.5.3 grid_4.0.2 tidyselect_1.1.0 glue_1.4.1
[29] R6_2.4.1 fansi_0.4.1 readxl_1.3.1 modelr_0.1.8
[33] blob_1.2.1 magrittr_1.5 backports_1.1.7 scales_1.1.1
[37] ellipsis_0.3.1 rvest_0.3.6 assertthat_0.2.1 colorspace_1.4-1
[41] utf8_1.1.4 tinytex_0.25 stringi_1.4.6 munsell_0.5.0
[45] broom_0.7.0 crayon_1.3.4
文件夹结构
quality_data_analysis_RJL_10Sep2020
├── data_source
│ ├── HQMS_db_10_1718.csv
│ ├── HQMS_db_11_1907.csv
│ ├── HQMS_db_12_1891.csv
│ ├── HQMS_db_1_1974.csv
│ ├── HQMS_db_2 1521.csv
│ ├── HQMS_db_3_2131.csv
│ ├── HQMS_db_4_1903.csv
│ ├── HQMS_db_5_1792.csv
│ ├── HQMS_db_6_1863.csv
│ ├── HQMS_db_7_1995.csv
│ ├── HQMS_db_8_1919.csv
│ └── HQMS_db_9_1726.csv
├── do_part0_functions.R
├── do_part0_run.R
├── do_part0_setupAndGetData.R
├── do_part1_20MainDiseases.R
├── do_part2_20MainOpreations.R
├── do_part3_16MainTumorsWithoutOpreatins.R
├── do_part4_14MainTumorsWithOpreatins.R
├── HQMS数据对接接口标准.pdf
├── mydata_after_stacking.RData
├── quality_data_analysis_RJL_10Sep2020.Rproj
├── README.txt
├── res.txt
├── _run.bat
├── 变量含义对照表.txt
└── 需核对项目.rtf
代码
_run.bat
Rscript do_part0_run.R
exit
do_part0_run.R
source("do_part0_functions.R", echo = F)
source("do_part0_setupAndGetData.R", echo = F)
res2file <- T
if (res2file) sink("res.txt")
my_br(date())
if (T) my_br(1); source("do_part1_20MainDiseases.R", echo = T)
if (T) my_br(2); source("do_part2_20MainOpreations.R", echo = T)
if (T) my_br(3); source("do_part3_16MainTumorsWithoutOpreatins.R", echo = T)
if (T) my_br(4); source("do_part4_16MainTumorsWithOpreatins.R", echo = T)
if (res2file) sink()
do_part0_functions.R
# functions ---------------------------------------------------------------
list2df <- function(list) {
as_tibble(do.call(rbind, list))
}
toreg <- function(x) str_c("^", str_replace(x, "\\.", "\\\\."))
mul_detect <- function(mat, strs) {
out <- list(length = length(strs))
for (i in seq_along(strs)) {
out[[i]] <- str_detect(mat, strs[i])
}
reduce(out, `|`)
}
mymulfilter <- function(df, nm, cond_strs, keep = TRUE) {
mat_data <- as.matrix(df[, nm])
vec_logic <- mul_detect(mat_data, cond_strs)
mat_logic <- matrix(vec_logic, nrow(mat_data), ncol(mat_data))
ind <- apply(mat_logic,1, any, na.rm = TRUE)
if (keep) df[ind, ] else df[!ind, ]
}
mysum <- function(df) {
summarise(df,
number_cases = n(),
death_cases = sum(.data$P741 == "5"),
days_hopital = sum(P27),
cost = sum(P782, na.rm = T))
}
my_br <- function(number) {
print("---------------------------------------------------------")
print(str_c("*************************part ", number, "**************************"))
print("---------------------------------------------------------")
}
do_part0_setupAndGetData.R
# setup and prepare data --------------------------------------------------
pri_diag <- "P321"
diagnosis <- c("P321", "P324", "P327", "P3291", "P3294",
"P3297", "P3281", "P3284", "P3287", "P3271", "P3274")
op <- c("P490", "P4911", "P4922", "P4533", "P4544",
"P45002", "P45014", "P45026", "P45038", "P45050")
library(tidyverse)
file_nms <- dir("./data_source")
out <- list(length = length(file_nms))
for (i in seq_along(file_nms)) {
out[[i]] <- read.csv(str_c("./data_source/", file_nms[i]))
}
mydata <- list2df(out)
save(mydata, file = "mydata_after_stacking.RData")
mydata2 <- mydata %>% select(P3, P4, P26, P27, P741,
P321,
P324, P327, P3291, P3294, P3297, P3281, P3284, P3287, P3271, P3274,
P490, P4911, P4922, P4533, P4544, P45002, P45014, P45026, P45038, P45050,
P782)
names(mydata2) # please check the api document to confirm
do_part1_20MainDiseases.R
tempfil <- function(df) {
mymulfilter(df, pri_diag, keep_diag, TRUE) %>%
mymulfilter(diagnosis, "Z37", FALSE) %>%
mymulfilter(op, drop_op, FALSE)
}
# filter the interested rows and compute statistics------------------------
keep_diag <- c(str_c("I21.", 0:3), "I21.4", "I21.9") %>% toreg
drop_op <- c("74.0", "74.1", "74.2", "74.4", "74.99", "37.2") %>% toreg
mydata2 %>% tempfil %>% mysum
# ---
keep_diag <- c("I105", "I106", "I107", "I108", "I109", "I11", "I12", "I13", "I20")
drop_op <- c("74.0", "74.1", "74.2", "74.4", "74.99", "35",
"36", "37", "38", "39") %>% toreg
mydata2 %>%
mymulfilter(diagnosis, keep_diag, TRUE) %>%
mymulfilter(diagnosis, "Z37", FALSE) %>%
mymulfilter(op, drop_op, FALSE) %>%
mysum
# ---
keep_diag <- c("I60", "I61", "I62", "I63")
drop_op <- c("74.0", "74.1", "74.2", "74.4", "74.99") %>% toreg
mydata2 %>% tempfil %>% mysum
# ---
keep_diag <- c("S06")
mydata2 %>% tempfil %>% mysum
# ---
keep_diag <- c("K25.0", "K25.2", "K25.4", "K25.6",
"K26.0", "K26.2", "K26.4", "K26.6",
"K27.0", "K27.2", "K27.4", "K27.6",
"K28.0", "K28.2", "K28.4", "K28.6",
"K29.0", "K29.2") %>% toreg()
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- str_c("T0", 1:7) %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c(str_c("J", 12:16), "J18") %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- "J44" %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c(str_c("E1", 0:4, ".1"), str_c("E1", 0:4, ".0")) %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c(str_c("E10", ".", 2:8),
str_c("E11", ".", 2:8),
str_c("E12", ".", 2:8),
str_c("E13", ".", 2:8),
str_c("E14", ".", 2:8)
) %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- "E04." %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("K35.0", "K35.1") %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- "N40" %>% toreg())
mydata2 %>% mymulfilter(pri_diag, keep_diag, TRUE) %>% mysum
# ---
(keep_diag <- c("N17","N18", "N19") %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("A40","A41", "A22.7", "A26.7", "A28.001", "A32.7", "B37.7") %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- str_c("I1", 0:5) %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- "K85" %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("Z51.1", "Z51.2", "Z51.8") %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("S71", "S72", "S73", "S82", "S83") %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("J45", "J46") %>% toreg)
mydata2 %>% tempfil %>% mysum
# ---
# --- the last disease do not need to compute, Cheng gives it to me
do_part2_20MainOpreations.R
# operation we interested -------------------------------------------------
(keep_op <- c(str_c("00.", 70:77), str_c("00.", 80:83), str_c("81.", 51:55)))
mydata2 %>% mymulfilter(op, keep_op) %>% mysum
# --- ok
myfil_sum <- function(keep_op1) {
mysum(mymulfilter(
mydata2,
c("P490", "P4911", "P4922", "P4533", "P4544", "P45002", "P45014", "P45026", "P45038", "P45050"),
keep_op1,
TRUE))
}
c(str_c("03.0", 1:9), str_c("03.", 40:79), str_c("81.0", 1:9),
str_c("81.", 10:38), str_c("81.", 62:66), str_c("84.", 61:68)) %>% toreg %>% myfil_sum
# --- ok
c(str_c("79.", 31:39), str_c("79.8", 1:9)) %>% toreg %>% myfil_sum()
# --- ok
c(str_c("01.", 21:59), str_c("02.0", 1:9), str_c("02.", 10:99)) %>%
toreg %>%
myfil_sum
# ---
rfs <- function(str) {
print(str %>% toreg)
str %>% toreg %>% myfil_sum
}
str_c("00.6", 1:5) %>% rfs #ok
str_c("36.1", 0:7) %>% rfs #ok
c("00.66", "36.06", "36.07") %>% rfs #ok
str_c("35.2", 1:8) %>% rfs #ok
str_c("42.", 41:65) %>% rfs #ok
str_c("32.", 20:60) %>% rfs #ok
str_c("52.", 51:96) %>% rfs #ok
str_c("43.", 50:99) %>% rfs #ok
str_c("48.", 40:69) %>% rfs #ok
seq(51.03, 51.99, .01) %>% sprintf("%.2f", .) %>% as.character %>% rfs #ok
seq(85.21, 85.89, .01) %>% sprintf("%.2f", .) %>% as.character %>% rfs #ok
c(seq(55.40, 55.69, .01), seq(60.21, 60.69, .01)) %>% sprintf("%.2f", .) %>% as.character %>% rfs #ok
c(seq(38.02, 38.18, .01), seq(38.30, 38.89, .01), seq(39.00, 39.59, .01)) %>% sprintf("%.2f", .) %>% as.character %>% rfs #ok
scrfs <- function(str_) str_ %>% sprintf("%.2f", .) %>% as.character %>% rfs
seq(68.41, 68.90, .01) %>% scrfs()
c("74.0", "74.1", "74.2", "74.4", "74.99") %>% rfs
c(seq(72.0, 72.29, .01), seq(73.01, 73.21, .01), seq(73.40, 73.94, .01)) %>%
sprintf("%.2f", .) %>%
toreg %>%
mymulfilter(mydata2, op, ., T) %>%
mymulfilter(., diagnosis, "Z37", T) %>%
mysum() # the last disease do not need to compute, Cheng gives it to me
do_part3_16MainTumorsWithoutOpreatins.R
t_sum <- function(str) {
print(str)
mymulfilter(df = mydata2, nm = diagnosis, cond_strs = toreg(str)) %>%
mymulfilter(nm = diagnosis,
cond_strs = toreg(c("Z51.001", "Z51.002", "Z51.003", "Z51.101",
"Z51.102", "Z51.103", "Z51.202", "Z51.203",
"Z51.204", "Z51.205", "Z51.206", "Z51.207",
"Z51.502"))) %>%
mysum
}
list(
"C34",
c("C18", "C19", "C29"),
"C16",
"C50",
"C22",
"C15",
"C25",
"C67",
"C64",
c("C54", "D06"),
"C73",
"C32",
"C56",
"C61",
"C11",
str_c("C", 81:85)
) %>%
map(t_sum)
do_part4_14MainTumorsWithOpreatins.R
to_sum <- function(str_diseases, str_opreations) {
print(
c(
str_diseases,
"op ---->", str_opreations)
)
mymulfilter(df = mydata2, nm = diagnosis, cond_strs = toreg(str_diseases), keep = T) %>%
mymulfilter(nm = op, cond_strs = toreg(str_opreations), keep = T) %>%
mysum
}
mymap2 <- function(myfun, x, y) map2(x, y, myfun)
to_sum %>%
mymap2(
list(
"C34",
c("C18", "C19", "C20"),
"C16",
"C50",
"C22",
"C15",
"C25",
"C67",
"C64",
c("C53", "D06"),
"C73",
"C32",
"C56"
),
list(
c("32.4", "32.5", "32.6"),
c("45.7", "48.4", "48.5", "48.6"),
c("43.5", "43.6", "43.7", "43.9"),
c("85.4", "85.21"),
c("50.2", "50.3", "50.4", "50.5"),
c("42.5", "42.6"),
c("52.5", "52.7"),
"57.7",
c("55.3", "55.5"),
c("40.59", "65.6", "67.2", "68.4"),
str_c("06.", 2:5),
c("30.3", "30.4"),
c("65.6", "40.59")
)
) # the last disease(13) do not need to compute, Cheng gives it to me, 4 cases
mydata %>%
dplyr::filter(P7 >= 18) %>%
mymulfilter(nm = diagnosis, cond_strs = "C61" %>% toreg, keep = T) %>%
mymulfilter(nm = op, cond_strs = "60.5" %>% toreg, keep = T) %>%
mysum
# item 13
mydata2 %>%
filter(P3 %in% c("00119459", "00116203", "00122887", "00114805")) # no such cases