病案首页汇总数据_15Sep2020

项目开始于: 10Sep2020 最后更新于: 14Sep2020

目的:

为完成2020年“NCIS医疗质量控制数据收集系统”数据上传,计算“2019年三级、二级综合医院:医疗质量管理控制情况调查表”某些指标而建立

前提条件

您需要安装

  • R, 必须(具体版本或可参考本文最后的信息)
  • Rstudio, 或许不必要,我未测试
  • 设置环境变量 --> 可选,非必须,参考 https://mp.weixin.qq.com/s/O1Lkfql058avzBCqL_Wdsw

操作

  • 如果您的机器已设置环境变量,双击 “_run.bat”即可运行,等待黑框框自动消失,在"res.txt"查看结果,注意该文件生成的时间应当是当前系统时间

  • 如果您的机器未设置环境变量,双击 “quality_data_analysis_RJL_10Sep2020.Rproj”,运行do_part0_run.R后,等待运行结果,在"res.txt"查看结果,注意该文件生成的时间应当是当前系统时间

文件夹和文件说明(文件夹树见在后面)

  • "res.txt"中查看结果
  • “变量含义对照表.txt”中记录了本次分析中所使用到的变量名及其意义,来自于“HQMS数据对接接口标准.pdf”文档
  • 因为提供的数据是分月的,在“data_source”文件夹中,进行和合并(添加行),“mydata_after_stacking.RData”为合并后的数据(您可以转化后用其他软件做其他的分析)
  • 以".R"结束的代码为真正的分析过程代码,应当很明了,分了不同的版块
  • “do_part0_run.R”中可以设置运行哪些版块,有时候我们并不需要计算所有指标
  • 我成功运行的平台,信息如下
R version 4.0.2 (2020-06-22)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Matrix products: default

locale:
[1] LC_COLLATE=Chinese (Simplified)_People's Republic of China.936 
[2] LC_CTYPE=Chinese (Simplified)_People's Republic of China.936   
[3] LC_MONETARY=Chinese (Simplified)_People's Republic of China.936
[4] LC_NUMERIC=C                                                   
[5] LC_TIME=Chinese (Simplified)_People's Republic of China.936    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods  
[7] base     

other attached packages:
[1] forcats_0.5.0   stringr_1.4.0   dplyr_1.0.1     purrr_0.3.4    
[5] readr_1.3.1     tidyr_1.1.1     tibble_3.0.3    ggplot2_3.3.2  
[9] tidyverse_1.3.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.5       cellranger_1.1.0 pillar_1.4.6     compiler_4.0.2  
 [5] dbplyr_1.4.4     tools_4.0.2      jsonlite_1.7.0   lubridate_1.7.9 
 [9] lifecycle_0.2.0  gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.7     
[13] reprex_0.3.0     cli_2.0.2        DBI_1.1.0        rstudioapi_0.11 
[17] haven_2.3.1      xfun_0.16        withr_2.2.0      xml2_1.3.2      
[21] httr_1.4.2       fs_1.5.0         generics_0.0.2   vctrs_0.3.2     
[25] hms_0.5.3        grid_4.0.2       tidyselect_1.1.0 glue_1.4.1      
[29] R6_2.4.1         fansi_0.4.1      readxl_1.3.1     modelr_0.1.8    
[33] blob_1.2.1       magrittr_1.5     backports_1.1.7  scales_1.1.1    
[37] ellipsis_0.3.1   rvest_0.3.6      assertthat_0.2.1 colorspace_1.4-1
[41] utf8_1.1.4       tinytex_0.25     stringi_1.4.6    munsell_0.5.0   
[45] broom_0.7.0      crayon_1.3.4 

文件夹结构

quality_data_analysis_RJL_10Sep2020
 ├── data_source
 │   ├── HQMS_db_10_1718.csv
 │   ├── HQMS_db_11_1907.csv
 │   ├── HQMS_db_12_1891.csv
 │   ├── HQMS_db_1_1974.csv
 │   ├── HQMS_db_2 1521.csv
 │   ├── HQMS_db_3_2131.csv
 │   ├── HQMS_db_4_1903.csv
 │   ├── HQMS_db_5_1792.csv
 │   ├── HQMS_db_6_1863.csv
 │   ├── HQMS_db_7_1995.csv
 │   ├── HQMS_db_8_1919.csv
 │   └── HQMS_db_9_1726.csv
 ├── do_part0_functions.R
 ├── do_part0_run.R
 ├── do_part0_setupAndGetData.R
 ├── do_part1_20MainDiseases.R
 ├── do_part2_20MainOpreations.R
 ├── do_part3_16MainTumorsWithoutOpreatins.R
 ├── do_part4_14MainTumorsWithOpreatins.R
 ├── HQMS数据对接接口标准.pdf
 ├── mydata_after_stacking.RData
 ├── quality_data_analysis_RJL_10Sep2020.Rproj
 ├── README.txt
 ├── res.txt
 ├── _run.bat
 ├── 变量含义对照表.txt
 └── 需核对项目.rtf

代码

_run.bat

Rscript do_part0_run.R
exit

do_part0_run.R

source("do_part0_functions.R", echo = F)
source("do_part0_setupAndGetData.R", echo = F)

res2file <- T
if (res2file) sink("res.txt")

my_br(date())

if (T) my_br(1); source("do_part1_20MainDiseases.R", echo = T)
if (T) my_br(2); source("do_part2_20MainOpreations.R", echo = T)
if (T) my_br(3); source("do_part3_16MainTumorsWithoutOpreatins.R", echo = T)
if (T) my_br(4); source("do_part4_16MainTumorsWithOpreatins.R", echo = T)

if (res2file) sink()

do_part0_functions.R

# functions ---------------------------------------------------------------

list2df <- function(list) {
  as_tibble(do.call(rbind, list))
}

toreg <- function(x) str_c("^", str_replace(x, "\\.", "\\\\."))

mul_detect <- function(mat, strs) {
  out <- list(length = length(strs))
  for (i in  seq_along(strs)) {
    out[[i]] <- str_detect(mat, strs[i])
  }
  reduce(out, `|`)
}

mymulfilter <- function(df, nm, cond_strs, keep = TRUE) {
  mat_data <- as.matrix(df[, nm])
  vec_logic <- mul_detect(mat_data, cond_strs)
  mat_logic <- matrix(vec_logic, nrow(mat_data), ncol(mat_data))
  ind <- apply(mat_logic,1, any, na.rm = TRUE)
  if (keep) df[ind, ] else df[!ind, ] 
}

mysum <- function(df) {
  summarise(df, 
            number_cases = n(),
            death_cases = sum(.data$P741 == "5"), 
            days_hopital = sum(P27), 
            cost = sum(P782, na.rm = T))
}

my_br <- function(number) {
  print("---------------------------------------------------------")
  print(str_c("*************************part ", number, "**************************"))
  print("---------------------------------------------------------")
}

do_part0_setupAndGetData.R


# setup and prepare data --------------------------------------------------

pri_diag <- "P321"
diagnosis <- c("P321", "P324", "P327", "P3291", "P3294", 
               "P3297", "P3281", "P3284", "P3287", "P3271", "P3274")
op <- c("P490", "P4911", "P4922", "P4533", "P4544", 
        "P45002", "P45014", "P45026", "P45038", "P45050")


library(tidyverse)
file_nms <- dir("./data_source")

out <- list(length = length(file_nms))
for (i in seq_along(file_nms)) {
  out[[i]] <- read.csv(str_c("./data_source/", file_nms[i]))
}

mydata <- list2df(out)
save(mydata, file = "mydata_after_stacking.RData")

mydata2 <- mydata %>% select(P3, P4, P26, P27, P741, 
                             P321, 
                             P324, P327, P3291, P3294, P3297, P3281, P3284, P3287, P3271, P3274,
                             P490, P4911, P4922, P4533, P4544, P45002, P45014, P45026, P45038, P45050,
                             P782)
names(mydata2) # please check the api document to confirm

do_part1_20MainDiseases.R

tempfil <- function(df) {
  mymulfilter(df, pri_diag, keep_diag, TRUE) %>% 
    mymulfilter(diagnosis, "Z37", FALSE) %>%
    mymulfilter(op, drop_op, FALSE)
}

# filter the interested rows and compute statistics------------------------

keep_diag <- c(str_c("I21.", 0:3), "I21.4", "I21.9") %>% toreg
drop_op <- c("74.0", "74.1", "74.2", "74.4", "74.99", "37.2") %>% toreg
mydata2 %>% tempfil %>% mysum
# ---
keep_diag <- c("I105", "I106", "I107", "I108", "I109", "I11", "I12", "I13", "I20")
drop_op <- c("74.0", "74.1", "74.2", "74.4", "74.99", "35", 
             "36", "37", "38", "39") %>% toreg
mydata2 %>%
  mymulfilter(diagnosis, keep_diag, TRUE) %>% 
  mymulfilter(diagnosis, "Z37", FALSE) %>%
  mymulfilter(op, drop_op, FALSE) %>%
  mysum
# ---
keep_diag <- c("I60", "I61", "I62", "I63")
drop_op <- c("74.0", "74.1", "74.2", "74.4", "74.99") %>% toreg
mydata2 %>% tempfil %>% mysum
# ---
keep_diag <- c("S06")
mydata2 %>% tempfil %>% mysum
# ---
keep_diag <- c("K25.0", "K25.2", "K25.4", "K25.6",
               "K26.0", "K26.2", "K26.4", "K26.6", 
               "K27.0", "K27.2", "K27.4", "K27.6",
               "K28.0", "K28.2", "K28.4", "K28.6",
               "K29.0", "K29.2") %>% toreg()
mydata2 %>% tempfil %>% mysum

# ---
(keep_diag <- str_c("T0", 1:7) %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c(str_c("J", 12:16), "J18") %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- "J44" %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c(str_c("E1", 0:4, ".1"), str_c("E1", 0:4, ".0")) %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c(str_c("E10", ".", 2:8),
                str_c("E11", ".", 2:8),
                str_c("E12", ".", 2:8),
                str_c("E13", ".", 2:8),
                str_c("E14", ".", 2:8)
) %>% toreg())

mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- "E04."  %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("K35.0", "K35.1")  %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- "N40"  %>% toreg())

mydata2 %>% mymulfilter(pri_diag, keep_diag, TRUE) %>%  mysum
# ---
(keep_diag <- c("N17","N18", "N19")  %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("A40","A41", "A22.7", "A26.7", "A28.001", "A32.7", "B37.7")  %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- str_c("I1", 0:5)  %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- "K85"  %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("Z51.1", "Z51.2", "Z51.8")  %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("S71", "S72", "S73", "S82", "S83")  %>% toreg())
mydata2 %>% tempfil %>% mysum
# ---
(keep_diag <- c("J45", "J46")  %>% toreg)
mydata2 %>% tempfil %>% mysum
# ---

# --- the last disease do not need to compute, Cheng gives it to me

do_part2_20MainOpreations.R

# operation we interested -------------------------------------------------

(keep_op <- c(str_c("00.", 70:77), str_c("00.", 80:83), str_c("81.", 51:55)))
mydata2 %>% mymulfilter(op, keep_op) %>% mysum

# --- ok

myfil_sum <- function(keep_op1) {
  mysum(mymulfilter(
    mydata2, 
    c("P490", "P4911", "P4922", "P4533", "P4544", "P45002", "P45014", "P45026", "P45038", "P45050"),
    keep_op1,
    TRUE))
}

c(str_c("03.0", 1:9), str_c("03.", 40:79), str_c("81.0", 1:9), 
  str_c("81.", 10:38), str_c("81.", 62:66), str_c("84.", 61:68)) %>% toreg %>% myfil_sum

# --- ok
c(str_c("79.", 31:39), str_c("79.8", 1:9)) %>% toreg %>% myfil_sum()

# --- ok
c(str_c("01.", 21:59), str_c("02.0", 1:9), str_c("02.", 10:99)) %>%
  toreg %>%
  myfil_sum 

# ---

rfs <- function(str) {
  print(str %>% toreg) 
  str %>% toreg %>% myfil_sum
}

str_c("00.6", 1:5) %>% rfs  #ok
str_c("36.1", 0:7) %>% rfs  #ok
c("00.66", "36.06", "36.07") %>% rfs #ok
str_c("35.2", 1:8) %>% rfs #ok
str_c("42.", 41:65) %>% rfs #ok
str_c("32.", 20:60) %>% rfs #ok
str_c("52.", 51:96) %>% rfs #ok
str_c("43.", 50:99) %>% rfs #ok
str_c("48.", 40:69) %>% rfs #ok
seq(51.03, 51.99, .01) %>% sprintf("%.2f", .) %>% as.character %>% rfs #ok
seq(85.21, 85.89, .01) %>% sprintf("%.2f", .) %>% as.character %>% rfs #ok
c(seq(55.40, 55.69, .01), seq(60.21, 60.69, .01)) %>% sprintf("%.2f", .) %>% as.character %>% rfs #ok
c(seq(38.02, 38.18, .01), seq(38.30, 38.89, .01), seq(39.00, 39.59, .01)) %>% sprintf("%.2f", .) %>% as.character %>% rfs #ok
scrfs <- function(str_) str_ %>% sprintf("%.2f", .) %>% as.character %>% rfs
seq(68.41, 68.90, .01) %>% scrfs()
c("74.0", "74.1", "74.2", "74.4", "74.99") %>% rfs
c(seq(72.0, 72.29, .01), seq(73.01, 73.21, .01), seq(73.40, 73.94, .01)) %>% 
  sprintf("%.2f", .) %>%
  toreg %>%
  mymulfilter(mydata2, op, ., T) %>%
  mymulfilter(., diagnosis, "Z37", T) %>%
  mysum() # the last disease do not need to compute, Cheng gives it to me

do_part3_16MainTumorsWithoutOpreatins.R

t_sum <- function(str) {
  print(str)
  mymulfilter(df = mydata2, nm = diagnosis, cond_strs = toreg(str)) %>%
    mymulfilter(nm = diagnosis, 
                cond_strs = toreg(c("Z51.001", "Z51.002", "Z51.003", "Z51.101", 
                                    "Z51.102", "Z51.103", "Z51.202", "Z51.203", 
                                    "Z51.204", "Z51.205", "Z51.206", "Z51.207", 
                                    "Z51.502"))) %>%
    mysum
}

list(
  "C34",
  c("C18", "C19", "C29"),
  "C16",
  "C50",
  "C22",
  "C15",
  "C25",
  "C67",
  "C64",
  c("C54", "D06"),
  "C73",
  "C32",
  "C56",
  "C61",
  "C11",
  str_c("C", 81:85)
) %>%
  map(t_sum)

do_part4_14MainTumorsWithOpreatins.R

to_sum <- function(str_diseases, str_opreations) {
  print(
    c(
      str_diseases, 
      "op ---->", str_opreations)
    )
  
  mymulfilter(df = mydata2, nm = diagnosis, cond_strs = toreg(str_diseases), keep = T) %>%
  mymulfilter(nm = op, cond_strs = toreg(str_opreations), keep = T) %>%
  mysum
}

mymap2 <- function(myfun, x, y) map2(x, y, myfun)

to_sum %>% 
  mymap2(
    list(
      "C34",
      c("C18", "C19", "C20"),
      "C16",
      "C50",
      "C22",
      "C15",
      "C25",
      "C67",
      "C64",
      c("C53", "D06"),
      "C73",
      "C32",
      "C56"
    ),
    
    list(
      c("32.4", "32.5", "32.6"),
      c("45.7", "48.4", "48.5", "48.6"),
      c("43.5", "43.6", "43.7", "43.9"),
      c("85.4", "85.21"),
      c("50.2", "50.3", "50.4", "50.5"),
      c("42.5", "42.6"),
      c("52.5", "52.7"),
      "57.7",
      c("55.3", "55.5"),
      c("40.59", "65.6", "67.2", "68.4"),
      str_c("06.", 2:5),
      c("30.3", "30.4"),
      c("65.6", "40.59")
    )
  ) # the last disease(13) do not need to compute, Cheng gives it to me, 4 cases

mydata %>%
  dplyr::filter(P7 >= 18) %>%
  mymulfilter(nm = diagnosis, cond_strs = "C61" %>% toreg, keep = T) %>%
  mymulfilter(nm = op, cond_strs = "60.5" %>% toreg, keep = T) %>%
  mysum

# item 13
mydata2 %>%
  filter(P3 %in% c("00119459", "00116203", "00122887", "00114805")) # no such cases

你可能感兴趣的:(病案首页汇总数据_15Sep2020)