R语言完美实现PSM算法,完成医学倾向性匹配需求,附完整代码

#PSM new
library(MatchIt)
setwd("C:/Users/jack/Desktop/mission/psm/1")
library(readxl)
plan1 <- as.data.frame(read_excel("plan1.xlsx",col_types = c("text","text","text","numeric","numeric","numeric","numeric","numeric","numeric",'text')))
plan1$ITEM_NAME[plan1$ITEM_NAME=='国产']<-1#国产是1,进口是0
plan1$ITEM_NAME[plan1$ITEM_NAME=='进口']<-0#国产是1,进口是0

#去重到人次
plan1<-plan1[!duplicated(plan1),]
library(MatchIt)
library(dplyr)
library(ggplot2)

#1 Pre-analysis using non-matched data
#1.1 Difference-in-means: outcome variable:出院方式
chisq.test(rbind(table(plan1$出院方式[plan1$ITEM_NAME==1]),table(plan1$出院方式[plan1$ITEM_NAME==0])))#p-value = 0.03613

#1.2 Difference-in-means: pre-treatment covariates
ecls_cov <- c('AGE', '生化-肾功能里的肌酐(用药前)', '生化-肝功能里的谷丙转氨酶(用药前)', '生化-肝功能里的谷草转氨酶(用药前)', '生化-肝功能里的总胆红素(用药前)' , '生化-肝功能里的直接胆红素(用药前)')
result0<-as.data.frame(
plan1 %>%
  group_by(ITEM_NAME) %>%
  select(one_of(ecls_cov)) %>%
  summarise_all(funs(mean(., na.rm = T)))
)#连续变量的均值比较

tableGrob(result0)

chisq.test(rbind(table(plan1$SEX[plan1$ITEM_NAME==1]),table(plan1$SEX[plan1$ITEM_NAME==0])))
#组间男女比例差异:p-value = 0.1731

lapply(ecls_cov, function(v) {
  t.test(plan1[, v] ~ plan1[, 'ITEM_NAME'])
})#连续变量的组间t检验,只有age显著差异

#before matching
library(pacman)
plan1$ITEM_NAME
pacman::p_load(tableone) 
table1 <- CreateTableOne(vars = ecls_cov, data = plan1, strata = 'ITEM_NAME') 
table1 <- print(table1, printToggle = FALSE, noSpaces = TRUE) 
knitr::kable(table1[,1:3], align = 'c', caption = 'Comparison of unmatched samples') 

#after matching
library(pacman)
plan1$ITEM_NAME
pacman::p_load(tableone) 
table1 <- CreateTableOne(vars = ecls_cov, data = data.match2, strata = 'ITEM_NAME') 
table1 <- print(table1, printToggle = FALSE, noSpaces = TRUE) 
knitr::kable(table1[,1:3], align = 'c', caption = 'Comparison of unmatched samples') 

 


#----------------------------------------------------------------------------------------------------------
#2 Propensity score estimation
m_ps <- glm(ITEM_NAME ~ SEX + AGE + `生化-肾功能里的肌酐(用药前)` + `生化-肝功能里的谷丙转氨酶(用药前)` + `生化-肝功能里的谷草转氨酶(用药前)` + `生化-肝功能里的总胆红素(用药前)` + `生化-肝功能里的直接胆红素(用药前)`,
            family = binomial(), data = plan1)
summary(m_ps)#logistic model to calculate the propensity score

prs_df <- data.frame(pr_score = predict(m_ps, type = "response"),
                     group = plan1$ITEM_NAME)
head(prs_df)

#2.1 Examining the region of common support
labs <- paste("Actual medicine type:", c("local", "foreign"))
prs_df %>%
  mutate(group = ifelse(group == 0, labs[1], labs[2])) %>%
  ggplot(aes(x = pr_score)) +
  geom_histogram(color = "white") +
  facet_wrap(~group) +
  xlab("Probability of medicine type") +
  theme_bw()

dev.off()

#----------------------------------------------------------------------------------------------------------
#3 Executing a matching algorithm
plan11<-plan1[complete.cases(plan1),]
match.it1 <- matchit(ITEM_NAME ~ SEX + AGE + `生化-肾功能里的肌酐(用药前)` + `生化-肝功能里的谷丙转氨酶(用药前)` + `生化-肝功能里的谷草转氨酶(用药前)` + `生化-肝功能里的总胆红素(用药前)` + `生化-肝功能里的直接胆红素(用药前)`, data = plan11, method="nearest", ratio=3,reestimate='T',discard = 'both')
data.match1 <- match.data(match.it1)#matched data

#----------------------------------------------------------------------------------------------------------
#4 Examining covariate balance in the matched sample
as.data.frame(
  data.match1 %>%
    group_by(ITEM_NAME) %>%
    select(one_of(ecls_cov)) %>%
    summarise_all(funs(mean(., na.rm = T)))
)#连续变量的均值比较

lapply(ecls_cov, function(v) {
  t.test(data.match1[, v] ~ data.match1$ITEM_NAME)
})#匹配后age,生化-肝功能里的总胆红素(用药前)显著差异


#--------------------------------------------------------------------------------------------------------
#5 Estimating treatment effects
chisq.test(rbind(table(data.match1$出院方式[data.match1$ITEM_NAME==1]),table(data.match1$出院方式[data.match1$ITEM_NAME==0])))
#匹配后:疗效不存在显著差异 p-value = 0.1015

a <- summary(match.it1)
knitr::kable(a$nn, digits = 2, align = 'c', 
             caption = 'Sample sizes')
knitr::kable(a$sum.matched[c(1,2,4)], digits = 2, align = 'c', 
             caption = 'Summary of balance for matched data')

m_ps <- glm(ITEM_NAME ~ SEX + AGE + `生化-肾功能里的肌酐(用药前)` + `生化-肝功能里的谷丙转氨酶(用药前)` + `生化-肝功能里的谷草转氨酶(用药前)` + `生化-肝功能里的总胆红素(用药前)` + `生化-肝功能里的直接胆红素(用药前)`,
            family = binomial(), data = data.match1)
summary(m_ps)#logistic model to calculate the propensity score


#----------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------------------
##plan2
plan2 <- as.data.frame(read_excel("plan2.xlsx",col_types = c("text","text","text","numeric","numeric","numeric","numeric","numeric","numeric",'text','text')))#国产是0,进口是1
plan2$ITEM_NAME[plan2$ITEM_NAME=='国产']<-1#国产是1,进口是0
plan2$ITEM_NAME[plan2$ITEM_NAME=='进口']<-0#国产是1,进口是0
#去重到人次
plan2<-plan2[!duplicated(plan2),]

 

#1 Pre-analysis using non-matched data
#与plan1一致
#菌种名
chisq.test(rbind(table(plan2$菌种名[plan2$ITEM_NAME==1]),table(plan2$菌种名[plan2$ITEM_NAME==0])))#p-value < 2.2e-16

#matching
plan22<-plan2[complete.cases(plan2),]
match.it2 <- matchit(ITEM_NAME ~ SEX + AGE + `生化-肾功能里的肌酐(用药前)` + `生化-肝功能里的谷丙转氨酶(用药前)` + `生化-肝功能里的谷草转氨酶(用药前)` + `生化-肝功能里的总胆红素(用药前)` + `生化-肝功能里的直接胆红素(用药前)` + `菌种名`, data = plan22, method="nearest", ratio=3)
data.match2 <- match.data(match.it2)#matched data
chisq.test(rbind(table(plan22$菌种名[plan22$ITEM_NAME==0]),table(plan22$菌种名[plan22$ITEM_NAME==1])))
chisq.test(rbind(table(data.match2$菌种名[data.match2$ITEM_NAME==0]),table(data.match2$菌种名[data.match2$ITEM_NAME==1])))


####匹配前后对比:
a <- summary(match.it2)
knitr::kable(a$nn, digits = 2, align = 'c', 
             caption = 'Sample sizes')
knitr::kable(a$sum.matched[c(1,2,4)], digits = 2, align = 'c', 
             caption = 'Summary of balance for matched data')

plot(match.it2, type = 'jitter', interactive = FALSE)
#匹配后:疗效不存在显著差异 p-value = 0.07435

as.data.frame(
  data.match2 %>%
    group_by(ITEM_NAME) %>%
    select(one_of(ecls_cov)) %>%
    summarise_all(funs(mean(., na.rm = T)))
)
lapply(ecls_cov, function(v) {
  t.test(data.match2[, v] ~ data.match2$ITEM_NAME)
})
chisq.test(rbind(table(data.match2$菌种名[data.match2$ITEM_NAME==1]),table(data.match2$菌种名[data.match2$ITEM_NAME==0])))#p-value < 2.2e-16

 

#----------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------------------
##plan3
plan3 <- as.data.frame(read_excel("plan3.xlsx",col_types = c("text","text","text","numeric","numeric","numeric","numeric","numeric","numeric",'text','text','numeric')))#国产是0,进口是1
plan3$ITEM_NAME[plan3$ITEM_NAME=='国产']<-1#国产是1,进口是0
plan3$ITEM_NAME[plan3$ITEM_NAME=='进口']<-0#国产是1,进口是0
plan3 <- plan3[!duplicated(plan3),]

plan33<-plan3[complete.cases(plan3) & plan3$combination==2,]
match.it3 <- matchit(ITEM_NAME ~ SEX + AGE + `生化-肾功能里的肌酐(用药前)` + `生化-肝功能里的谷丙转氨酶(用药前)` + `生化-肝功能里的谷草转氨酶(用药前)` + `生化-肝功能里的总胆红素(用药前)` + `生化-肝功能里的直接胆红素(用药前)` + `菌种名`, data = plan33, method="nearest", ratio=3)
data.match3 <- match.data(match.it3)#matched data

a <- summary(match.it3)
knitr::kable(a$nn, digits = 2, align = 'c', 
             caption = 'Sample sizes')
knitr::kable(a$sum.matched[c(1,2,4)], digits = 2, align = 'c', 
             caption = 'Summary of balance for matched data')
chisq.test(rbind(table(data.match3$出院方式[data.match3$ITEM_NAME==1]),table(data.match2$出院方式[data.match2$ITEM_NAME==0])))
#p-value = 1


as.data.frame(
  data.match3 %>%
    group_by(ITEM_NAME) %>%
    select(one_of(ecls_cov)) %>%
    summarise_all(funs(mean(., na.rm = T)))
)
lapply(ecls_cov, function(v) {
  t.test(data.match3[, v] ~ data.match3$ITEM_NAME)
})
 

你可能感兴趣的:(R语言完美实现PSM算法,完成医学倾向性匹配需求,附完整代码)