第 9 章 临床医学统计流
9.1 COMMON.R
9.1.1 加载R包和配置全局环境
## 1 加载R包和配置全局环境:----
install.packages("pacman")
library(pacman)
pacman::p_load(purrr,tidyverse,scales,dplyr,flextable,gt,gtsummary,rstatix,forcats,janitor,table1,tableone,arrow,
data.table,datacleanr,stringr,tidyr,lubridate,openxlsx)
# Global options
options(scipen = 1000)
options(scipen = 1, digits = 2)
options(encoding = 'UTF-8')
# getOption("digits")
Sys.setlocale("LC_ALL","Chinese")9.1.3 Table of content
center_par <- fp_par(text.align = "center",padding = 10)
bold_face <- shortcuts$fp_bold(font.size = 20)
toc <- fpar(
ftext("目 录", prop = bold_face ),
fp_p = center_par)9.1.4 Cover template -封面
## 基于officer包生成项目封面:
bold_face1 <- fp_text(font.size = 25, bold = TRUE,
font.family = "Times New Roma")
bold_face2 <- fp_text(font.size = 20, font.family = "宋体")
cover <- fpar(
run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(),
# logo
external_img(here("figures", "palan.png"),
height = 1.045, width = 4.975),
run_linebreak(), run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(), run_linebreak(),
#TITLE
ftext("...",
prop = bold_face1),
run_linebreak(), run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(), run_linebreak(),
# AUTHOR AND DATE
ftext("athorA", prop = bold_face2),
run_linebreak(),
ftext("athorB", prop = bold_face2),
run_linebreak(),
ftext("athorC", prop = bold_face2),
run_linebreak(), run_linebreak(),
ftext(paste("日期:",Sys.Date(),sep = ""),
prop = bold_face2),
fp_p = fp_par(text.align = "center"))9.1.5 快捷辅助函数:
vi = function(data){
View(data)}
# vi(iris)
## 2.1.2 计算唯一患者人数:
pe = function(data){
return(n_distinct(data$patient_id))
}
## 2.1.3 转换格式:num/cha/fac/dat
## 需要注意as_date默认的时间设置时分秒为早晨八点;
tr = function(data,lab,funss){
if(funss == "num"){
return(
data %>% mutate(across(.cols = lab, .fns = as.numeric)))
}else if(funss == "dat"){
return(
data %>% mutate(across(.cols = lab, .fns = as_date)))
}else if(funss == "cha"){
return(
data %>% mutate(across(.cols = lab, .fns = as.character)))
}else if(funss == "fac"){
return(
data %>% mutate(across(.cols = lab, .fns = as.factor)))
}else{
return("input function fasle")
}
}
# iris %>% tr(.,"Sepal.Length","cha")
## 2.1.4 查询文件名:
na = function(data){return(names(data))}
## 2.1.5转化频次
fr = function(data){
data = as.numeric(data)
return(
paste0(round(data*100,2),"%",sep="")
)
}
# fr(5)
## 2.1.6 unique():
un = function(data){return(unique(data))}
## 2.1.7 计算平均值和标准差:
## 需要注意是全局已经设置小数为2位;
## data;输入数据,可适配tidyvrse:
## select1: 输入数据,并提取指定行计算平均值和标准差:
mean_sd = function(data,select1){
data %>% select({{select1}}) %>%
summarise(mean = mean({{select1}}),
sd = sd({{select1}})) %>%
mutate(across(where(is.numeric),~ round(.,2)))
}
# iris %>% ms(.,Sepal.Length) %>% as.data.frame()
mean_sd_union = function(data,select1){
t1 = data %>% select({{select1}}) %>%
summarise(mean = mean({{select1}}),
sd = sd({{select1}})) %>%
mutate(across(where(is.numeric),~ round(.,2)))
return(paste0(t1$mean,"(",t1$sd,")"))
}
## 2.1.8 人数比例:
## 分母:denomination:den
pop_percent = function(data,den){
out =list()
out$pat = pe(data)
out$per = fr(pe(data)/pe(den))
return(out)
}
pop_per_union = function(data,den){
union_t = paste0(pe(data),"(",fr(pe(data)/pe(den)),")")
return(union_t)
}
## prec_bp %>% pp(.,pe(prec))
## 2.1.9 round(2):
round2 = function(data,n=2){
data %>%
mutate(across(where(is.numeric),~ round(.,n)))
}
## 2.1.10 统计visit_id的数量:
vis = function(data){
data %>% select(visit_id) %>% n_distinct()
}9.1.6 快捷统计函数:
## 2.1.7 计算平均值和标准差:
## 需要注意是全局已经设置小数为2位;
## data;输入数据,可适配tidyvrse:
## select1: 输入数据,并提取指定行计算平均值和标准差:
mean_sd = function(data,select1){
data %>% select({{select1}}) %>%
summarise(mean = mean({{select1}}),
sd = sd({{select1}})) %>%
mutate(across(where(is.numeric),~ round(.,2)))
}
# iris %>% ms(.,Sepal.Length) %>% as.data.frame()
## 将平均值和标准差以括号进行合并:
mean_sd_union = function(data,select1){
t1 = data %>% select({{select1}}) %>%
summarise(mean = mean({{select1}}),
sd = sd({{select1}})) %>%
mutate(across(where(is.numeric),~ round(.,2)))
return(paste0(t1$mean,"(",t1$sd,")"))}
## 2.1.8 人数比例:
## 分母:denomination:den
pop_percent = function(data,den){
out =list()
out$pat = pe(data)
out$per = fr(pe(data)/pe(den))
return(out)
}
pop_per_union = function(data,den){
union_t = paste0(pe(data),"(",fr(pe(data)/pe(den)),")")
return(union_t)
}
## prec_bp %>% pp(.,pe(prec))9.1.7 描述性分析统计函数
9.1.7.1 年龄计算
## birthDate: 出生日期:
## refDate: 默认为当前时间,项目需要index_date:
## 输出患者年龄:
calc_age <- function(birthDate, refDate = Sys.Date(), unit = "year") {
require(lubridate)
if (grepl(x = unit, pattern = "year")) {
as.period(interval(birthDate, refDate), unit = 'year')$year
} else if (grepl(x = unit, pattern = "month")) {
as.period(interval(birthDate, refDate), unit = 'month')$month
} else if (grepl(x = unit, pattern = "week")) {
floor(as.period(interval(birthDate, refDate), unit = 'day')$day / 7)
} else if (grepl(x = unit, pattern = "day")) {
as.period(interval(birthDate, refDate), unit = 'day')$day
} else {
print("Argument 'unit' must be one of 'year', 'month', 'week', or 'day'")
NA }}9.2 Rawdata cleaning pipeline
## 常用线性清理范式(1)
linelist <- linelist_raw %>%
# standardize column name syntax
janitor::clean_names() %>%
# define class
mutate(across(contains("date"), as.Date),
generation = as.numeric(generation),
age = as.numeric(age)) %>%
# mutation
mutate(days_onset_hosp = as.numeric(date_hospitalisation - date_onset)) %>%
mutate(hospital = recode(hospital,
"Mitylira Hopital" = "Military Hospital",
"other" = "Other")) %>%
mutate(hospital = replace_na(hospital, "Missing")) %>%
mutate(age_years = case_when(
age_unit == "years" ~ age,
age_unit == "months" ~ age/12,
is.na(age_unit) ~ age,
TRUE ~ NA_real_)) %>%
mutate(
age_cat = epikit::age_categories(age_years, breakers = c(0, 5, 10, 15, 20, 30, 50, 70))) %>% 9.3 target_groups
9.3.1 加载集成数据集
9.3.1.1 Vroom
library(here)
source(here("R", "common.R"), encoding = "utf-8")
patient_filepath <- fs::dir_ls(here("data", "preprocess", "patient_clean"),
glob = "*/*.csv")
diagnosis_filepath <- fs::dir_ls(here("data", "preprocess", "diagnosis_clean"),
glob = "*/*.csv")
patient <- vroom(patient_filepath) %>%
select(-"...1") %>%
distinct()
diagnosis <- vroom(diagnosis_filepath) %>%
select(-"...1") %>%
distinct()
9.3.2 保存数据集
Make sure only patient_id, visit_id and provider_id are kept in the end.
overall_target <- diagnosis %>%
filter(patient_type == "住院患者") %>%
filter(!sub_group == "other")
target_group_1 <- overall_target %>%
filter(sub_group == "非内分泌科就诊") %>%
left_join(select(visit, patient_id, visit_id, admission_datetime)) %>%
group_by(patient_id) %>%
arrange(admission_datetime) %>%
slice_head() %>%
ungroup() %>%
select(patient_id, visit_id, provider_id) %>%
distinct()
9.3.3 患者平衡匹配前后信息比较
This part is for comparing the differences in baseline characteristics of the dataset after using PSM. Make sure only baseline characteristics are kept in the end.
table_psm_before <- CreateTableOne(vars = c('年龄', '年龄分组', '性别',
'医院', '医保', '医院等级'),
data = data_psm_before,
strata = 'strata',
factorVars = c('年龄分组', '医保', '性别',
'医院等级'),
smd = TRUE
)
table_psm_before <- print(
table_psm_before,
printToggle = FALSE,
noSpaces = TRUE,
smd = TRUE # add smd
)
table_psm_after <- CreateTableOne(vars = c('年龄', '年龄分组', '性别',
'医院', '医保', '医院等级'),
data = data_psm_after,
strata = 'strata',
factorVars = c('年龄分组', '医保', '性别',
'医院等级'),
smd = TRUE
)
table_psm_after <- print(
table_psm_before,
printToggle = FALSE,
noSpaces = TRUE,
smd = TRUE # add smd
)9.4 dataset_for_analysis
Here is an example:
library(here)
source(here("R", "prepare and analyse data", "lan", "target_groups.R"), encoding = "UTF-8")
data_endpoint3 <- group_endpoint3 %>%
left_join(patient) %>%
left_join(
select(visit, patient_id, visit_id, insurance_type)
) %>%
left_join(
select(lab, patient_id, visit_id, if_ctn, ctn, if_ckmb, ckmb, if_ck, ck)
) %>%
left_join(
select(diagnosis, patient_id, visit_id, "diag1", "diag1", "diag1",
"diag1", "diag1", "diag1", "diag1")
) %>%
left_join(
select(prescribing, patient_id, visit_id, "diag1", "diag1", "diag1",
"diag1", "diag1(ACEI)",
"diag1(ARB)", start_kangban, end_kangban,
kangban)
) %>%
distinct() %>%
mutate_at(vars(if_ctn, if_ckmb, if_ck), ~ replace_na(., 0))
secondary5 <- rbind(subgroup, overall) %>%
apply_labels(fbg_diff = "空腹血糖变化差值",
fbg_reg = "末次空腹血糖<6.1m/mol",
age = "年龄",
sex = "性别",
insurance_type = "医疗保险",
region = "就诊地区",
cad = "冠心病",
hbp = "高血压",
stroke = "卒中",
ckd = "慢性肾功能不全",
disease_num = "合并疾病数目",
group_1 = "基础vs.预混",
treatment_plan = "糖尿病用药方案",
inpatient_time = "住院时长",
total_fee = "总花费",
trt_fee = "治疗费",
service_fee = "服务费",
sub_group = "亚组"
)
write_dataset(
data_endpoint3,
here("data", "analysis", "lan", "endpoint3"),
format = "parquet")9.5 final_analysis
此部分主要用于深层进阶分析 ## present 使用rmarkdown批量输出
---
output:
officedown::rdocx_document
---
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
library(here)
source(here("R", "prepare and analyse data", "lan", "endpoint1.R"), encoding = "utf-8")cover\newpage使用其进行换页
toc
block_toc()endpoint one
table_1 %>%
as_flex_table() %>%
set_table_properties(layout = "autofit")endpoint two
table_3 %>%
as_flex_table() %>%
set_table_properties(layout = "autofit")endpoint three
table_4 %>%
as_flex_table() %>%
set_table_properties(layout = "autofit")