第 9 章 临床医学统计流
9.1 COMMON.R
9.1.1 加载R包和配置全局环境
## 1 加载R包和配置全局环境:----
install.packages("pacman")
library(pacman)
::p_load(purrr,tidyverse,scales,dplyr,flextable,gt,gtsummary,rstatix,forcats,janitor,table1,tableone,arrow,
pacman
data.table,datacleanr,stringr,tidyr,lubridate,openxlsx)# Global options
options(scipen = 1000)
options(scipen = 1, digits = 2)
options(encoding = 'UTF-8')
# getOption("digits")
Sys.setlocale("LC_ALL","Chinese")
9.1.3 Table of content
<- fp_par(text.align = "center",padding = 10)
center_par <- shortcuts$fp_bold(font.size = 20)
bold_face <- fpar(
toc ftext("目 录", prop = bold_face ),
fp_p = center_par)
9.1.4 Cover template -封面
## 基于officer包生成项目封面:
<- fp_text(font.size = 25, bold = TRUE,
bold_face1 font.family = "Times New Roma")
<- fp_text(font.size = 20, font.family = "宋体")
bold_face2
<- fpar(
cover
run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(),
# logo
external_img(here("figures", "palan.png"),
height = 1.045, width = 4.975),
run_linebreak(), run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(), run_linebreak(),
#TITLE
ftext("...",
prop = bold_face1),
run_linebreak(), run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(), run_linebreak(), run_linebreak(),
run_linebreak(), run_linebreak(), run_linebreak(),
# AUTHOR AND DATE
ftext("athorA", prop = bold_face2),
run_linebreak(),
ftext("athorB", prop = bold_face2),
run_linebreak(),
ftext("athorC", prop = bold_face2),
run_linebreak(), run_linebreak(),
ftext(paste("日期:",Sys.Date(),sep = ""),
prop = bold_face2),
fp_p = fp_par(text.align = "center"))
9.1.5 快捷辅助函数:
= function(data){
vi View(data)}
# vi(iris)
## 2.1.2 计算唯一患者人数:
= function(data){
pe return(n_distinct(data$patient_id))
}
## 2.1.3 转换格式:num/cha/fac/dat
## 需要注意as_date默认的时间设置时分秒为早晨八点;
= function(data,lab,funss){
tr if(funss == "num"){
return(
%>% mutate(across(.cols = lab, .fns = as.numeric)))
data else if(funss == "dat"){
}return(
%>% mutate(across(.cols = lab, .fns = as_date)))
data else if(funss == "cha"){
}return(
%>% mutate(across(.cols = lab, .fns = as.character)))
data else if(funss == "fac"){
}return(
%>% mutate(across(.cols = lab, .fns = as.factor)))
data else{
}return("input function fasle")
}
}
# iris %>% tr(.,"Sepal.Length","cha")
## 2.1.4 查询文件名:
= function(data){return(names(data))}
na
## 2.1.5转化频次
= function(data){
fr = as.numeric(data)
data return(
paste0(round(data*100,2),"%",sep="")
)
}# fr(5)
## 2.1.6 unique():
= function(data){return(unique(data))}
un
## 2.1.7 计算平均值和标准差:
## 需要注意是全局已经设置小数为2位;
## data;输入数据,可适配tidyvrse:
## select1: 输入数据,并提取指定行计算平均值和标准差:
= function(data,select1){
mean_sd %>% select({{select1}}) %>%
data summarise(mean = mean({{select1}}),
sd = sd({{select1}})) %>%
mutate(across(where(is.numeric),~ round(.,2)))
}# iris %>% ms(.,Sepal.Length) %>% as.data.frame()
= function(data,select1){
mean_sd_union = data %>% select({{select1}}) %>%
t1 summarise(mean = mean({{select1}}),
sd = sd({{select1}})) %>%
mutate(across(where(is.numeric),~ round(.,2)))
return(paste0(t1$mean,"(",t1$sd,")"))
}
## 2.1.8 人数比例:
## 分母:denomination:den
= function(data,den){
pop_percent =list()
out $pat = pe(data)
out$per = fr(pe(data)/pe(den))
outreturn(out)
}
= function(data,den){
pop_per_union = paste0(pe(data),"(",fr(pe(data)/pe(den)),")")
union_t return(union_t)
}## prec_bp %>% pp(.,pe(prec))
## 2.1.9 round(2):
= function(data,n=2){
round2 %>%
data mutate(across(where(is.numeric),~ round(.,n)))
}
## 2.1.10 统计visit_id的数量:
= function(data){
vis %>% select(visit_id) %>% n_distinct()
data }
9.1.6 快捷统计函数:
## 2.1.7 计算平均值和标准差:
## 需要注意是全局已经设置小数为2位;
## data;输入数据,可适配tidyvrse:
## select1: 输入数据,并提取指定行计算平均值和标准差:
= function(data,select1){
mean_sd %>% select({{select1}}) %>%
data summarise(mean = mean({{select1}}),
sd = sd({{select1}})) %>%
mutate(across(where(is.numeric),~ round(.,2)))
}# iris %>% ms(.,Sepal.Length) %>% as.data.frame()
## 将平均值和标准差以括号进行合并:
= function(data,select1){
mean_sd_union = data %>% select({{select1}}) %>%
t1 summarise(mean = mean({{select1}}),
sd = sd({{select1}})) %>%
mutate(across(where(is.numeric),~ round(.,2)))
return(paste0(t1$mean,"(",t1$sd,")"))}
## 2.1.8 人数比例:
## 分母:denomination:den
= function(data,den){
pop_percent =list()
out $pat = pe(data)
out$per = fr(pe(data)/pe(den))
outreturn(out)
}
= function(data,den){
pop_per_union = paste0(pe(data),"(",fr(pe(data)/pe(den)),")")
union_t return(union_t)
}## prec_bp %>% pp(.,pe(prec))
9.1.7 描述性分析统计函数
9.1.7.1 年龄计算
## birthDate: 出生日期:
## refDate: 默认为当前时间,项目需要index_date:
## 输出患者年龄:
<- function(birthDate, refDate = Sys.Date(), unit = "year") {
calc_age require(lubridate)
if (grepl(x = unit, pattern = "year")) {
as.period(interval(birthDate, refDate), unit = 'year')$year
else if (grepl(x = unit, pattern = "month")) {
} as.period(interval(birthDate, refDate), unit = 'month')$month
else if (grepl(x = unit, pattern = "week")) {
} floor(as.period(interval(birthDate, refDate), unit = 'day')$day / 7)
else if (grepl(x = unit, pattern = "day")) {
} as.period(interval(birthDate, refDate), unit = 'day')$day
else {
} print("Argument 'unit' must be one of 'year', 'month', 'week', or 'day'")
NA }}
9.2 Rawdata cleaning pipeline
## 常用线性清理范式(1)
<- linelist_raw %>%
linelist # standardize column name syntax
::clean_names() %>%
janitor# define class
mutate(across(contains("date"), as.Date),
generation = as.numeric(generation),
age = as.numeric(age)) %>%
# mutation
mutate(days_onset_hosp = as.numeric(date_hospitalisation - date_onset)) %>%
mutate(hospital = recode(hospital,
"Mitylira Hopital" = "Military Hospital",
"other" = "Other")) %>%
mutate(hospital = replace_na(hospital, "Missing")) %>%
mutate(age_years = case_when(
== "years" ~ age,
age_unit == "months" ~ age/12,
age_unit is.na(age_unit) ~ age,
TRUE ~ NA_real_)) %>%
mutate(
age_cat = epikit::age_categories(age_years, breakers = c(0, 5, 10, 15, 20, 30, 50, 70))) %>%
9.3 target_groups
9.3.1 加载集成数据集
9.3.1.1 Vroom
library(here)
source(here("R", "common.R"), encoding = "utf-8")
<- fs::dir_ls(here("data", "preprocess", "patient_clean"),
patient_filepath glob = "*/*.csv")
<- fs::dir_ls(here("data", "preprocess", "diagnosis_clean"),
diagnosis_filepath glob = "*/*.csv")
<- vroom(patient_filepath) %>%
patient select(-"...1") %>%
distinct()
<- vroom(diagnosis_filepath) %>%
diagnosis select(-"...1") %>%
distinct()
9.3.2 保存数据集
Make sure only patient_id, visit_id and provider_id are kept in the end.
<- diagnosis %>%
overall_target filter(patient_type == "住院患者") %>%
filter(!sub_group == "other")
<- overall_target %>%
target_group_1 filter(sub_group == "非内分泌科就诊") %>%
left_join(select(visit, patient_id, visit_id, admission_datetime)) %>%
group_by(patient_id) %>%
arrange(admission_datetime) %>%
slice_head() %>%
ungroup() %>%
select(patient_id, visit_id, provider_id) %>%
distinct()
9.3.3 患者平衡匹配前后信息比较
This part is for comparing the differences in baseline characteristics of the dataset after using PSM. Make sure only baseline characteristics are kept in the end.
<- CreateTableOne(vars = c('年龄', '年龄分组', '性别',
table_psm_before '医院', '医保', '医院等级'),
data = data_psm_before,
strata = 'strata',
factorVars = c('年龄分组', '医保', '性别',
'医院等级'),
smd = TRUE
)
<- print(
table_psm_before
table_psm_before,printToggle = FALSE,
noSpaces = TRUE,
smd = TRUE # add smd
)
<- CreateTableOne(vars = c('年龄', '年龄分组', '性别',
table_psm_after '医院', '医保', '医院等级'),
data = data_psm_after,
strata = 'strata',
factorVars = c('年龄分组', '医保', '性别',
'医院等级'),
smd = TRUE
)
<- print(
table_psm_after
table_psm_before,printToggle = FALSE,
noSpaces = TRUE,
smd = TRUE # add smd
)
9.4 dataset_for_analysis
Here is an example:
library(here)
source(here("R", "prepare and analyse data", "lan", "target_groups.R"), encoding = "UTF-8")
<- group_endpoint3 %>%
data_endpoint3 left_join(patient) %>%
left_join(
select(visit, patient_id, visit_id, insurance_type)
%>%
) left_join(
select(lab, patient_id, visit_id, if_ctn, ctn, if_ckmb, ckmb, if_ck, ck)
%>%
) left_join(
select(diagnosis, patient_id, visit_id, "diag1", "diag1", "diag1",
"diag1", "diag1", "diag1", "diag1")
%>%
) left_join(
select(prescribing, patient_id, visit_id, "diag1", "diag1", "diag1",
"diag1", "diag1(ACEI)",
"diag1(ARB)", start_kangban, end_kangban,
kangban)%>%
) distinct() %>%
mutate_at(vars(if_ctn, if_ckmb, if_ck), ~ replace_na(., 0))
<- rbind(subgroup, overall) %>%
secondary5 apply_labels(fbg_diff = "空腹血糖变化差值",
fbg_reg = "末次空腹血糖<6.1m/mol",
age = "年龄",
sex = "性别",
insurance_type = "医疗保险",
region = "就诊地区",
cad = "冠心病",
hbp = "高血压",
stroke = "卒中",
ckd = "慢性肾功能不全",
disease_num = "合并疾病数目",
group_1 = "基础vs.预混",
treatment_plan = "糖尿病用药方案",
inpatient_time = "住院时长",
total_fee = "总花费",
trt_fee = "治疗费",
service_fee = "服务费",
sub_group = "亚组"
)
write_dataset(
data_endpoint3, here("data", "analysis", "lan", "endpoint3"),
format = "parquet")
9.5 final_analysis
此部分主要用于深层进阶分析 ## present 使用rmarkdown批量输出
---
output:
officedown::rdocx_document
---
::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
knitrlibrary(here)
source(here("R", "prepare and analyse data", "lan", "endpoint1.R"), encoding = "utf-8")
cover
\newpage
使用其进行换页
tocblock_toc()
endpoint one
%>%
table_1 as_flex_table() %>%
set_table_properties(layout = "autofit")
endpoint two
%>%
table_3 as_flex_table() %>%
set_table_properties(layout = "autofit")
endpoint three
%>%
table_4 as_flex_table() %>%
set_table_properties(layout = "autofit")