背景
TCGA临床数据直接用R代码画出三线表。

使用场景
输入数据是临床样品信息,例如TCGA提供的clinical information。
根据年龄、性别、种族、病毒感染、吸烟史等特征进行分类对比,可能发现疾病、癌症发病、转移、耐药跟这些特征的关系,例如发现亚洲非吸烟女性肺癌发病率高。
下载TCGA临床数据
或者从 TCGA 泛癌分析 提供的页面中,下载临床数据
https://gdc.cancer.gov/about-data/publications/pancanatlas
再用excel筛选LIHC的数据保存为“TCGA_LIHC_377.xlsx”
按列分成4组
即 Alive
、Dead with tumor
、Dead tumor free
、Total
。
library(dplyr)
library(openxlsx)
x = read.xlsx("TCGA_LIHC_377.xlsx")
#%>%,是管道操作符,将上一个函数的输出作为下一个函数的输入
#其实用for循环一个一个嵌套也能实现,就是写起来太烦
#用dplyr包的管道操作符,无论读写都变得清晰容易了
x$tumor_status
#挑出生存状态是“Alive”的行
x1 = x %>% filter(vital_status == "Alive") # 245
#挑出生存状态是“Dead”并且有tumor的行
x2 = x %>% filter(vital_status == "Dead" & tumor_status == "WITH TUMOR") # 80
x3 = x %>% filter(vital_status == "Dead" & tumor_status == "TUMOR FREE") # 43
#把没信息的行去掉,用于算总数
xx = x %>% filter(vital_status != "" & tumor_status != "") # 358
按项细分
性别
#还记得前面提取的x1是生存状态是“Alive”的行,把这些行按性别分组
s1 = x1 %>% group_by(gender) %>% summarise(alive=n())
s1
# # A tibble: 2 × 2
# gender alive
# <chr> <int>
# 1 FEMALE 71
# 2 MALE 174
s2 = x2 %>% group_by(gender) %>% summarise(tumor=n())
s2
# # A tibble: 2 × 2
# gender tumor
# <chr> <int>
# 1 FEMALE 31
# 2 MALE 49
s3 = x3 %>% group_by(gender) %>% summarise(tumor_free=n())
s3
# # A tibble: 2 × 2
# gender tumor_free
# <chr> <int>
# 1 FEMALE 17
# 2 MALE 26
#依次合并s1、s2、s3
sex = full_join(s1, s2, by='gender') %>% full_join(s3, by='gender') %>% as.data.frame
sex
# gender alive tumor tumor_free
# 1 FEMALE 71 31 17
# 2 MALE 174 49 26
rn = sex[,1]#取出第一列,后面作为行名
sex = sex[,-1]#先把不需要计算的第一列删除
#计算p value
sex.p = chisq.test(sex)$p.value
print(sex.p)
#算总数
sex$total = rowSums(sex)
cs = colSums(sex)
#算百分比、写到括号里
#用sprintf格式化为小数点后保留1位,加上百分号
#用paste0加到个数后面,默认是sep="",比paste代码更简洁
sex <- rbind(paste0(sex[1,], " (", sprintf("%1.1f\\%%", sex[1,]/cs*100), ")"),
paste0(sex[2,], " (", sprintf("%1.1f\\%%", sex[2,]/cs*100), ")"))
#加上行名、列名
rownames(sex) = rn
colnames(sex) = paste0(c("Alive", "Dead with tumor", "Dead tumor free", "Total"),
"\n(n=", cs, ")")
print(sex)
# Alive\n(n=245) Dead with tumor\n(n=80) Dead tumor free\n(n=43)
# FEMALE "71 (29.0\\%)" "31 (38.8\\%)" "17 (39.5\\%)"
# MALE "174 (71.0\\%)" "49 (61.3\\%)" "26 (60.5\\%)"
# Total\n(n=368)
# FEMALE "119 (32.3\\%)"
# MALE "249 (67.7\\%)"
年龄
#先写个函数,用到dplyr包
#有些年龄数据缺失,需要去掉
age_stats <- function(x) {
res <- x %>% summarise(age = round(mean(age_at_initial_pathologic_diagnosis,na.rm = T), 1),
sd=round(sd(age_at_initial_pathologic_diagnosis,na.rm = T), 1),
median=round(median(age_at_initial_pathologic_diagnosis,na.rm = T), 1),
min=round(min(age_at_initial_pathologic_diagnosis,na.rm = T), 1),
max=round(max(age_at_initial_pathologic_diagnosis,na.rm = T), 1)
)
c("Mean (SD)" = with(res, paste0(age, " (", sd, ")")),
"Median [MIN, MAX]" = with(res, paste0(median, " [", min, ",", max, "]"))#,
)
}
a1 = age_stats(x1)
a1
# Mean (SD) Median [MIN, MAX]
# "58.2 (13.3)" "60 [16,84]"
a2 = age_stats(x2)
a3 = age_stats(x3)
aa = age_stats(xx)
#依次合并a1、a2、a3、aa
age = cbind(a1, a2) %>% cbind(a3) %>% cbind(aa)
colnames(age) = colnames(sex)
print(age)
# Alive\n(n=245) Dead with tumor\n(n=80)
# Mean (SD) "58.2 (13.3)" "60.4 (13.6)"
# Median [MIN, MAX] "60 [16,84]" "60.5 [18,90]"
# Dead tumor free\n(n=43) Total\n(n=368)
# Mean (SD) "64.8 (13.1)" "59.6 (13.1)"
# Median [MIN, MAX] "67 [35,85]" "61 [16,90]"
Stage
# x$ajcc_pathologic_tumor_stage
stage_stats <- function(x) {
x %>% filter(ajcc_pathologic_tumor_stage != "[Not Available]") %>%
filter(ajcc_pathologic_tumor_stage != "[Discrepancy]") %>%
group_by(ajcc_pathologic_tumor_stage) %>% summarise(stage = n())
}
sg1 = stage_stats(x1)
sg1
# A tibble: 8 × 2
# ajcc_pathologic_tumor_stage stage
# <chr> <int>
# 1 Stage I 131
# 2 Stage II 61
# 3 Stage III 1
# 4 Stage IIIA 31
# 5 Stage IIIB 5
# 6 Stage IIIC 4
# 7 Stage IV 1
# 8 Stage IVA 1
sg2 = stage_stats(x2)
sg3 = stage_stats(x3)
sgx = stage_stats(xx)
sg = full_join(sg1, sg2, by="ajcc_pathologic_tumor_stage") %>%
full_join(sg3, by="ajcc_pathologic_tumor_stage") %>%
full_join(sgx, by="ajcc_pathologic_tumor_stage") %>%
as.data.frame
rownames(sg) = sg[,1]
sg = sg[,-1]
colnames(sg) = colnames(sex)
# chisq test
# total列不用于计算,删掉它
sgx <- sg[, -4]
# 有些stage在分组里没人,是NA,不用于计算p value
sgx <- sgx[!apply(sgx, 1, anyNA),]
# 计算p value
sg.p = chisq.test(sgx)$p.value
# NA的啥都不打印
sgv2 = lapply(1:nrow(sg), function(i) ifelse(is.na(sg[i,]), "",
paste0(sg[i,], " (", sprintf("%1.1f\\%%", sg[i,]/cs * 100), ")"))) %>%
do.call(rbind, .)
#或者用下面这种,打印“NA”字样
#sgv2 = lapply(1:nrow(sg), function(i) ifelse(is.na(sg[i,]), NA,
# paste0(sg[i,], " (", sprintf("%1.1f\\%%", sg[i,]/cs * 100), ")"))) %>%
# do.call(rbind, .)
rownames(sgv2) = rownames(sg)
colnames(sgv2) = colnames(sg)
sgv2
# Alive\n(n=245) Dead with tumor\n(n=80) Dead tumor free\n(n=43)
# Stage I "131 (53.5\\%)" "21 (26.2\\%)" "19 (44.2\\%)"
# Stage II "61 (24.9\\%)" "13 (16.2\\%)" "11 (25.6\\%)"
# Stage III "1 (0.4\\%)" "1 (1.2\\%)" ""
# Stage IIIA "31 (12.7\\%)" "24 (30.0\\%)" "10 (23.3\\%)"
# Stage IIIB "5 (2.0\\%)" "3 (3.8\\%)" ""
# Stage IIIC "4 (1.6\\%)" "3 (3.8\\%)" "2 (4.7\\%)"
# Stage IV "1 (0.4\\%)" "" ""
# Stage IVA "1 (0.4\\%)" "" ""
# Stage IVB "" "2 (2.5\\%)" ""
# Total\n(n=368)
# Stage I "167 (45.4\\%)"
# Stage II "82 (22.3\\%)"
# Stage III "2 (0.5\\%)"
# Stage IIIA "64 (17.4\\%)"
# Stage IIIB "8 (2.2\\%)"
# Stage IIIC "9 (2.4\\%)"
# Stage IV "1 (0.3\\%)"
# Stage IVA ""
# Stage IVB "2 (0.5\\%)"
合并
res = rbind(sex, age) %>% rbind(sgv2) %>% as.data.frame
print(res)
