背景
TCGA临床数据直接用R代码画出三线表。

使用场景
输入数据是临床样品信息,例如TCGA提供的clinical information。
根据年龄、性别、种族、病毒感染、吸烟史等特征进行分类对比,可能发现疾病、癌症发病、转移、耐药跟这些特征的关系,例如发现亚洲非吸烟女性肺癌发病率高。
下载TCGA临床数据
或者从 TCGA 泛癌分析 提供的页面中,下载临床数据
https://gdc.cancer.gov/about-data/publications/pancanatlas
再用excel筛选LIHC的数据保存为“TCGA_LIHC_377.xlsx”
按列分成4组
即 Alive、Dead with tumor、Dead tumor free、Total。
library(dplyr)
library(openxlsx)
x = read.xlsx("TCGA_LIHC_377.xlsx")
#%>%,是管道操作符,将上一个函数的输出作为下一个函数的输入
#其实用for循环一个一个嵌套也能实现,就是写起来太烦
#用dplyr包的管道操作符,无论读写都变得清晰容易了
x$tumor_status
#挑出生存状态是“Alive”的行
x1 = x %>% filter(vital_status == "Alive") # 245
#挑出生存状态是“Dead”并且有tumor的行
x2 = x %>% filter(vital_status == "Dead" & tumor_status == "WITH TUMOR") # 80
x3 = x %>% filter(vital_status == "Dead" & tumor_status == "TUMOR FREE") # 43
#把没信息的行去掉,用于算总数
xx = x %>% filter(vital_status != "" & tumor_status != "") # 358
按项细分
性别
#还记得前面提取的x1是生存状态是“Alive”的行,把这些行按性别分组
s1 = x1 %>% group_by(gender) %>% summarise(alive=n())
s1
# # A tibble: 2 × 2
#   gender alive
#   <chr>  <int>
# 1 FEMALE    71
# 2 MALE     174
s2 = x2 %>% group_by(gender) %>% summarise(tumor=n())
s2
# # A tibble: 2 × 2
#   gender tumor
#   <chr>  <int>
# 1 FEMALE    31
# 2 MALE      49
s3 = x3 %>% group_by(gender) %>% summarise(tumor_free=n())
s3
# # A tibble: 2 × 2
#   gender tumor_free
#   <chr>       <int>
# 1 FEMALE         17
# 2 MALE           26
#依次合并s1、s2、s3
sex = full_join(s1, s2, by='gender') %>% full_join(s3, by='gender') %>% as.data.frame
sex
#   gender alive tumor tumor_free
# 1 FEMALE    71    31         17
# 2   MALE   174    49         26
rn = sex[,1]#取出第一列,后面作为行名
sex = sex[,-1]#先把不需要计算的第一列删除
#计算p value
sex.p = chisq.test(sex)$p.value
print(sex.p)
#算总数
sex$total = rowSums(sex)
cs = colSums(sex)
#算百分比、写到括号里
#用sprintf格式化为小数点后保留1位,加上百分号
#用paste0加到个数后面,默认是sep="",比paste代码更简洁
sex <- rbind(paste0(sex[1,], " (", sprintf("%1.1f\\%%", sex[1,]/cs*100), ")"),
    paste0(sex[2,], " (", sprintf("%1.1f\\%%", sex[2,]/cs*100), ")"))
#加上行名、列名
rownames(sex) = rn
colnames(sex) = paste0(c("Alive", "Dead with tumor", "Dead tumor free", "Total"), 
                "\n(n=", cs, ")")
print(sex)
#        Alive\n(n=245)  Dead with tumor\n(n=80) Dead tumor free\n(n=43)
# FEMALE "71 (29.0\\%)"  "31 (38.8\\%)"          "17 (39.5\\%)"     
# MALE   "174 (71.0\\%)" "49 (61.3\\%)"          "26 (60.5\\%)"     
#        Total\n(n=368) 
# FEMALE "119 (32.3\\%)"
# MALE   "249 (67.7\\%)"
年龄
#先写个函数,用到dplyr包
#有些年龄数据缺失,需要去掉
age_stats <- function(x) {
   res <- x %>% summarise(age = round(mean(age_at_initial_pathologic_diagnosis,na.rm = T), 1), 
                sd=round(sd(age_at_initial_pathologic_diagnosis,na.rm = T), 1),
                median=round(median(age_at_initial_pathologic_diagnosis,na.rm = T), 1),
                min=round(min(age_at_initial_pathologic_diagnosis,na.rm = T), 1),
                max=round(max(age_at_initial_pathologic_diagnosis,na.rm = T), 1)
                )
   c("Mean (SD)" = with(res, paste0(age, " (", sd, ")")),
       "Median [MIN, MAX]" = with(res, paste0(median, " [", min, ",", max, "]"))#,
   )         
}
a1 = age_stats(x1)
a1
    #     Mean (SD) Median [MIN, MAX] 
    # "58.2 (13.3)"      "60 [16,84]"
a2 = age_stats(x2)
a3 = age_stats(x3)
aa = age_stats(xx)
#依次合并a1、a2、a3、aa
age = cbind(a1, a2) %>% cbind(a3) %>% cbind(aa)
colnames(age) = colnames(sex)
print(age)
#                   Alive\n(n=245) Dead with tumor\n(n=80)
# Mean (SD)         "58.2 (13.3)"  "60.4 (13.6)"      
# Median [MIN, MAX] "60 [16,84]"   "60.5 [18,90]"     
#                   Dead tumor free\n(n=43) Total\n(n=368)
# Mean (SD)         "64.8 (13.1)"           "59.6 (13.1)" 
# Median [MIN, MAX] "67 [35,85]"            "61 [16,90]" 
Stage
# x$ajcc_pathologic_tumor_stage
stage_stats <- function(x) {
    x %>% filter(ajcc_pathologic_tumor_stage != "[Not Available]") %>% 
        filter(ajcc_pathologic_tumor_stage != "[Discrepancy]") %>% 
        group_by(ajcc_pathologic_tumor_stage) %>% summarise(stage = n())
}
sg1 = stage_stats(x1)
sg1
# A tibble: 8 × 2
#   ajcc_pathologic_tumor_stage stage
#   <chr>                       <int>
# 1 Stage I                       131
# 2 Stage II                       61
# 3 Stage III                       1
# 4 Stage IIIA                     31
# 5 Stage IIIB                      5
# 6 Stage IIIC                      4
# 7 Stage IV                        1
# 8 Stage IVA                       1
sg2 = stage_stats(x2)
sg3 = stage_stats(x3)
sgx = stage_stats(xx)
sg = full_join(sg1, sg2, by="ajcc_pathologic_tumor_stage") %>% 
    full_join(sg3, by="ajcc_pathologic_tumor_stage") %>% 
    full_join(sgx, by="ajcc_pathologic_tumor_stage") %>% 
    as.data.frame
rownames(sg) = sg[,1]
sg = sg[,-1]
colnames(sg) = colnames(sex)
# chisq test
# total列不用于计算,删掉它
sgx <- sg[, -4]
# 有些stage在分组里没人,是NA,不用于计算p value
sgx <- sgx[!apply(sgx, 1, anyNA),]
# 计算p value
sg.p = chisq.test(sgx)$p.value
# NA的啥都不打印
sgv2 = lapply(1:nrow(sg), function(i) ifelse(is.na(sg[i,]), "", 
    paste0(sg[i,], " (", sprintf("%1.1f\\%%", sg[i,]/cs * 100), ")"))) %>% 
    do.call(rbind, .)
#或者用下面这种,打印“NA”字样
#sgv2 = lapply(1:nrow(sg), function(i) ifelse(is.na(sg[i,]), NA, 
#    paste0(sg[i,], " (", sprintf("%1.1f\\%%", sg[i,]/cs * 100), ")"))) %>% 
#    do.call(rbind, .)
rownames(sgv2) = rownames(sg)
colnames(sgv2) = colnames(sg)
sgv2
#            Alive\n(n=245)  Dead with tumor\n(n=80) Dead tumor free\n(n=43)
# Stage I    "131 (53.5\\%)" "21 (26.2\\%)"          "19 (44.2\\%)"     
# Stage II   "61 (24.9\\%)"  "13 (16.2\\%)"          "11 (25.6\\%)"     
# Stage III  "1 (0.4\\%)"    "1 (1.2\\%)"            ""                 
# Stage IIIA "31 (12.7\\%)"  "24 (30.0\\%)"          "10 (23.3\\%)"     
# Stage IIIB "5 (2.0\\%)"    "3 (3.8\\%)"            ""                 
# Stage IIIC "4 (1.6\\%)"    "3 (3.8\\%)"            "2 (4.7\\%)"       
# Stage IV   "1 (0.4\\%)"    ""                      ""                 
# Stage IVA  "1 (0.4\\%)"    ""                      ""                 
# Stage IVB  ""              "2 (2.5\\%)"            ""                 
#            Total\n(n=368) 
# Stage I    "167 (45.4\\%)"
# Stage II   "82 (22.3\\%)" 
# Stage III  "2 (0.5\\%)"   
# Stage IIIA "64 (17.4\\%)" 
# Stage IIIB "8 (2.2\\%)"   
# Stage IIIC "9 (2.4\\%)"   
# Stage IV   "1 (0.3\\%)"   
# Stage IVA  ""         
# Stage IVB  "2 (0.5\\%)"
合并
res = rbind(sex, age) %>% rbind(sgv2) %>% as.data.frame
print(res)
