bioinfoer

找回密码
立即注册
搜索
热搜: 活动 交友 discuz
发新帖

0

收听

12

听众

341

主题
发表于 2021-10-20 08:48:04 | 查看: 5451| 回复: 3
本帖最后由 生信喵 于 2021-10-20 08:52 编辑

第一步:安装所需的R包
  1. rm(list = ls()) #### 魔幻操作,一键清空~
  2. getwd()
  3. setwd('/home/xhs/helix/level2/L64_Codes/6_4_WGCNA_Analysis/')
  4. # 设置国内镜像,安装时运行一次即可
  5. options("repos"="https://mirrors.ustc.edu.cn/CRAN/")

  6. if(!"BiocManager"%in%installed.packages())
  7. install.packages("BiocManager",update = F,ask = F)

  8. options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")

  9. # 如果使用R4.0版本需要安装Rtools40
  10. # 下载网站https://cran.r-project.org/bin/windows/Rtools/

  11. # 安装GO.db
  12. if(!"GO.db"%in%installed.packages())
  13. BiocManager::install("GO.db")

  14. # 安装flashClust
  15. if(!"flashClust"%in%installed.packages())
  16. BiocManager::install("flashClust")

  17. # 安装WGCNA
  18. if(!"WGCNA"%in%installed.packages())
  19. BiocManager::install("WGCNA")

  20. # 安装org.Hs.eg.db
  21. if(!"org.Hs.eg.db"%in%installed.packages())
  22. BiocManager::install("org.Hs.eg.db")

  23. # 安装clusterProfiler
  24. if(!"clusterProfiler"%in%installed.packages())
  25. BiocManager::install("clusterProfiler")

  26. # 安装最新版rlang
  27. BiocManager::install("rlang")

  28. # 安装富集分析包
  29. source("https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/GeneAnnotation/installAnRichment.R");
  30. # 上面无法加载成功时,用以下代码本地加载
  31. source("installAnRichment.R")
  32. installAnRichment()

  33. install.packages("anRichmentMethods_0.91-94.tar.gz", repos = NULL, type = "source")
  34. install.packages("anRichment_1.10-1.tar.gz", repos = NULL, type = "source")

  35. # 判断当前目录下是否存在data文件夹
  36. # 存在时,忽略
  37. # 不存在,创建data文件夹储存输入文件和结果文件
  38. if(!dir.exists("data")) dir.create("data")
  39. # 测试figures文件夹是否存在,
  40. # 存在时忽略,不存在时创建
  41. # figures文件夹储存所有的结果图片
  42. if(!dir.exists("figures")) dir.create("figures")

复制代码
第二步:整理数据
  1. rm(list = ls())
  2. library(WGCNA)

  3. # 读取基因表达矩阵数据
  4. fpkm = read.table("data/fpkm.txt", header = T, row.names = 1, check.names = F)
  5. head(fpkm)

  6. ### 选取基因方法 ###

  7. ## 第一种,通过标准差选择
  8. ## 计算每个基因的标准差
  9. fpkm_sd = apply(fpkm,1,sd)#1是对每一行,2是对每一列
  10. head(WGCNA_matrix)
  11. ## 使用标准差对基因进行降序排序
  12. fpkm_sd_sorted = order(fpkm_sd, decreasing = T)
  13. ## 选择前5000个标准差较大的基因
  14. fpkm_num = fpkm_sd_sorted[1:5000]
  15. ## 从表达矩阵提取基因
  16. fpkm_filter = fpkm[fpkm_num,]
  17. ## 对表达矩阵进行转置
  18. WGCNA_matrix = t(fpkm_filter)#变成行名是样本,列名是基因
  19. ## 保存过滤后的数据
  20. save(WGCNA_matrix, file = "data/Step01-fpkm_sd_filter.Rda")

  21. ## 第二种,使用绝对中位差选择,推荐使用绝对中位差
  22. WGCNA_matrix = t(fpkm[order(apply(fpkm,1,mad), decreasing = T)[1:5000],])#mad代表绝对中位差
  23. save(WGCNA_matrix, file = "data/Step01-fpkm_mad_filter.Rda")

  24. ## 第三种,使用全基因
  25. WGCNA_matrix = t(fpkm)
  26. save(WGCNA_matrix, file = "data/Step01-fpkm_allgene.Rda")


  27. ### 去除缺失值较多的基因/样本 ###
  28. rm(list = ls())
  29. # 加载表达矩阵
  30. load(file = "data/Step01-fpkm_mad_filter.Rda")
  31. # 加载WGCNA包
  32. library(WGCNA)
  33. # 判断是否缺失较多的样本和基因
  34. datExpr0 = WGCNA_matrix
  35. gsg = goodSamplesGenes(datExpr0, verbose = 3)

  36. # 是否需要过滤,TRUE表示不需要,FALSE表示需要
  37. gsg$allOK
  38. # 当gsg$allOK为TRUE,以下代码不会运行,为FALSE时,运行以下代码过滤样本
  39. if (!gsg$allOK)
  40. {
  41. # 打印移除的基因名和样本名
  42. if (sum(!gsg$goodGenes)>0)
  43. printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
  44. if (sum(!gsg$goodSamples)>0)
  45. printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
  46. # 提取保留的基因和样本
  47. datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
  48. }

  49. ### 通过样本聚类识别离群样本,去除离群样本 ###
  50. sampleTree = hclust(dist(datExpr0), method = "average");#使用hclust函数进行均值聚类
  51. # 绘制样本聚类图确定离群样本
  52. sizeGrWindow(30,9)
  53. pdf(file = "figures/Step01-sampleClustering.pdf", width = 30, height = 9);
  54. par(cex = 0.6);
  55. par(mar = c(0,4,2,0))
  56. plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
  57. cex.axis = 1.5, cex.main = 2)
  58. # 根据上图判断,需要截取的高度参数h
  59. abline(h = 120, col = "red")#在120的地方画条线
  60. dev.off()

  61. # 去除离群得聚类样本,cutHeight参数要与上述得h参数值一致
  62. clust = cutreeStatic(sampleTree, cutHeight = 120, minSize = 10)
  63. table(clust)
  64. # clust
  65. # 0 1 0就是要去除的,1就是要保存的
  66. # 15 162

  67. # clust 1聚类中包含着我们想要得样本,将其提取出来
  68. keepSamples = (clust==1)
  69. datExpr = datExpr0[keepSamples, ]

  70. # 记录基因和样本数,方便后续可视化
  71. nGenes = ncol(datExpr)#基因数
  72. nSamples = nrow(datExpr)#样本数
  73. save(datExpr, nGenes, nSamples,file = "data/Step01-WGCNA_input.Rda")
复制代码
第三步:一步法WGCNA
  1. # 清空所有变量
  2. rm(list = ls())
  3. # 加载包
  4. library(WGCNA)
  5. # 允许多线程运行
  6. enableWGCNAThreads()
  7. # 加载表达矩阵
  8. load("data/Step01-WGCNA_input.Rda")

  9. ### 选择软阈值 ###
  10. powers = c(c(1:10), seq(from = 12, to=20, by=2))
  11. # 进行网络拓扑分析
  12. sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)#β=power,就是软阈值

  13. # 可视化结果
  14. sizeGrWindow(9, 5)
  15. pdf(file = "figures/Step02-SoftThreshold.pdf", width = 9, height = 5);

  16. par(mfrow = c(1,2))#一个画板上,画两个图,一行两列
  17. cex1 = 0.9;
  18. # 无尺度网络阈值得选择
  19. plot(sft$fitIndices[,1],
  20. -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
  21. xlab="Soft Threshold (power)",#x轴
  22. ylab="Scale Free Topology Model Fit,signed R^2",type="n",#y轴
  23. main = paste("Scale independence"));#标题
  24. text(sft$fitIndices[,1],
  25. -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
  26. labels=powers,
  27. cex=cex1,
  28. col="red");

  29. # 用红线标出R^2的参考值
  30. abline(h=0.90,col="red")
  31. # 平均连接度
  32. plot(sft$fitIndices[,1],
  33. sft$fitIndices[,5],
  34. xlab="Soft Threshold (power)",
  35. ylab="Mean Connectivity",
  36. type="n",
  37. main = paste("Mean connectivity"))
  38. text(sft$fitIndices[,1],
  39. sft$fitIndices[,5],
  40. labels=powers,
  41. cex=cex1,
  42. col="red")

  43. dev.off()

  44. # 无尺度网络检验,验证构建的网络是否是无尺度网络
  45. softpower=sft$powerEstimate

  46. ADJ = abs(cor(datExpr,use="p"))^softpower#相关性取绝对值再幂次
  47. k = as.vector(apply(ADJ,2,sum,na.rm=T))#对ADJ的每一列取和,也就是频次

  48. pdf(file = "figures/Step02-scaleFree.pdf",width = 14)
  49. par(mfrow = c(1,2))

  50. hist(k)#直方图
  51. scaleFreePlot(k,main="Check scale free topology")

  52. dev.off()


  53. ### 一步构建网络 ###
  54. net = blockwiseModules(datExpr, #处理好的表达矩阵
  55. power = sft$powerEstimate,#选择的软阈值
  56. TOMType = "unsigned", #拓扑矩阵类型,none表示邻接矩阵聚类,unsigned最常用,构建无方向
  57. minModuleSize = 30,#网络模块包含的最少基因数
  58. reassignThreshold = 0, #模块间基因重分类的阈值
  59. mergeCutHeight = 0.25,#合并相异度低于0.25的模块
  60. numericLabels = TRUE, #true,返回模块的数字标签 false返回模块的颜色标签
  61. pamRespectsDendro = FALSE,#调用动态剪切树算法识别网络模块后,进行第二次的模块比较,合并相关性高的模块
  62. saveTOMs = TRUE,#保存拓扑矩阵
  63. saveTOMFileBase = "data/Step02-fpkmTOM",
  64. verbose = 3)#0,不反回任何信息,>0返回计算过程
  65. # 保存网络构建结果
  66. save(net, file = "data/Step02-One_step_net.Rda")

  67. # 加载网络构建结果
  68. load(file = "data/Step02-One_step_net.Rda")
  69. # 打开绘图窗口
  70. sizeGrWindow(12, 9)
  71. pdf(file = "figures/Step02-moduleCluster.pdf", width = 12, height = 9);
  72. # 将标签转化为颜色
  73. mergedColors = labels2colors(net$colors)
  74. # 绘制聚类和网络模块对应图
  75. plotDendroAndColors(dendro = net$dendrograms[[1]], #hclust函数生成的聚类结果
  76. colors = mergedColors[net$blockGenes[[1]]],#基因对应的模块颜色
  77. groupLabels = "Module colors",#分组标签
  78. dendroLabels = FALSE, #false,不显示聚类图的每个分支名称
  79. hang = 0.03,#调整聚类图分支所占的高度
  80. addGuide = TRUE, #为聚类图添加辅助线
  81. guideHang = 0.05,#辅助线所在高度
  82. main = "Gene dendrogram and module colors")
  83. dev.off()

  84. # 加载TOM矩阵
  85. load("data/Step02-fpkmTOM-block.1.RData")
  86. # 网络特征向量
  87. MEs = moduleEigengenes(datExpr, mergedColors)$eigengenes
  88. # 对特征向量排序
  89. MEs = orderMEs(MEs)
  90. # 可视化模块间的相关性
  91. sizeGrWindow(5,7.5);
  92. pdf(file = "figures/Step02-moduleCor.pdf", width = 5, height = 7.5);

  93. par(cex = 0.9)
  94. plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2),
  95. marHeatmap = c(3,4,1,2), cex.lab = 0.8,
  96. xLabelsAngle = 90)
  97. dev.off()
复制代码


发表于 2021-10-20 08:49:16
  1. ## TOMplot

  2. dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = sft$powerEstimate); #1-相关性=相异性

  3. nSelect = 400
  4. # 随机选取400个基因进行可视化,设置seed值,保证结果的可重复性
  5. set.seed(10);#设置随机种子数
  6. select = sample(nGenes, size = nSelect);#从5000个基因选择400个
  7. selectTOM = dissTOM[select, select];#选择这400*400的矩阵
  8. # 对选取的基因进行重新聚类
  9. selectTree = hclust(as.dist(selectTOM), method = "average")#用hclust重新聚类
  10. selectColors = mergedColors[select];#提取相应的颜色模块
  11. # 打开一个绘图窗口
  12. sizeGrWindow(9,9)
  13. pdf(file = "figures/Step02-TOMplot.pdf", width = 9, height = 9);
  14. # 美化图形的设置
  15. plotDiss = selectTOM^7;
  16. diag(plotDiss) = NA;
  17. # 绘制TOM图
  18. TOMplot(plotDiss, #拓扑矩阵,该矩阵记录了每个节点之间的相关性
  19. selectTree, #基因的聚类结果
  20. selectColors, #基因对应的模块颜色
  21. main = "Network heatmap plot, selected genes")
  22. dev.off()
复制代码
第四步:分步法WGCNA
  1. # 清空环境变量
  2. rm(list = ls())
  3. #加载R包
  4. library(WGCNA)
  5. # 允许多线程运行
  6. enableWGCNAThreads()
  7. # 加载表达矩阵
  8. load("data/Step01-WGCNA_input.Rda")

  9. ## 选择软阈值
  10. sizeGrWindow(9,5);
  11. par(mfrow = c(1,2));
  12. powers = c(c(1:10), seq(from = 12, to=20, by=2))
  13. RpowerTable=pickSoftThreshold(datExpr, powerVector=powers)[[2]]
  14. cex1=0.7
  15. pdf(file="figures/Step03-softThresholding.pdf",width = 14)
  16. par(mfrow=c(1,2))
  17. plot(RpowerTable[,1], -sign(RpowerTable[,3])*RpowerTable[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence"))
  18. text(RpowerTable[,1], -sign(RpowerTable[,3])*RpowerTable[,2], labels=powers,cex=cex1,col="red")

  19. abline(h=0.9,col="red")
  20. plot(RpowerTable[,1], RpowerTable[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
  21. text(RpowerTable[,1], RpowerTable[,5], labels=powers, cex=cex1,col="red")
  22. dev.off()

  23. ## 无尺度网络验证
  24. softpower=7

  25. ADJ = abs(cor(datExpr,use="p"))^softpower
  26. k = as.vector(apply(ADJ,2,sum,na.rm=T))

  27. pdf(file = "figures/Step03-scaleFree.pdf",width = 14)
  28. par(mfrow = c(1,2))
  29. hist(k)
  30. scaleFreePlot(k,main="Check scale free topology")
  31. dev.off()

  32. ## 计算邻接矩阵
  33. adjacency = adjacency(datExpr,power=softpower)
  34. ## 计算TOM拓扑矩阵
  35. TOM = TOMsimilarity(adjacency)
  36. ## 计算相异度
  37. dissTOM = 1- TOM

  38. #模块初步聚类分析
  39. library(flashClust)
  40. geneTree = flashClust(as.dist(dissTOM),method="average")
  41. #绘制层次聚类树
  42. pdf(file = "figures/Step03-GeneClusterTOM-based.pdf")
  43. plot(geneTree,xlab="",sub="",main="Gene clustering on TOM-based",labels=FALSE,hang=0.04)
  44. dev.off()

  45. #构建初步基因模块
  46. #设定基因模块中至少30个基因
  47. minModuleSize=30
  48. # 动态剪切树识别网络模块
  49. dynamicMods = cutreeDynamic(dendro = geneTree,#hclust函数的聚类结果
  50. distM = dissTOM,#
  51. deepSplit = 2,
  52. pamRespectsDendro = FALSE,
  53. minClusterSize = minModuleSize)#设定基因模块中至少30个基因
  54. # 将标签转换为颜色
  55. dynamicColors = labels2colors(dynamicMods)
  56. table(dynamicColors)#看聚类到哪些模块,哪些颜色

  57. pdf(file="figures/Step03-DynamicTreeCut.pdf",width=9,height=5)
  58. plotDendroAndColors(dendro = geneTree,
  59. colors = dynamicColors,
  60. groupLabels = "Dynamic Tree Cut",
  61. dendroLabels = FALSE, hang = 0.03,
  62. addGuide = TRUE,
  63. main = "Gene dendrogram and module colors")
  64. dev.off()


  65. #前面用动态剪切树聚类了一些模块,现在要对这步结果进一步合并,合并相似度大于0.75的模块,降低网络的复杂度
  66. #计算基因模块特征向量
  67. MEList = moduleEigengenes(datExpr,colors=dynamicColors)#计算特征向量
  68. MEs = MEList$eigengenes;#提取特征向量
  69. MEDiss1 = 1-cor(MEs);#计算相异度
  70. METree1 = flashClust(as.dist(MEDiss1),method="average")#对相异度进行flashClust聚类
  71. #设置特征向量相关系数大于0.75
  72. MEDissThres = 0.25;#相异度在0.25以下,也就是相似度大于0.75,对这些模块合并
  73. #合并模块
  74. merge = mergeCloseModules(datExpr, #合并相似度大于0.75的模块
  75. dynamicColors,
  76. cutHeight = MEDissThres,
  77. verbose=3)
  78. mergedColors = merge$colors

  79. table(dynamicColors)#动态剪切树的模块颜色
  80. table(mergedColors)#合并后的模块颜色,可以看到从18个模块变成了14个模块

  81. mergedMEs = merge$newMEs#合并后的14个模块

  82. #重新命名合并后的模块
  83. moduleColors = mergedColors;
  84. colorOrder = c("grey",standardColors(50));
  85. moduleLabels = match(moduleColors,colorOrder)-1;
  86. MEs = mergedMEs;
  87. MEDiss2 = 1-cor(MEs);#计算相异度
  88. METree2 = flashClust(as.dist(MEDiss2),method="average");#对合并后的模块进行聚类

  89. #绘制聚类结果图
  90. pdf(file="figures/Step03-MECombined.pdf",width=12,height=5)
  91. par(mfrow=c(1,2))
  92. plot(METree1,xlab="",sub="",main="Clustering of ME before combined")# METree1是动态剪切树形成的模块
  93. abline(h=MEDissThres,col="red")#相异度为0.25
  94. plot(METree2,xlab="",sub="",main="Clustering of ME after combined")# METree2是合并后的模块
  95. dev.off()

  96. pdf(file="figures/Step03-MergedDynamics.pdf",width=10,height=4)
  97. plotDendroAndColors(dendro = geneTree,#剪切树
  98. colors = cbind(dynamicColors,mergedColors),#将两种方法形成的模块颜色合并在一起
  99. groupLabels = c("Dynamic Tree Cut","Merged Dynamics"),
  100. dendroLabels = FALSE,
  101. hang = 0.03,
  102. addGuide=TRUE,
  103. guideHang=0.05,
  104. main="Gene Dendrogram and module colors")
  105. dev.off()

  106. # 模块中基因数
  107. write.table(table(moduleColors),"data/Step03-MEgeneCount.txt",quote = F,row.names = F)

  108. # 保存构建的网络信息
  109. moduleColors=mergedColors
  110. colorOrder=c("grey", standardColors(50))
  111. moduleLabels=match(moduleColors, colorOrder)-1
  112. MEs=mergedMEs
  113. save(MEs, moduleLabels, moduleColors, geneTree, file="data/Step03-Step_by_step_buildnetwork.rda")




  114. ## 绘制样本间的相关性
  115. load("data/Step01-WGCNA_input.Rda")
  116. load(file="data/Step03-Step_by_step_buildnetwork.rda")


  117. MEs = orderMEs(MEs)

  118. sizeGrWindow(5,7.5);
  119. pdf(file = "figures/Step03-moduleCor.pdf", width = 5, height = 7.5);

  120. par(cex = 0.9)
  121. plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle
  122. = 90)
  123. dev.off()

  124. ## TOMplot

  125. dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = softpower);

  126. nSelect = 400
  127. # 随机选取400个基因进行可视化,s设置seed值,保证结果的可重复性
  128. set.seed(10);
  129. select = sample(nGenes, size = nSelect);
  130. selectTOM = dissTOM[select, select];
  131. # 对选取的基因进行重新聚类
  132. selectTree = hclust(as.dist(selectTOM), method = "average")
  133. selectColors = mergedColors[select];
  134. # 打开一个绘图窗口
  135. sizeGrWindow(9,9)
  136. pdf(file = "figures/Step03-TOMplot.pdf", width = 9, height = 9);
  137. # 美化图形的设置
  138. plotDiss = selectTOM^7;
  139. diag(plotDiss) = NA;
  140. TOMplot(plotDiss, selectTree, selectColors,
  141. main = "Network heatmap plot, selected genes")
  142. dev.off()
复制代码


回复 显示全部楼层 道具 举报

发表于 2021-10-20 08:50:50
第五步:模块功能分析GO和KEGG
  1. # 清空环境变量
  2. rm(list = ls())
  3. getwd()
  4. setwd('/home/xhs/helix/level2/L64_Codes/6_4_WGCNA_Analysis/')
  5. # 加载包
  6. library(WGCNA)
  7. # 加载表达矩阵
  8. load("data/Step01-WGCNA_input.Rda")

  9. # 读入临床信息
  10. clinical = read.table("data/clinical.txt",stringsAsFactors = TRUE, header = T,row.names = 1,na.strings = "",sep = "\t")
  11. # 查看临床信息
  12. head(clinical)
  13. # 对表达矩阵进行预处理
  14. datTraits = as.data.frame(do.call(cbind,lapply(clinical, as.numeric)))
  15. rownames(datTraits) = rownames(clinical)

  16. # 对样本进行聚类
  17. sampleTree2 = hclust(dist(datExpr), method = "average")

  18. # 将临床信息转换为颜色,白色表示低,红色表示高,灰色表示缺失
  19. traitColors = numbers2colors(datTraits, signed = FALSE)

  20. pdf(file = "figures/Step04-Sample_dendrogram_and_trait_heatmap.pdf", width = 24);
  21. # 样本聚类图与样本性状热图
  22. plotDendroAndColors(sampleTree2,
  23. traitColors,
  24. groupLabels = names(datTraits),
  25. main = "Sample dendrogram and trait heatmap")
  26. dev.off()

  27. #### 网络的分析
  28. ###### 基因模块与临床信息的关系
  29. # 加载构建的网络
  30. load(file = "data/Step03-Step_by_step_buildnetwork.rda")
  31. # 对模块特征矩阵进行排序
  32. MEs=orderMEs(MEs)
  33. #计算模型特征矩阵和样本信息矩阵的相关度。
  34. moduleTraitCor=cor(MEs, datTraits, use="p")
  35. write.table(file="data/Step04-modPhysiological.cor.xls",moduleTraitCor,sep="\t",quote=F)
  36. moduleTraitPvalue=corPvalueStudent(moduleTraitCor, nSamples)
  37. write.table(file="data/Step04-modPhysiological.p.xls",moduleTraitPvalue,sep="\t",quote=F)

  38. #使用labeledHeatmap()将上述相关矩阵和p值可视化。
  39. pdf(file="figures/Step04-Module_trait_relationships.pdf",width=9,height=7)
  40. textMatrix=paste(signif(moduleTraitCor,2),"\n(",signif(moduleTraitPvalue,1),")",sep="")
  41. dim(textMatrix)=dim(moduleTraitCor)
  42. # 基因模块与临床信息相关性图
  43. labeledHeatmap(Matrix=moduleTraitCor,#模块和表型的相关性矩阵,这个参数最重要,其他可以不变
  44. xLabels=colnames(datTraits),
  45. yLabels=names(MEs),
  46. ySymbols=names(MEs),
  47. colorLabels=FALSE,
  48. colors=blueWhiteRed(50),
  49. textMatrix=textMatrix,
  50. setStdMargins=FALSE,
  51. cex.text=0.5,
  52. cex.lab=0.5,
  53. zlim=c(-1,1),
  54. main=paste("Module-trait relationships"))
  55. dev.off()


  56. ## 单一模块与某一表型相关性
  57. M_stage = as.data.frame(datTraits$M_stage)
  58. # 分析自己感兴趣的临床信息,此处以M_stage为示例
  59. names(M_stage) = "M_stage"
  60. # 模块对应的颜色
  61. modNames = substring(names(MEs), 3)

  62. # 计算基因模块特征
  63. geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
  64. MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));

  65. # 对结果进行命名
  66. names(geneModuleMembership) = paste("MM", modNames, sep="");
  67. names(MMPvalue) = paste("p.MM", modNames, sep="");

  68. # 计算M分期基因特征显著性
  69. geneTraitSignificance = as.data.frame(cor(datExpr, M_stage, use = "p"));
  70. GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));

  71. # 对结果进行命名
  72. names(geneTraitSignificance) = paste("GS.", names(M_stage), sep="")
  73. names(GSPvalue) = paste("p.GS.", names(M_stage), sep="")

  74. # 设置需要分析的模块名称,此处为brown模块
  75. module = "brown"
  76. # 提取brown模块数据
  77. column = match(module, modNames);
  78. moduleGenes = moduleColors==module;

  79. # 可视化brown模块与M分期的相关性分析结果
  80. sizeGrWindow(7, 7);
  81. pdf(file="figures/Step04-Module_membership_vs_gene_significance.pdf")
  82. par(mfrow = c(1,1));
  83. verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
  84. abs(geneTraitSignificance[moduleGenes, 1]),
  85. xlab = paste("Module Membership in", module, "module"),
  86. ylab = "Gene significance for M Stage",
  87. main = paste("Module membership vs. gene significance\n"),
  88. cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
  89. dev.off()

  90. ## 单一特征与所有模块关联分析
  91. GS = as.numeric(cor(datTraits$M_stage,datExpr, use="p"))
  92. GeneSignificance = abs(GS);
  93. pdf(file="figures/Step04-Gene_significance_for_M_stage_across_module.pdf",width=9,height=5)
  94. plotModuleSignificance(GeneSignificance, moduleColors, ylim=c(0,0.2), main="Gene significance for M stage across module");
  95. dev.off()

  96. ## 模块中的hub基因
  97. #### 为每一个模块寻找hub基因
  98. HubGenes <- chooseTopHubInEachModule(datExpr,#WGCNA分析输入的表达矩阵
  99. moduleColors)#模块颜色信息
  100. # 保存hub基因结果
  101. write.table (HubGenes,file = "data/Step04-HubGenes_of_each_module.xls",quote=F,sep='\t',col.names = F)

  102. #### 与某种特征相关的hub基因
  103. NS = networkScreening(datTraits$M_stage,#M分期
  104. MEs,#
  105. datExpr)#WGCNA分析输入的表达矩阵
  106. # 将结果写入到文件
  107. write.table(NS,file="data/Step04-Genes_for_M_stage.xls",quote=F,sep='\t')

  108. ## 模块GO/KEGG分析
  109. # 加载R包
  110. library(anRichment)
  111. library(clusterProfiler)
  112. ##### GO分析
  113. # 构建GO背景基因集
  114. GOcollection = buildGOcollection(organism = "human")
  115. geneNames = colnames(datExpr)
  116. # 将基因SYMBOL转换为ENTREZID基因名
  117. geneID = bitr(geneNames,fromType = "SYMBOL", toType = "ENTREZID",
  118. OrgDb = "org.Hs.eg.db", drop = FALSE)
  119. # 将基因名对应结果写入文件中
  120. write.table(geneID, file = "data/Step04-geneID_map.xls", sep = "\t", quote = TRUE, row.names = FALSE)
  121. # 进行GO富集分析
  122. GOenr = enrichmentAnalysis(classLabels = moduleColors,#基因所在的模块信息
  123. identifiers = geneID$ENTREZID,
  124. refCollection = GOcollection,
  125. useBackground = "given",
  126. threshold = 1e-4,
  127. thresholdType = "Bonferroni",
  128. getOverlapEntrez = TRUE,
  129. getOverlapSymbols = TRUE,
  130. ignoreLabels = "grey");
  131. # 提取结果,并写入结果到文件
  132. tab = GOenr$enrichmentTable
  133. names(tab)
  134. write.table(tab, file = "data/Step04-GOEnrichmentTable.xls", sep = "\t", quote = TRUE, row.names = FALSE)

  135. # 提取主要结果,并写入文件
  136. keepCols = c(1, 3, 4, 6, 7, 8, 13)
  137. screenTab = tab[, keepCols]
  138. # 小数位为2位
  139. numCols = c(4, 5, 6)
  140. screenTab[, numCols] = signif(apply(screenTab[, numCols], 2, as.numeric), 2)

  141. # 给结果命名
  142. colnames(screenTab) = c("module", "GOID", "term name", "p-val", "Bonf", "FDR", "size")
  143. rownames(screenTab) = NULL

  144. # 查看结果
  145. head(screenTab)
  146. # 写入文件中
  147. write.table(screenTab, file = "data/Step04-GOEnrichmentTableScreen.xls", sep = "\t", quote = TRUE, row.names = FALSE)

  148. ##### KEGG富集分析
  149. # AnRichment没有直接提供KEGG数据的背景集
  150. # 这里使用MSigDBCollection构建C2通路数据集
  151. KEGGCollection = MSigDBCollection("data/msigdb_v7.1.xml", MSDBVersion = "7.1",
  152. organism = "human",
  153. excludeCategories = c("h","C1","C3","C4","C5","C6","C7"))
  154. # KEGG分析
  155. KEGGenr = enrichmentAnalysis(classLabels = moduleColors,
  156. identifiers = geneID$ENTREZID,
  157. refCollection = KEGGCollection,
  158. useBackground = "given",
  159. threshold = 1e-4,
  160. thresholdType = "Bonferroni",
  161. getOverlapEntrez = TRUE,
  162. getOverlapSymbols = TRUE,
  163. ignoreLabels = "grey")
  164. # 提取KEGG结果,并写入文件
  165. tab = KEGGenr$enrichmentTable
  166. names(tab)
  167. write.table(tab, file = "data/Step04-KEGGEnrichmentTable.xls", sep = "\t", quote = TRUE, row.names = FALSE)

  168. # 提取主要结果并写入文件
  169. keepCols = c(1, 3, 4, 6, 7, 8, 13)
  170. screenTab = tab[, keepCols]
  171. # 取两位有效数字
  172. numCols = c(4, 5, 6)
  173. screenTab[, numCols] = signif(apply(screenTab[, numCols], 2, as.numeric), 2)

  174. # 对结果表格进行重命名
  175. colnames(screenTab) = c("module", "ID", "term name", "p-val", "Bonf", "FDR", "size")
  176. rownames(screenTab) = NULL
  177. # 查看结果
  178. head(screenTab)
  179. # 写入文件中
  180. write.table(screenTab, file = "data/Step04-KEGGEnrichmentTableScreen.xls", sep = "\t", quote = TRUE, row.names = FALSE)

  181. ### 输出cytoscape可视化
  182. # 重新计算TOM,power值设置为前面选择好的
  183. TOM = TOMsimilarityFromExpr(datExpr, power = 7)
  184. # 输出全部网络模块
  185. cyt = exportNetworkToCytoscape(TOM,
  186. edgeFile = "data/Step04-CytoscapeInput-edges-all.txt",#基因间的共表达关系
  187. nodeFile = "data/Step04-CytoscapeInput-nodes-all.txt",#
  188. weighted = TRUE,
  189. threshold = 0.1,
  190. nodeNames = geneID$SYMBOL,
  191. altNodeNames = geneID$ENTREZID,
  192. nodeAttr = moduleColors)
  193. # 输出感兴趣网络模块
  194. modules = c("brown", "red")
  195. # 选择上面模块中包含的基因
  196. inModule = is.finite(match(moduleColors, modules))
  197. modGenes = geneID[inModule,]
  198. # 选择指定模块的TOM矩阵
  199. modTOM = TOM[inModule, inModule]

  200. # 输出为Cytoscape软件可识别格式
  201. cyt = exportNetworkToCytoscape(modTOM,
  202. edgeFile = paste("data/Step04-CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
  203. nodeFile = paste("data/Step04-CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
  204. weighted = TRUE,
  205. threshold = 0.2,
  206. nodeNames = modGenes$SYMBOL,
  207. altNodeNames = modGenes$ENTREZID,
  208. nodeAttr = moduleColors[inModule])
复制代码
第六步:Cytoscape可视化

回复 显示全部楼层 道具 举报

发表于 2021-10-20 08:51:40
本帖最后由 生信喵 于 2021-11-1 11:57 编辑

代码中需要用到的输入数据:临床信息和转录组数据。获取示例数据请在公众号"生信喵实验柴"后台回复“20211020”。

本帖子中包含更多资源

您需要 登录 才可以下载或查看,没有账号?立即注册

回复 显示全部楼层 道具 举报

您需要登录后才可以回帖 登录 | 立即注册

QQ|Archiver|手机版|小黑屋|bioinfoer ( 萌ICP备20244422号 )

GMT+8, 2025-1-29 07:39 , Processed in 0.070311 second(s), 34 queries .

Powered by Discuz! X3.5

© 2001-2025 Discuz! Team.

快速回复 返回顶部 返回列表