第五步:模块功能分析GO和KEGG
- # 清空环境变量
- rm(list = ls())
- getwd()
- setwd('/home/xhs/helix/level2/L64_Codes/6_4_WGCNA_Analysis/')
- # 加载包
- library(WGCNA)
- # 加载表达矩阵
- load("data/Step01-WGCNA_input.Rda")
- # 读入临床信息
- clinical = read.table("data/clinical.txt",stringsAsFactors = TRUE, header = T,row.names = 1,na.strings = "",sep = "\t")
- # 查看临床信息
- head(clinical)
- # 对表达矩阵进行预处理
- datTraits = as.data.frame(do.call(cbind,lapply(clinical, as.numeric)))
- rownames(datTraits) = rownames(clinical)
- # 对样本进行聚类
- sampleTree2 = hclust(dist(datExpr), method = "average")
- # 将临床信息转换为颜色,白色表示低,红色表示高,灰色表示缺失
- traitColors = numbers2colors(datTraits, signed = FALSE)
- pdf(file = "figures/Step04-Sample_dendrogram_and_trait_heatmap.pdf", width = 24);
- # 样本聚类图与样本性状热图
- plotDendroAndColors(sampleTree2,
- traitColors,
- groupLabels = names(datTraits),
- main = "Sample dendrogram and trait heatmap")
- dev.off()
- #### 网络的分析
- ###### 基因模块与临床信息的关系
- # 加载构建的网络
- load(file = "data/Step03-Step_by_step_buildnetwork.rda")
- # 对模块特征矩阵进行排序
- MEs=orderMEs(MEs)
- #计算模型特征矩阵和样本信息矩阵的相关度。
- moduleTraitCor=cor(MEs, datTraits, use="p")
- write.table(file="data/Step04-modPhysiological.cor.xls",moduleTraitCor,sep="\t",quote=F)
- moduleTraitPvalue=corPvalueStudent(moduleTraitCor, nSamples)
- write.table(file="data/Step04-modPhysiological.p.xls",moduleTraitPvalue,sep="\t",quote=F)
- #使用labeledHeatmap()将上述相关矩阵和p值可视化。
- pdf(file="figures/Step04-Module_trait_relationships.pdf",width=9,height=7)
- textMatrix=paste(signif(moduleTraitCor,2),"\n(",signif(moduleTraitPvalue,1),")",sep="")
- dim(textMatrix)=dim(moduleTraitCor)
- # 基因模块与临床信息相关性图
- labeledHeatmap(Matrix=moduleTraitCor,#模块和表型的相关性矩阵,这个参数最重要,其他可以不变
- xLabels=colnames(datTraits),
- yLabels=names(MEs),
- ySymbols=names(MEs),
- colorLabels=FALSE,
- colors=blueWhiteRed(50),
- textMatrix=textMatrix,
- setStdMargins=FALSE,
- cex.text=0.5,
- cex.lab=0.5,
- zlim=c(-1,1),
- main=paste("Module-trait relationships"))
- dev.off()
- ## 单一模块与某一表型相关性
- M_stage = as.data.frame(datTraits$M_stage)
- # 分析自己感兴趣的临床信息,此处以M_stage为示例
- names(M_stage) = "M_stage"
- # 模块对应的颜色
- modNames = substring(names(MEs), 3)
- # 计算基因模块特征
- geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
- MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
- # 对结果进行命名
- names(geneModuleMembership) = paste("MM", modNames, sep="");
- names(MMPvalue) = paste("p.MM", modNames, sep="");
- # 计算M分期基因特征显著性
- geneTraitSignificance = as.data.frame(cor(datExpr, M_stage, use = "p"));
- GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
- # 对结果进行命名
- names(geneTraitSignificance) = paste("GS.", names(M_stage), sep="")
- names(GSPvalue) = paste("p.GS.", names(M_stage), sep="")
- # 设置需要分析的模块名称,此处为brown模块
- module = "brown"
- # 提取brown模块数据
- column = match(module, modNames);
- moduleGenes = moduleColors==module;
- # 可视化brown模块与M分期的相关性分析结果
- sizeGrWindow(7, 7);
- pdf(file="figures/Step04-Module_membership_vs_gene_significance.pdf")
- par(mfrow = c(1,1));
- verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
- abs(geneTraitSignificance[moduleGenes, 1]),
- xlab = paste("Module Membership in", module, "module"),
- ylab = "Gene significance for M Stage",
- main = paste("Module membership vs. gene significance\n"),
- cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
- dev.off()
- ## 单一特征与所有模块关联分析
- GS = as.numeric(cor(datTraits$M_stage,datExpr, use="p"))
- GeneSignificance = abs(GS);
- pdf(file="figures/Step04-Gene_significance_for_M_stage_across_module.pdf",width=9,height=5)
- plotModuleSignificance(GeneSignificance, moduleColors, ylim=c(0,0.2), main="Gene significance for M stage across module");
- dev.off()
- ## 模块中的hub基因
- #### 为每一个模块寻找hub基因
- HubGenes <- chooseTopHubInEachModule(datExpr,#WGCNA分析输入的表达矩阵
- moduleColors)#模块颜色信息
- # 保存hub基因结果
- write.table (HubGenes,file = "data/Step04-HubGenes_of_each_module.xls",quote=F,sep='\t',col.names = F)
- #### 与某种特征相关的hub基因
- NS = networkScreening(datTraits$M_stage,#M分期
- MEs,#
- datExpr)#WGCNA分析输入的表达矩阵
- # 将结果写入到文件
- write.table(NS,file="data/Step04-Genes_for_M_stage.xls",quote=F,sep='\t')
- ## 模块GO/KEGG分析
- # 加载R包
- library(anRichment)
- library(clusterProfiler)
- ##### GO分析
- # 构建GO背景基因集
- GOcollection = buildGOcollection(organism = "human")
- geneNames = colnames(datExpr)
- # 将基因SYMBOL转换为ENTREZID基因名
- geneID = bitr(geneNames,fromType = "SYMBOL", toType = "ENTREZID",
- OrgDb = "org.Hs.eg.db", drop = FALSE)
- # 将基因名对应结果写入文件中
- write.table(geneID, file = "data/Step04-geneID_map.xls", sep = "\t", quote = TRUE, row.names = FALSE)
- # 进行GO富集分析
- GOenr = enrichmentAnalysis(classLabels = moduleColors,#基因所在的模块信息
- identifiers = geneID$ENTREZID,
- refCollection = GOcollection,
- useBackground = "given",
- threshold = 1e-4,
- thresholdType = "Bonferroni",
- getOverlapEntrez = TRUE,
- getOverlapSymbols = TRUE,
- ignoreLabels = "grey");
- # 提取结果,并写入结果到文件
- tab = GOenr$enrichmentTable
- names(tab)
- write.table(tab, file = "data/Step04-GOEnrichmentTable.xls", sep = "\t", quote = TRUE, row.names = FALSE)
- # 提取主要结果,并写入文件
- keepCols = c(1, 3, 4, 6, 7, 8, 13)
- screenTab = tab[, keepCols]
- # 小数位为2位
- numCols = c(4, 5, 6)
- screenTab[, numCols] = signif(apply(screenTab[, numCols], 2, as.numeric), 2)
- # 给结果命名
- colnames(screenTab) = c("module", "GOID", "term name", "p-val", "Bonf", "FDR", "size")
- rownames(screenTab) = NULL
- # 查看结果
- head(screenTab)
- # 写入文件中
- write.table(screenTab, file = "data/Step04-GOEnrichmentTableScreen.xls", sep = "\t", quote = TRUE, row.names = FALSE)
- ##### KEGG富集分析
- # AnRichment没有直接提供KEGG数据的背景集
- # 这里使用MSigDBCollection构建C2通路数据集
- KEGGCollection = MSigDBCollection("data/msigdb_v7.1.xml", MSDBVersion = "7.1",
- organism = "human",
- excludeCategories = c("h","C1","C3","C4","C5","C6","C7"))
- # KEGG分析
- KEGGenr = enrichmentAnalysis(classLabels = moduleColors,
- identifiers = geneID$ENTREZID,
- refCollection = KEGGCollection,
- useBackground = "given",
- threshold = 1e-4,
- thresholdType = "Bonferroni",
- getOverlapEntrez = TRUE,
- getOverlapSymbols = TRUE,
- ignoreLabels = "grey")
- # 提取KEGG结果,并写入文件
- tab = KEGGenr$enrichmentTable
- names(tab)
- write.table(tab, file = "data/Step04-KEGGEnrichmentTable.xls", sep = "\t", quote = TRUE, row.names = FALSE)
- # 提取主要结果并写入文件
- keepCols = c(1, 3, 4, 6, 7, 8, 13)
- screenTab = tab[, keepCols]
- # 取两位有效数字
- numCols = c(4, 5, 6)
- screenTab[, numCols] = signif(apply(screenTab[, numCols], 2, as.numeric), 2)
- # 对结果表格进行重命名
- colnames(screenTab) = c("module", "ID", "term name", "p-val", "Bonf", "FDR", "size")
- rownames(screenTab) = NULL
- # 查看结果
- head(screenTab)
- # 写入文件中
- write.table(screenTab, file = "data/Step04-KEGGEnrichmentTableScreen.xls", sep = "\t", quote = TRUE, row.names = FALSE)
- ### 输出cytoscape可视化
- # 重新计算TOM,power值设置为前面选择好的
- TOM = TOMsimilarityFromExpr(datExpr, power = 7)
- # 输出全部网络模块
- cyt = exportNetworkToCytoscape(TOM,
- edgeFile = "data/Step04-CytoscapeInput-edges-all.txt",#基因间的共表达关系
- nodeFile = "data/Step04-CytoscapeInput-nodes-all.txt",#
- weighted = TRUE,
- threshold = 0.1,
- nodeNames = geneID$SYMBOL,
- altNodeNames = geneID$ENTREZID,
- nodeAttr = moduleColors)
- # 输出感兴趣网络模块
- modules = c("brown", "red")
- # 选择上面模块中包含的基因
- inModule = is.finite(match(moduleColors, modules))
- modGenes = geneID[inModule,]
- # 选择指定模块的TOM矩阵
- modTOM = TOM[inModule, inModule]
- # 输出为Cytoscape软件可识别格式
- cyt = exportNetworkToCytoscape(modTOM,
- edgeFile = paste("data/Step04-CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
- nodeFile = paste("data/Step04-CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
- weighted = TRUE,
- threshold = 0.2,
- nodeNames = modGenes$SYMBOL,
- altNodeNames = modGenes$ENTREZID,
- nodeAttr = moduleColors[inModule])
复制代码 第六步:Cytoscape可视化
|