module的可重复性reproducible (or preserved)
上面找出来的module是否真的存在?它是否会随机出现?需要通过模块的preservation分析,证实module的可重复性。
由于做preservation分析需要两个表达矩阵,即训练和测试,这里我随机切分之前的表达矩阵,来对训练数据集做保守性分析,当然最好一开始就做切分,我这里直接再次切分,给大家提供参考。
1. 建立训练集和测试集
library(caret)
inTraining <- createDataPartition(datExpr$NM_008655, p = 0.75, list = FALSE)
train<- datExpr[inTraining,]
test<-datExpr[-inTraining,]
setLabels = c("Train", "Test");
multiExpr = list(Train = list(data = train), Test = list(data = test));
moduleColors = labels2colors(net$colors)
multiColor = list(Train = moduleColors);
2. preservation分析
nPermutations官网上给了200,此处为节省时间,nPermutations设置为20
```{r}
# > gsg = goodSamplesGenes(train, verbose = 3);
# Flagging genes and samples with too many missing values...
# ..step 1
# > gsg$allOK
# [1] TRUE
# > gsg = goodSamplesGenes(test, verbose = 3);
# Flagging genes and samples with too many missing values...
# ..step 1
# ..Excluding 1 genes from the calculation due to too many missing samples or zero variance.
# ..step 2
# > gsg$allOK
# [1] FALSE
# test需要去除不好的样本或基因
dim(train)
dim(test)
if (!gsg$allOK){
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(test)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(test)[!gsg$goodSamples], collapse = ", ")));
# Remove the offending genes and samples from the data:
test = test[gsg$goodSamples, gsg$goodGenes]
}
# Removing genes: NM_008963
train = train[, gsg$goodGenes]# 也去除test的“不好的”基因
datExpr = datExpr[, gsg$goodGenes]# 也去除test的“不好的”基因,所以一开始分好训练和测试集避免后续验证出问题。刚好这里就是重新找模块,因为少了1个基因。power假定还取12
net = blockwiseModules(datExpr, power = 12,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "MyTOM",
verbose = 3)
table(net$colors)
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
# 9 488 421 341 322 262 237 223 214 214 155 53 41 39 35
# 一共15个模块
setLabels = c("Train", "Test");
multiExpr = list(Train = list(data = train), Test = list(data = test));
moduleColors = labels2colors(net$colors)
multiColor = list(Train = moduleColors);
mp =testmp =trainmp = modulePreservation(multiExpr, multiColor,
referenceNetworks = 1,
nPermutations = 20,
randomSeed = 1,
quickCor = 0,
verbose = 3)
save(mp,file = "mp.Rda") #这个很费时间,建议保存一下
推荐的阈值:
Z大于10,代表strong preserved,好的module
大于2小于10代表weak preserved
小于2代表not preserved,不好的module
参考文献:PMID: 21283776
ref = 1
test = 2
statsObs = cbind(mp$quality$observed[[ref]][[test]][, -1], mp$preservation$observed[[ref]][[test]][, -1])
statsZ = cbind(mp$quality$Z[[ref]][[test]][, -1], mp$preservation$Z[[ref]][[test]][, -1]);
dfmp <- cbind(statsObs[, c("medianRank.pres", "medianRank.qual")],
signif(statsZ[, c("Zsummary.pres", "Zsummary.qual")], 2))
dfmp前几行如下,我对z排序了:

其中grey和gold的Z低于2,去掉这两个module
modColors = rownames(mp$preservation$observed[[ref]][[test]])
moduleSizes = mp$preservation$Z[[ref]][[test]][, 1]
plotMods = !(modColors %in% c("grey", "gold"))
preservation的可视化
# Text labels for points
text = modColors[plotMods];
# Auxiliary convenience variable
plotData = cbind(mp$preservation$observed[[ref]][[test]][, 2], mp$preservation$Z[[ref]][[test]][, 2])
# Main titles for the plot
mains = c("Preservation Median rank", "Preservation Zsummary");
sizeGrWindow(10, 5);
pdf("3preservation.pdf",width = 20, height = 10)
par(mfrow = c(1,2))
par(mar = c(4.5,4.5,2.5,1))
for (p in 1:2){
min = min(plotData[, p], na.rm = TRUE);
max = max(plotData[, p], na.rm = TRUE);
# Adjust ploting ranges appropriately
if (p==2){
if (min > -max/10) min = -max/10
ylim = c(min - 0.1 * (max-min), max + 0.1 * (max-min))
} else
ylim = c(min - 0.1 * (max-min), max + 0.1 * (max-min))
plot(moduleSizes[plotMods], plotData[plotMods, p], col = 1, bg = modColors[plotMods], pch = 21,
main = mains[p],
cex = 2.4,
ylab = mains[p], xlab = "Module size", log = "x",
ylim = ylim,
xlim = c(10, 2000), cex.lab = 1.2, cex.axis = 1.2, cex.main =1.4)
labelPoints(moduleSizes[plotMods], plotData[plotMods, p], text, cex = 1, offs = 0.08);
# For Zsummary, add threshold lines
if (p==2){
abline(h=0)
abline(h=2, col = "blue", lty = 2)
abline(h=10, col = "darkgreen", lty = 2)
}
}
dev.off()


把gene module输出到文件
输出每个module内的基因,用于后续分析作图,随便你DIY
color<-unique(moduleColors)
for (i in 1:length(color)) {
y=t(assign(paste(color[i],"expr",sep = "."),datExpr[moduleColors==color[i]]))
write.csv(y,paste(color[i],"csv",sep = "."),quote = F)
}
gene module的深入分析
表型与模块的相关性
通过计算表型与module的相关性,找出相关性高的gene module,推测可能是因为它们造成了表型差异。
samples=read.csv('Sam_info.txt',sep = '\t',row.names = 1)
moduleLabelsAutomatic = net$colors
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
moduleColorsWW = moduleColorsAutomatic
MEs0 = moduleEigengenes(datExpr, moduleColorsWW)$eigengenes
MEsWW = orderMEs(MEs0)
modTraitCor = cor(MEsWW, samples, use = "p")
colnames(MEsWW)
modlues=MEsWW
modTraitP = corPvalueStudent(modTraitCor, nSamples)
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
pdf("4Module-trait.pdf",width = 6, height = 6)
labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(samples), yLabels = names(MEsWW), cex.lab = 0.5, yColorWidth=0.01,
xColorWidth = 0.03,
ySymbols = colnames(modlues), colorLabels = FALSE, colors = blueWhiteRed(50),
textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1)
, main = paste("Module-trait relationships"))
dev.off()

此处brown的gene module与NOD_notx和NOD_KLA的相关性高,推测brown module里的基因可能对NOD组的贡献较大。
gene module内基因表达谱的拟合
例子二里的数据是时间序列,作者提取表达量最高的30个基因,取平均值(averaged gene expression of top 30)绘制表达谱曲线。
此处取brown的前30个基因,每种细胞取1个样本,假装是5个时间点:
df30<-read.csv("midnightblue.csv",row.names = 1)[1:30,]
#每种细胞取1个样本,假装是5个时间点
df30time<-df30[,c(1,5,9,13,17)]
#算30个基因的平均值,取log2
dfave<-data.frame(log2(colMeans(df30time)))
dfave$t<-1:5
colnames(dfave)<-c("avelog2","time")
head(dfave)
dfave前几行如下:
