|
发表于 2022-10-10 09:25:38
|
查看: 7319 |
回复: 0
背景
分析原始count数据,除了DEseq2包还有一个R包熟为人知,就是edgeR包。今天带来的是教程和实战,使用的数据还是前一篇推文中的。可以关注公众号回复"20221009"获取。
教程地址:
文档:
- http://www.bioconductor.org/packages/release/workflows/vignettes/RNAseq123/inst/doc/designmatrices.html
复制代码 英文:
- http://www.bioconductor.org/packages/release/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html
复制代码 中文:
- http://www.bioconductor.org/packages/release/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow_CHN.html
复制代码- rm(list=ls())
- library(limma)
- library(Glimma)
- library(edgeR)
- setwd('/home/xhs/jyxy/14/')
复制代码
一、读取数据
1.1 方法一:读取 featureCounts 表达矩阵
- countdata <- read.csv("CountMatrix.csv",header = T,row.names = 1)
- head(countdata)
- #过滤表达矩阵
- nrow(countdata)
- countdata <- countdata[rowSums(countdata)>10,]
- nrow(countdata) #21281
- #读取样品信息
- coldata <- read.csv("sample.csv",header = T)
- head(coldata)
- #生成DGElist对象
- dge <- DGEList(counts = countdata, genes = row.names(countdata),group = as.factor(coldata$dex))
- dge
- #添加样品信息
- dge$samples$cell <- as.factor(coldata$cell)
- #备份一份
- dge2 <- dge
复制代码
1.2 方法二:读取 htseq 多个文件表达信息
- setwd("./htseq/")
- files <- dir(pattern = "tsv")
- files
- #查看文件内容
- read.delim(files[1], nrow=5,header = F)
- #生成DGElist对象
- x <- readDGE(files = files,columns = c(1,2))
- x
- #查看属性
- class(x)
- x$samples
- x$counts
- dim(x)
- samplenames <- substring(colnames(x),1,nchar(colnames(x)))
- samplenames
- colnames(x) <- samplenames
- x$samples$group <- as.factor(coldata$dex)
- x$samples$cell <- as.factor(coldata$cell)
- x$samples
复制代码
二、DGElist 对象探索
- barplot(colSums(dge$counts),las=2,col = rep(rainbow(4),each=2))
- opar <- par(no.readonly = TRUE)
- par(mar=c(7.1,4.1,4.1,2.1))
- barplot(colSums(dge$counts),las=2,col = rep(rainbow(4),each=2))
- par(opar)
复制代码
不同样品 reads count
三、对数据计算标准化因子
- # Normalization method: "TMM","TMMwsp","RLE","upperquartile","none"
- dge.tmm <- calcNormFactors(dge,method = "TMM")
- dge.tmm$samples
- plotMDS(dge.tmm)
- dge.tmmwsp <- calcNormFactors(dge,method = "TMMwsp")
- dge.tmmwsp$samples
- plotMDS(dge.tmmwsp)
- dge.upperquartile <- calcNormFactors(dge,method = "upperquartile")
- dge.upperquartile$samples
- plotMDS(dge.upperquartile)
- dge.rle <- calcNormFactors(dge,method = "RLE") # DESeq2, cuffdiff
- dge.rle$samples
- plotMDS(dge.rle)
- dge.none <- calcNormFactors(dge,method = "none")
- dge.none$samples
- plotMDS(dge.none)
- dge <- calcNormFactors(dge,method = "TMM")
- dge
- dge$samples
- #检查样本中的异常值,使用plotMDS()作图。
- plotMDS(dge)
- plotMDS(dge,labels = dge$samples$cell,col=rep(rainbow(4),each=2))
复制代码
MDS 图
四、差异表达计算
- #1 生成实验矩阵
- model.matrix( ~ group,data = dge$samples)
- design.matrix <- model.matrix( ~ cell + group ,data = dge$samples)
- rownames(design.matrix) <- rownames(dge$samples)
- design.matrix
- #2 估计离散系数(dispersion)
- dge <- estimateDisp(dge,design = design.matrix)
- dge$common.dispersion
- dge$tagwise.dispersion
- #可以用BCV plot查看离散系数;
- plotBCV(dge, cex = 0.8)
复制代码
BCV plot 离散系数
- # plot var and mean
- plotMeanVar(dge, show.raw=TRUE, show.tagwise=TRUE, show.binned=TRUE)
复制代码
var and mean 图
- #3 exactTest检验
- dge_res <- exactTest(dge)
- dge_results <- as.data.frame(topTags(dge_res,n=nrow(dge$counts),sort.by = "logFC"))
- View(dge_results)
- #3 最大似然法检测
- fit <- glmFit(y = dge, design = design.matrix)
- lrt <- glmLRT(fit, coef=2)
- lrt_results <- as.data.frame(topTags(lrt,n=nrow(dge$counts),sort.by = "logFC"))
- View(lrt_results)
- #4 筛选差异表达基因
- write.csv(x = dge_results,file = "dge_results.csv")
- #筛选出差异表达基因,logFC >=1或者<=-1,并且q值小于0.05的基因
- res <- na.omit(dge_results) #21281
- res0.05 <- res[(abs(res$logFC)>=1 & res$FDR <=0.05), ]
- nrow(res0.05)#1531
- #筛选出差异表达基因,logFC >=2,并且q值小于0.01的基因
- res0.01 <- res[(abs(res$logFC)>=1 & res$FDR <=0.01), ]
- nrow(res0.01)#1286
- write.csv(res0.05,file = "res0.05.csv")
- #找出差异表达最显著的基因,按log2差异倍数排序
- head(res0.05[order(res0.05$logFC,decreasing = TRUE), ])
- head(res0.05[order(res0.05$FDR,decreasing = TRUE), ])
复制代码
五、结果可视化
X-axis (A) 表示两个样品的⼏何平均数;
Y-axis (M) 表示两个样品的 fold-change;
- #绘制MA plot
- select.sign.gene = decideTestsDGE(dge_res, p.value = 0.01)
- select.sign.gene_id = rownames(dge_res)[as.logical(select.sign.gene)]
- plotSmear(dge_res, de.tags = select.sign.gene_id, cex = 0.5,ylim=c(-4,4))
- abline(h = c(-2, 2), col = "blue")
复制代码
MA 图
- #绘制差异基因散点图(类似火山图);
- plotMD(dge_res,ylim=c(-10,10))
- abline(h=c(-1, 1), col="green")
复制代码
差异基因散点图
- #热图
- cpm <- cpm(dge$counts, log=FALSE)
- geneid <- rownames(res0.05[order(res0.05$logFC,decreasing = TRUE), ])
- cpm <- cpm[as.factor(geneid[1:50]),]
- library(pheatmap)
- pheatmap(cpm)
复制代码
差异基因表达热图
|
本帖子中包含更多资源
您需要 登录 才可以下载或查看,没有账号?立即注册
|