|
发表于 2023-5-11 17:16:27
|
查看: 6226 |
回复: 3
一、软件介绍
DADA2(Divisive Amplicon Denoising Algorithm 2)是一个用于建模和修正多种测序平台(Illumina、Roche 454)测序扩增子错误的开源软件(R)包。在扩增子分析处理流程中,
DADA2 算法能够准确地推断样本序列并寻找出单个核苷酸的差异(往往能够比其他方法识别出更多真实变体和输出更少的虚假序列)。
DADA2 是一个通过构建错误率模型来推测扩增子序列是否来自模板的算法,以自身数据的错误模型为参数,不用依赖于其他参数分布模型。DADA2 最重要的优势是用了更多的数据,错误模型包含了质量信息,而其他的方法都在过滤低质量之后把序列的质量信息忽略。
DADA2 的错误模型也包括了定量的丰度,而且该模型也计算了各种不同转置的概率。和比较 OTU 数据库的聚类算法不同,DADA2 采取的是降噪算法。聚类算法与降噪算法的差异与优劣。
- 教程:http://benjjneb.github.io/dada2/tutorial.html
- github:https://benjjneb.github.io/dada2/index.html
- 案例:https://astrobiomike.github.io/amplicon/workflow_ex#analysis-in-r
- 案例数据:https://mothur.s3.us-east-2.amazonaws.com/wiki/miseqsopdata.zip
- 练习数据:
- https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/28773936/dada2_amplicon_ex_workflow.tar.gz
复制代码 dada2 要求数据具备下面三个条件。
1、样品已被拆分好,即每个样品一个 fq/fastq 文件(或者双端成对 fq 文件);
2、已经去除非生物核酸序列,比如:引物(primers),接头(adapters or barcodes),linker等;
3、如果样品是下机的双端测序,其应具有双端测序的相匹配的两个 fq 文件。
二、使用 dada2 分析 16S 数据
1、准备工作
安装软件包
- # install.packages('BiocManager',destdir = 'D:/R/Rstudiowork/downloaded_packages/')
- #BiocManager::install("dada2",destdir = 'D:/R/Rstudiowork/downloaded_packages/')
- #BiocManager::install("phyloseq",destdir = 'D:/R/Rstudiowork/downloaded_packages/')
- #BiocManager::install("DECIPHER",destdir = 'D:/R/Rstudiowork/downloaded_packages/')
- #BiocManager::install("vegan")
- #BiocManager::install("msa",destdir = 'D:/R/Rstudiowork/downloaded_packages/')
- # install.packages('shiny',destdir = 'D:/R/Rstudiowork/downloaded_packages/')
- shiny::runGitHub("shiny-phyloseq","joey711")
- install.packages("phangorn",destdir = 'D:/R/Rstudiowork/downloaded_packages/')
复制代码 下载数据
- # MiSeq SOP练习数据
- #https://mothur.s3.us-east-2.amazonaws.com/wiki/miseqsopdata.zip
- #DECIPHER注释库
- #http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData
- #DADA2 SILVA库
- #https://doi.org/10.5281/zenodo.4587954
复制代码 加载软件包
- library(dada2)
- packageVersion("dada2")
- library(phyloseq)
- library(DECIPHER)
- library(ggplot2)
复制代码 设置工作目录
- getwd()
- path <- "./MiSeq_SOP"
- list.files(path)
复制代码
2、读入数据
- #定义reads1
- fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
- fnFs
- #定义reads2
- fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))
- #提取样品名
- sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
- sample.names
复制代码
3、低质量过滤
- #质控
- plotQualityProfile(fnFs)
- plotQualityProfile(fnRs)
- #查看1-2样品
- plotQualityProfile(fnFs[1:2])
- #定义输出文件名
- filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
- filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
- names(filtFs) <- sample.names
- names(filtRs) <- sample.names
- filtFs
- filtRs
- #过滤
- out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(240,160),
- maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
- compress=TRUE, multithread=FALSE)
- # On Windows set multithread=FALSE
- head(out)
- #查看对象
- class(out)
- #查看维度
- dim(out)
- #过滤后质控
- plotQualityProfile(filtFs)
- plotQualityProfile(filtRs)
- plotQualityProfile(filtFs[1:2])
复制代码
4、生成错误率模型
- #生成reads1 错误率模型
- errF <- learnErrors(filtFs, multithread=12)
- #生成reads2 错误率模型
- errR <- learnErrors(filtRs, multithread=12)
- #可视化
- plotErrors(errF, nominalQ=TRUE)
复制代码
5、降噪去重复
- #核心步骤,降噪,聚类,生成ASV
- dadaFs <- dada(filtFs, err=errF, multithread=12)
- dadaRs <- dada(filtRs, err=errR, multithread=12)
- dadaFs[[1]]
- dadaFs$F3D0$clustering
- str(dadaFs[[1]])
复制代码
6、合并 reads
- mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
- #查看结果
- head(mergers[[1]])
- class(mergers)
- length(mergers)
- names(mergers)
- class(mergers$F3D0)
- names(mergers$F3D0)
复制代码
7、生成功能表
- seqtab <- makeSequenceTable(mergers)
- dim(seqtab)
- #统计序列长度分布
- table(nchar(getSequences(seqtab)))
复制代码
8、识别嵌合体
- seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=12, verbose=TRUE)
- dim(seqtab.nochim)
复制代码
9、统计结果
- #定义函数
- getN <- function(x) sum(getUniques(x))
- track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
- colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
- rownames(track) <- sample.names
- head(track)
- #保存结果
- write.table(track, "read-count-tracking.tsv", quote=FALSE, sep="\t", col.names=NA)
复制代码
|
|