四、画图
你可以把自己的数据整理成20230905.txt这样的格式,就可以跳过上面调整格式的步骤。 第一列是基因名,后面几列依次是各个sample里motif的pvalue,然后是motif的名字,后面是FPKM值。
mergefpkm<-read.table("20230905.txt",header = T,as.is = 1)
head(mergefpkm)
##melt dataframe and draw figues
# install.packages('reshape2',destdir = 'C:/Users/haish/R/RStudiowork/downloaded_packages')
library(reshape2)
library(ggplot2)
m1<-melt(mergefpkm[,1:9],id.var=c("id","gene"),variable.name = "type",value.name = "-logP")
m2<-melt(mergefpkm[,c(1,9,13:17)],id.var=c("id","gene"),variable.name = "type",value.name = "FPKM")
m3<-rbind(m2,m2[m2$type=="mESC",],m2[m2$type=="mESC",])
# table(m2$type=="mESC")
m4<-cbind(m1,m3)
m5<-m4[,c(1:4,8)]
#set logP range and FPKM range
quantile(m5$`-logP`,probs = c(0:10)/10)
m5[m5$`-logP`>500,]$`-logP` <- 500
quantile(m5$`-logP`,probs = c(0:10)/10)
m5[m5$`-logP`<10,]$`-logP` <- 10
quantile(m5$`-logP`,probs = c(0:10)/10)
quantile(m5$FPKM,probs = c(0:10)/10)
m5[m5$FPKM>20,]$FPKM<-20
quantile(m5$FPKM,probs = c(0:10)/10)
#filtered_motifs.pdf,按照paper的描述筛选出来的motif
p1<-ggplot(m5,aes(type,id,size=`-logP`))+
geom_point(shape=21,aes(fill=FPKM),position =position_dodge(0))+
theme_minimal()+xlab(NULL)+ylab(NULL)+
scale_size_continuous(range=c(1,8))+
scale_fill_gradientn(colours=c("#2381B3","#F0E366"),guide="legend")+
theme(legend.position = "bottom",legend.box = "vertical",panel.grid.major =element_blank() )+
scale_y_discrete(labels=m5$id)
ggsave(file="filtered_motifs.pdf",height=18,width = 9)
#filtered_genes.pdf,上述motif对应的转录因子
p2<-ggplot(m5,aes(type,id,size=`-logP`))+
geom_point(shape=21,aes(fill=FPKM),position =position_dodge(0))+
theme_minimal()+xlab(NULL)+ylab(NULL)+
scale_size_continuous(range=c(1,8))+
scale_fill_gradientn(colours=c("#2381B3","#F0E366"),guide="legend")+
theme(legend.position = "bottom",legend.box = "vertical",panel.grid.major =element_blank(),legend.margin=margin(t= 0, unit='cm'),legend.spacing = unit(0,"in"))+
scale_size_continuous(range=c(0,10))+
scale_y_discrete(labels=m5$gene)
ggsave(file="filtered_genes.pdf",height=20,width = 4)
查看filtered_motifs.pdf,是按照paper的描述筛选出来的motif;
查看filtered_genes.pdf,是上述motif对应的转录因子。
可以看出,比paper的图多出了很多转录因子。而且同一转录因子对应多个motif,paper里的只出现一个motif,作者是怎么处理的呢?
paper作者Jingyi答复:
I think the reason why you have more motif is the different version of motif database we are using. The one I used in 2016 paper was an old one with relatively less motif included.
If one gene has multiple enrichment value, I choose the highest enrichment value to assign to that gene.
为了模仿出原版的效果,我们把paper里出现的转录因子对应的数据提取出来,重新画图。
##filtering with motifs in the nature paper:The landscape of accessible chromatin in mammalian preimplantation embryos
papermotif<-c("GABPA","NANOG","SOX2","POU5F1","TFAP2C","GATA4","GATA3","GATA1","TEAD4","FOXA1","RARG","NR5A2","ESRRB","CTCF","KLF4")
pmergefpkm<-merge(mergefpkm,data.frame(gene=papermotif),by="gene")
pmergefpkm<-merge(mergefpkm,data.frame(gene=papermotif,sortid=letters[1:length(papermotif)]),by="gene")
pmergefpkm<-pmergefpkm[order(pmergefpkm$sortid),]
pmergefpkm$id<-apply(pmergefpkm,1,function(x)(paste(x[18],x[9],sep="-")))
# length(pmergefpkm$id) #21
# length(papermotif) #15
#按照文章作者回应:同一个基因取motif表达值最大的
mean(pmergefpkm[2:7])
pmergefpkm$meanmotif <- apply(pmergefpkm[2:7], 1, mean)
pmergefpkm <- pmergefpkm[order(pmergefpkm$sortid,pmergefpkm$meanmotif,decreasing = T),]
pmergefpkm <- pmergefpkm[!duplicated(pmergefpkm$gene),]
pmergefpkm<-pmergefpkm[order(pmergefpkm$sortid),]
pm1<-melt(pmergefpkm[,1:9],id.var=c("id","gene"),variable.name = "type",value.name = "-logP")
pm2<-melt(pmergefpkm[,c(1,9,13:17)],id.var=c("id","gene"),variable.name = "type",value.name = "FPKM")
pm3<-rbind(pm2,pm2[pm2$type=="mESC",],pm2[pm2$type=="mESC",])
pm4<-cbind(pm1,pm3)
pm5<-pm4[,c(1:4,8)]
pm5[pm5$`-logP`>500,]$`-logP`<-500
pm5[pm5$`-logP`<10,]$`-logP`<-10
pm5[pm5$FPKM>20,]$FPKM<-20
#paper_motifs.pdf,paper的图里出现的转录因子对应的motif
p3<-ggplot(pm5,aes(type,id,size=`-logP`))+
geom_point(shape=21,aes(fill=FPKM),position =position_dodge(0))+
theme_minimal()+xlab(NULL)+ylab(NULL)+
scale_size_continuous(range=c(1,15))+
scale_fill_gradientn(colours=c("#2381B3","#F0E366"),guide="legend")+
theme(legend.position = "bottom",legend.box = "vertical",panel.grid.major =element_blank(),legend.margin=margin(t= 0, unit='cm'),legend.spacing = unit(0,"in"))+
scale_size_continuous(range=c(0,10))
ggsave(file="paper_motifs.pdf",height=8.4,width = 9)
#paper_gene.pdf,paper的图里出现的转录因子
ggplot(pm5,aes(type,id,size=`-logP`))+
geom_point(shape=21,aes(fill=FPKM),position =position_dodge(0))+
theme_minimal()+
xlab(NULL)+ylab(NULL)+
scale_size_continuous(range=c(1,15))+
scale_fill_gradientn(colours=c("#2381B3","#F0E366"),guide="legend")+
theme(legend.position = "bottom",legend.box = "vertical",panel.grid.major =element_blank(),
legend.margin=margin(t= 0, unit='cm'),legend.spacing = unit(0,"in"))+
scale_size_continuous(range=c(0,10))+scale_y_discrete(labels=pm5$gene)
ggsave(file="paper_gene.pdf",height=8.4,width = 4.3)