--- title: "motifStack Vignette" author: "Jianhong Ou, Michael Brodsky, Lihua Julie Zhu" date: "`r doc_date()`" package: "`r pkg_ver('motifStack')`" bibliography: ref.bib abstract: > The motifStack package is designed for graphic representation of multiple motifs with different similarity scores. It works with both DNA/RNA sequence motif and amino acid sequence motif. In addition, it provides the flexibility for users to customize the graphic parameters such as the font type and symbol colors. vignette: > %\VignetteIndexEntry{motifStack Vignette} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} output: BiocStyle::html_document --- # Introduction A sequence logo, based on information theory, has been widely used as a graphical representation of sequence conservation (aka motif) in multiple amino acid or nucleic acid sequences. Sequence motif represents conserved characteristics such as DNA binding sites, where transcription factors bind, and catalytic sites in enzymes. Although many tools, such as seqlogo\cite{Oliver2006}, have been developed to create sequence motif and to represent it as individual sequence logo, software tools for depicting the relationship among multiple sequence motifs are still lacking. We developed a flexible and powerful open-source R/Bioconductor package, motifStack, for visualization of the alignment of multiple sequence motifs. # Prepare environment You will need ghostscript. The full path to the executable can be set using the environment variable R\_GSCMD. If this is unset, environment variable gs (Unix, Linux or Mac) or GSC (Window) will be searched by name on your path. If gs/GSC is unset, GhostScript executable "gswi64c.exe" and "gswin32c.exe" are tried sequentially. Here is an example on Windows assuming that the gswin32c.exe is installed at C:\\ Program Files\\ gs\\ gs9.06\\ bin. In a R session, please try: ```{r setup, eval=FALSE, echo=TRUE} Sys.setenv(R_GSCMD=file.path("C:", "Program Files", "gs", "gs9.06", "bin", "gswin32c.exe")) ``` # Examples of using motifStack ## plot a DNA sequence logo with different fonts and colors Users can select different fonts and colors to draw the sequence logo. ```{r DNAseqLogo,fig.cap="Plot a DNA sequence logo with different fonts and colors",fig.width=8,fig.height=6} suppressPackageStartupMessages(library(motifStack)) pcm <- read.table(file.path(find.package("motifStack"), "extdata", "bin_SOLEXA.pcm")) pcm <- pcm[,3:ncol(pcm)] rownames(pcm) <- c("A","C","G","T") motif <- new("pcm", mat=as.matrix(pcm), name="bin_SOLEXA") ##pfm object #motif <- pcm2pfm(pcm) #motif <- new("pfm", mat=motif, name="bin_SOLEXA") opar<-par(mfrow=c(4,1)) plot(motif) #plot the logo with same height plot(motif, ic.scale=FALSE, ylab="probability") #try a different font plot(motif, font="mono,Courier") #try a different font and a different color group motif@color <- colorset(colorScheme='basepairing') plot(motif,font="Times") par(opar) ``` ## plot a RNA sequence logo To plot a RNA sequence logo, you only need to change the rowname of the matrix from "T" to "U" as follows. ```{r RNAseqLogo,fig.cap="Plot an RNA sequence logo",fig.width=6,fig.height=3} rna <- pcm rownames(rna)[4] <- "U" motif <- new("pcm", mat=as.matrix(rna), name="RNA_motif") plot(motif) ``` ## plot an amino acid sequence logo Given that motifStack allows to use any letters as symbols, it can also be used to draw amino acid sequence logos. ```{r AAseqLogo,fig.cap="Plot an sequence logo with any symbols as you want such as amino acid sequence logo",fig.width=6,fig.height=3} library(motifStack) protein<-read.table(file.path(find.package("motifStack"),"extdata","cap.txt")) protein<-t(protein[,1:20]) motif<-pcm2pfm(protein) motif<-new("pfm", mat=motif, name="CAP", color=colorset(alphabet="AA",colorScheme="chemistry")) plot(motif) ``` ## plot sequence logo stack To show multiple motifs on the same canvas as a sequence logo stack, the distance of motifs need to be calculated first. By default, MotIV\cite{@Eloi2010}::`motifDistances` ( R implementation of STAMP\cite{@Mahony2007}) is used to calculate the distance. After alignment, users can use `plotMotifLogoStack`, `plotMotifLogoStackWithTree` or `plotMotifStackWithRadialPhylog` to draw sequence logos in different layouts. To make it easy to use, we integrated different functionalities into one workflow function named as `motifStack`. ```{r logostack,fig.cap="Plot motifs with sequence logo stack style",fig.width=4,fig.height=6} suppressPackageStartupMessages(library(motifStack)) #####Input##### pcms<-readPCM(file.path(find.package("motifStack"), "extdata"),"pcm$") motifs<-lapply(pcms,pcm2pfm) ## plot stacks #motifStack(motifs, layout="stack", ncex=1.0) ``` ```{r treestack,fig.cap="Sequence logo stack with hierarchical cluster tree",fig.width=5,fig.height=6} ## plot stacks with hierarchical tree #motifStack(motifs, layout="tree") ``` ```{r radialstack,fig.cap="Plot motifs in a radial style when the number of motifs is too much to be shown in a vertical stack",fig.width=6,fig.height=6} ## When the number of motifs is too much to be shown in a vertical stack, ## motifStack can draw them in a radial style. ## random sample from MotifDb library("MotifDb") matrix.fly <- query(MotifDb, "Dmelanogaster") motifs2 <- as.list(matrix.fly) ## use data from FlyFactorSurvey motifs2 <- motifs2[grepl("Dmelanogaster\\-FlyFactorSurvey\\-", names(motifs2))] ## format the names names(motifs2) <- gsub("Dmelanogaster_FlyFactorSurvey_", "", gsub("_FBgn\\d+$", "", gsub("[^a-zA-Z0-9]","_", gsub("(_\\d+)+$", "", names(motifs2))))) motifs2 <- motifs2[unique(names(motifs2))] pfms <- sample(motifs2, 50) ## creat a list of object of pfm motifs2 <- lapply(names(pfms), function(.ele, pfms){new("pfm",mat=pfms[[.ele]], name=.ele)} ,pfms) ## trim the motifs motifs2 <- lapply(motifs2, trimMotif, t=0.4) ## setting colors library(RColorBrewer) color <- brewer.pal(12, "Set3") ## plot logo stack with radial style motifStack(motifs2, layout="radialPhylog", circle=0.3, cleaves = 0.2, clabel.leaves = 0.5, col.bg=rep(color, each=5), col.bg.alpha=0.3, col.leaves=rep(color, each=5), col.inner.label.circle=rep(color, each=5), inner.label.circle.width=0.05, col.outer.label.circle=rep(color, each=5), outer.label.circle.width=0.02, circle.motif=1.2, angle=350) ``` ## plot a sequence logo cloud We can also plot a sequence logo cloud for DNA motifs. ```{r motifCloud,fig.cap="Sequence logo cloud with rectangle packing layout",fig.width=6,fig.height=6} ## assign groups for motifs groups <- rep(paste("group",1:5,sep=""), each=10) names(groups) <- names(pfms) ## assign group colors group.col <- brewer.pal(5, "Set3") names(group.col)<-paste("group",1:5,sep="") ## use MotIV to calculate the distances of motifs jaspar.scores <- MotIV::readDBScores(file.path(find.package("MotIV"), "extdata", "jaspar2010_PCC_SWU.scores")) d <- MotIV::motifDistances(lapply(pfms, pfm2pwm)) hc <- MotIV::motifHclust(d, method="average") ## convert the hclust to phylog object phylog <- hclust2phylog(hc) ## reorder the pfms by the order of hclust leaves <- names(phylog$leaves) pfms <- pfms[leaves] ## create a list of pfm objects pfms <- lapply(names(pfms), function(.ele, pfms){ new("pfm",mat=pfms[[.ele]], name=.ele)} ,pfms) ## extract the motif signatures motifSig <- motifSignature(pfms, phylog, groupDistance=0.01, min.freq=1) ## draw the motifs with a tag-cloud style. motifCloud(motifSig, scale=c(6, .5), layout="rectangles", group.col=group.col, groups=groups, draw.legend=TRUE) ``` ## plot grouped sequence logo Grouped sequence logo can also be plotted in radial phylogeny tree style. ```{r motifRadialPhylog,fig.cap="Grouped sequence logo with radialPhylog style layout",fig.width=6,fig.height=6} ## get the signatures from object of motifSignature sig <- signatures(motifSig) ## set the inner-circle color for each signature gpCol <- sigColor(motifSig) ## plot the logo stack with radial style. plotMotifStackWithRadialPhylog(phylog=phylog, pfms=sig, circle=0.4, cleaves = 0.3, clabel.leaves = 0.5, col.bg=rep(color, each=5), col.bg.alpha=0.3, col.leaves=rep(rev(color), each=5), col.inner.label.circle=gpCol, inner.label.circle.width=0.03, angle=350, circle.motif=1.2, motifScale="logarithmic") ``` ## motifCircos We can also plot it with circos style. In circos style, we can plot two group of motifs and with multiple color rings. ```{r motifCircos,fig.cap="Grouped sequence logo with circos style layout",fig.width=6,fig.height=6} ## plot the logo stack with radial style. motifCircos(phylog=phylog, pfms=pfms, pfms2=sig, col.tree.bg=rep(color, each=5), col.tree.bg.alpha=0.3, col.leaves=rep(rev(color), each=5), col.inner.label.circle=gpCol, inner.label.circle.width=0.03, col.outer.label.circle=gpCol, outer.label.circle.width=0.03, r.rings=c(0.02, 0.03, 0.04), col.rings=list(sample(colors(), 50), sample(colors(), 50), sample(colors(), 50)), angle=350, motifScale="logarithmic") ``` ## motifPiles We can also plot the motifs in pile style. In pile style, we can plot two group of motifs with multiple types of annotation. ```{r motifPiles,fig.cap="Grouped sequence logo with piles style layout",fig.width=6,fig.height=6} ## plot the logo stack with radial style. motifPiles(phylog=phylog, pfms=pfms, pfms2=sig, col.tree=rep(color, each=5), col.leaves=rep(rev(color), each=5), col.pfms2=gpCol, r.anno=c(0.02, 0.03, 0.04), col.anno=list(sample(colors(), 50), sample(colors(), 50), sample(colors(), 50)), motifScale="logarithmic", plotIndex=TRUE, groupDistance=0.01) ``` # plot motifs with d3.js Interactive plot can be generated using `browseMotifs` function which leverages the __d3.js__ library. All motifs on the plot are draggable and the plot can be easily exported as a **Scalable Vector Graphics (SVG)** file. ```{r browseMotifs,fig.width=8,fig.height=8} browseMotifs(pfms = pfms, phylog = phylog, layout="tree", yaxis = FALSE, baseWidth=6, baseHeight = 15) ``` Plot the motifs in radialPhylog layout. ```{r browseMotifsRadialPhylog,fig.width=8,fig.height=8} browseMotifs(pfms = pfms, phylog = phylog, layout="radialPhylog", yaxis = FALSE, xaxis = FALSE, baseWidth=6, baseHeight = 15) ``` # docker container for motifStack [Docker](https://docs.docker.com/) container allows software to be packaged into containers which can be run in any platform using a virtual machine called boot2docker. Users can download the [motifStack docker](https://hub.docker.com/r/jianhong/motifstack_1.13.6/) image using the following code snippet.
docker pull jianhong/motifstack_1.13.6
cd ~ ## in windows, please try cd c:\\ Users\\ username
mkdir tmp4motifstack ## this will be the share folder for your host and container.
docker run -ti --rm -v ${PWD}/tmp4motifstack:/volume/data jianhong/motifstack_1.13.6 R
## in R
setwd("/tmp")
library(motifStack)
packageVersion("motifStack")
pcmpath <- "pcmsDatasetFly"
pcms <- readPCM(pcmpath)
pfms <- lapply(pcms, pcm2pfm)
matalign_path <- "/usr/bin/matalign"
neighbor_path <- "/usr/bin/phylip/neighbor"
outpath <- "output"
system(paste("perl MatAlign2tree.pl --in . --pcmpath", pcmpath, "--out", outpath,
    "--matalign", matalign_path, "--neighbor", neighbor_path, "--tree","UPGMA"))
newickstrUPGMA <- readLines(con=file.path(outpath, "NJ.matalign.distMX.nwk"))
phylog <- newick2phylog(newickstrUPGMA, FALSE)
leaves <- names(phylog$leaves)
motifs <- pfms[leaves]
motifSig <- motifSignature(motifs, phylog, groupDistance=2, min.freq=1, trim=.2)
sig <- signatures(motifSig)
gpCol <- sigColor(motifSig)
leaveNames <- gsub("^Dm_", "", leaves)
pdf("/volume/data/test.pdf", width=8, height=11)
motifPiles(phylog=phylog, DNAmotifAlignment(motifs), sig, 
    col.pfms=gpCol, col.pfms.width=.01, 
    col.pfms2=gpCol, col.pfms2.width=.01, 
    labels.leaves=leaveNames, 
    plotIndex=c(FALSE, TRUE), IndexCex=1, 
    groupDistance=2, clabel.leaves=1)
dev.off()
You will see the test.pdf file in the folder of tmp4motifstack. # Session Info ```{r sessionInfo} sessionInfo() ```