\name{MultipleAlignment-class} \docType{class} % Classes: \alias{class:MultipleAlignment} \alias{MultipleAlignment-class} \alias{MultipleAlignment} \alias{class:DNAMultipleAlignment} \alias{DNAMultipleAlignment-class} \alias{DNAMultipleAlignment} \alias{class:RNAMultipleAlignment} \alias{RNAMultipleAlignment-class} \alias{RNAMultipleAlignment} \alias{class:AAMultipleAlignment} \alias{AAMultipleAlignment-class} \alias{AAMultipleAlignment} % Accessor-like methods: \alias{unmasked,MultipleAlignment-method} \alias{rownames,MultipleAlignment-method} \alias{rownames<-,MultipleAlignment-method} \alias{rowmask} \alias{rowmask,MultipleAlignment-method} \alias{rowmask<-} \alias{rowmask<-,MultipleAlignment,NULL-method} \alias{rowmask<-,MultipleAlignment,NormalIRanges-method} \alias{rowmask<-,MultipleAlignment,ANY-method} \alias{colmask} \alias{colmask,MultipleAlignment-method} \alias{colmask<-} \alias{colmask<-,MultipleAlignment,NULL-method} \alias{colmask<-,MultipleAlignment,NormalIRanges-method} \alias{colmask<-,MultipleAlignment,ANY-method} \alias{maskMotif,MultipleAlignment,ANY-method} \alias{maskGaps} \alias{maskGaps,MultipleAlignment-method} \alias{nrow,MultipleAlignment-method} \alias{ncol,MultipleAlignment-method} \alias{dim,MultipleAlignment-method} \alias{maskednrow} \alias{maskednrow,MultipleAlignment-method} \alias{maskedncol} \alias{maskedncol,MultipleAlignment-method} \alias{maskeddim} \alias{maskeddim,MultipleAlignment-method} \alias{maskedratio,MultipleAlignment-method} \alias{nchar,MultipleAlignment-method} \alias{xsbasetype,MultipleAlignment-method} % Read functions: \alias{read.DNAMultipleAlignment} \alias{read.RNAMultipleAlignment} \alias{read.AAMultipleAlignment} % Write functions: \alias{write.phylip} % Coercion: \alias{coerce,MultipleAlignment,DNAStringSet-method} \alias{coerce,MultipleAlignment,RNAStringSet-method} \alias{coerce,MultipleAlignment,AAStringSet-method} \alias{coerce,MultipleAlignment,BStringSet-method} 
\alias{coerce,character,DNAMultipleAlignment-method} \alias{coerce,character,RNAMultipleAlignment-method} \alias{coerce,character,AAMultipleAlignment-method} \alias{as.character,MultipleAlignment-method} \alias{as.matrix,MultipleAlignment-method} % Utilities: \alias{consensusMatrix,MultipleAlignment-method} \alias{consensusString,MultipleAlignment-method} \alias{consensusString,DNAMultipleAlignment-method} \alias{consensusString,RNAMultipleAlignment-method} \alias{consensusString,AAMultipleAlignment-method} \alias{consensusViews} \alias{consensusViews,MultipleAlignment-method} \alias{consensusViews,DNAMultipleAlignment-method} \alias{consensusViews,RNAMultipleAlignment-method} \alias{consensusViews,AAMultipleAlignment-method} \alias{alphabetFrequency,MultipleAlignment-method} % show style methods: \alias{show,MultipleAlignment-method} \alias{detail,MultipleAlignment-method} \title{MultipleAlignment objects} \description{ The MultipleAlignment class is a container for storing multiple sequence alignments. } \usage{ ## Constructors: DNAMultipleAlignment(x=character(), start=NA, end=NA, width=NA, use.names=TRUE, rowmask=NULL, colmask=NULL) RNAMultipleAlignment(x=character(), start=NA, end=NA, width=NA, use.names=TRUE, rowmask=NULL, colmask=NULL) AAMultipleAlignment(x=character(), start=NA, end=NA, width=NA, use.names=TRUE, rowmask=NULL, colmask=NULL) ## Read functions: read.DNAMultipleAlignment(filepath, format) read.RNAMultipleAlignment(filepath, format) read.AAMultipleAlignment(filepath, format) ## Write functions: write.phylip(x, filepath) ## ... and more (see below) } \arguments{ \item{x}{ Either a character vector (with no NAs), or an \link{XString}, \link{XStringSet} or \link{XStringViews} object containing strings with the same number of characters. 
If writing out a Phylip file, then \code{x} would be a \link{MultipleAlignment} object. } \item{start,end,width}{ Either \code{NA}, a single integer, or an integer vector of the same length as \code{x} specifying how \code{x} should be "narrowed" (see \code{?\link[IRanges:Ranges-utils]{narrow}} for the details). } \item{use.names}{ \code{TRUE} or \code{FALSE}. Should names be preserved? } \item{filepath}{ A character vector (of arbitrary length when reading, of length 1 when writing) containing the paths to the files to read or write. Note that special values like \code{""} or \code{"|cmd"} (typically supported by other I/O functions in R) are not supported here. Also \code{filepath} cannot be a connection. } \item{format}{ Either \code{"fasta"} (the default), \code{"stockholm"}, or \code{"clustal"}. } \item{rowmask}{ a NormalIRanges object that will set masking for rows } \item{colmask}{ a NormalIRanges object that will set masking for columns } } \details{ The MultipleAlignment class is designed to hold and represent multiple sequence alignments. The rows and columns within an alignment can be masked for ad hoc analyses. } \section{Accessor methods}{ In the code snippets below, \code{x} is a MultipleAlignment object. \describe{ \item{}{ \code{unmasked(x)}: The underlying \link{XStringSet} object containing the multiple sequence alignment. } \item{}{ \code{rownames(x)}: \code{NULL} or a character vector of the same length as \code{x} containing a short user-provided description or comment for each sequence in \code{x}. } \item{}{ \code{rowmask(x)}, \code{rowmask(x, append, invert) <- value}: Gets and sets the \link{NormalIRanges} object representing the masked rows in \code{x}. The \code{append} argument takes \code{union}, \code{replace} or \code{intersect} to indicate how to combine the new \code{value} with \code{rowmask(x)}. The \code{invert} argument takes a logical argument to indicate whether or not to invert the new mask. 
The \code{value} argument can be of any class that is coercible to a \link{NormalIRanges} via the \code{as} function. } \item{}{ \code{colmask(x)}, \code{colmask(x, append, invert) <- value}: Gets and sets the \link{NormalIRanges} object representing the masked columns in \code{x}. The \code{append} argument takes \code{union}, \code{replace} or \code{intersect} to indicate how to combine the new \code{value} with \code{colmask(x)}. The \code{invert} argument takes a logical argument to indicate whether or not to invert the new mask. The \code{value} argument can be of any class that is coercible to a \link{NormalIRanges} via the \code{as} function. } \item{}{ \code{maskMotif(x, motif, min.block.width=1, ...)}: Returns a MultipleAlignment object with a modified column mask based upon motifs found in the consensus string where the consensus string keeps all the columns but drops the masked rows. \describe{ \item{motif}{The motif to mask.} \item{min.block.width}{The minimum width of the blocks to mask.} \item{...}{Additional arguments for \code{matchPattern}.} } } \item{}{ \code{maskGaps(x, min.fraction, min.block.width)}: Returns a MultipleAlignment object with a modified column mask based upon gaps in the columns. In particular, this mask is defined by \code{min.block.width} or more consecutive columns that have \code{min.fraction} or more of their non-masked rows containing gap codes. \describe{ \item{min.fraction}{A value in \code{[0, 1]} that indicates the minimum fraction needed to call a gap in the consensus string (default is \code{0.5}).} \item{min.block.width}{A positive integer that indicates the minimum number of consecutive gaps to mask, as defined by \code{min.fraction} (default is \code{4}).} } } \item{}{ \code{nrow(x)}: Returns the number of sequences aligned in \code{x}. } \item{}{ \code{ncol(x)}: Returns the number of characters for each alignment in \code{x}. } \item{}{ \code{dim(x)}: Equivalent to \code{c(nrow(x), ncol(x))}. 
} \item{}{ \code{maskednrow(x)}: Returns the number of masked aligned sequences in \code{x}. } \item{}{ \code{maskedncol(x)}: Returns the number of masked aligned characters in \code{x}. } \item{}{ \code{maskeddim(x)}: Equivalent to \code{c(maskednrow(x), maskedncol(x))}. } \item{}{ \code{maskedratio(x)}: Equivalent to \code{maskeddim(x) / dim(x)}. } \item{}{ \code{nchar(x)}: Returns the number of unmasked aligned characters in \code{x}, i.e. \code{ncol(x) - maskedncol(x)}. } \item{}{ \code{alphabet(x)}: Equivalent to \code{alphabet(unmasked(x))}. } } } \section{Coercion}{ In the code snippets below, \code{x} is a MultipleAlignment object. \describe{ \item{}{ \code{as(x, "DNAStringSet")}, \code{as(x, "RNAStringSet")}, \code{as(x, "AAStringSet")}, \code{as(x, "BStringSet")}: Creates an instance of the specified \link{XStringSet} object subtype that contains the unmasked regions of the multiple sequence alignment in \code{x}. } \item{}{ \code{as.character(x, use.names)}: Convert \code{x} to a character vector containing the unmasked regions of the multiple sequence alignment. \code{use.names} controls whether or not \code{rownames(x)} should be used to set the names of the returned vector (default is \code{TRUE}). } \item{}{ \code{as.matrix(x, use.names)}: Returns a character matrix containing the "exploded" representation of the unmasked regions of the multiple sequence alignment. \code{use.names} controls whether or not \code{rownames(x)} should be used to set the row names of the returned matrix (default is \code{TRUE}). } } } \section{Utilities}{ In the code snippets below, \code{x} is a MultipleAlignment object. \describe{ \item{}{ \code{consensusMatrix(x, as.prob, baseOnly)}: Creates an integer matrix containing the column frequencies of the underlying alphabet with masked columns being represented with \code{NA} values. If \code{as.prob} is \code{TRUE}, then probabilities are reported, otherwise counts are reported (the default). 
If \code{baseOnly} is \code{TRUE}, then the non-base letters are collapsed into an \code{"other"} category. } \item{}{ \code{consensusString(x, ...)}: Creates a consensus string for \code{x} with the symbol \code{"#"} representing a masked column. See \code{\link{consensusString}} for details on the arguments. } \item{}{ \code{consensusViews(x, ...)}: Similar to the \code{consensusString} method. It returns a \link{XStringViews} on the consensus string containing subsequence contigs of non-masked columns. Unlike the \code{consensusString} method, the masked columns in the underlying string contain a consensus value rather than the \code{"#"} symbol. } \item{}{ \code{alphabetFrequency(x, as.prob, collapse)}: Creates an integer matrix containing the row frequencies of the underlying alphabet. If \code{as.prob} is \code{TRUE}, then probabilities are reported, otherwise counts are reported (the default). If \code{collapse} is \code{TRUE}, then returns the overall frequency instead of the frequency by row. } \item{}{ \code{detail(x, invertColMask, hideMaskedCols)}: Allows for a full pager driven display of the object so that masked cols and rows can be removed and the entire sequence can be visually inspected. If \code{hideMaskedCols} is set to its default value of \code{TRUE} then the output will hide all the masked columns in the output. Otherwise, all columns will be displayed along with a row to indicate the masking status. If \code{invertColMask} is \code{TRUE} then any displayed mask will be flipped so as to represent things in a way consistent with Phylip style files instead of the mask that is actually stored in the \code{MultipleAlignment} object. Please notice that \code{invertColMask} will be ignored if \code{hideMaskedCols} is set to its default value of \code{TRUE} since in that case it will not make sense to show any masking information in the output. Masked rows are always hidden in the output. } } } \author{P. Aboyoun and M. 
Carlson} \seealso{ \link{XStringSet-class}, \link{MaskedXString-class} } \examples{ ## create an object from file origMAlign <- read.DNAMultipleAlignment(filepath = system.file("extdata", "msx2_mRNA.aln", package="Biostrings"), format="clustal") ## list the names of the sequences in the alignment rownames(origMAlign) ## rename the sequences to be the underlying species for MSX2 rownames(origMAlign) <- c("Human","Chimp","Cow","Mouse","Rat", "Dog","Chicken","Salmon") origMAlign ## See a detailed pager view if (interactive()) { detail(origMAlign) } ## operations to mask rows ## For columns, just use colmask() and do the same kinds of operations rowMasked <- origMAlign rowmask(rowMasked) <- IRanges(start=1,end=3) rowMasked ## remove row masks rowmask(rowMasked) <- NULL rowMasked ## "select" rows of interest rowmask(rowMasked, invert=TRUE) <- IRanges(start=4,end=7) rowMasked ## or mask the rows that intersect with masked rows rowmask(rowMasked, append="intersect") <- IRanges(start=1,end=5) rowMasked ## TATA-masked tataMasked <- maskMotif(origMAlign, "TATA") colmask(tataMasked) ## automatically mask columns based on consecutive gaps autoMasked <- maskGaps(origMAlign, min.fraction=0.5, min.block.width=4) colmask(autoMasked) autoMasked ## calculate frequencies alphabetFrequency(autoMasked) consensusMatrix(autoMasked, baseOnly=TRUE)[, 84:90] ## get consensus values consensusString(autoMasked) consensusViews(autoMasked) ## cluster the masked alignments sdist <- stringDist(as(autoMasked,"DNAStringSet"), method="hamming") clust <- hclust(sdist, method = "single") plot(clust) fourgroups <- cutree(clust, 4) fourgroups ## write out the alignment object (with current masks) to Phylip format write.phylip(x = autoMasked, filepath = tempfile("foo.txt",tempdir())) } \keyword{methods} \keyword{classes}