\name{convert}
\alias{convert}
\title{Efficiently convert strings of characters into integer codes}
\description{
  Efficiently convert strings of characters into integer codes.
}
\usage{
convert(source, levels, byrow=FALSE, aslist=FALSE)
}
\arguments{
  \item{source}{Vector of character strings}
  \item{levels}{Vector of characters used to determine levels }
  \item{byrow}{Boolean. If FALSE (the default), return a matrix with
    one column per string.  If TRUE, return a matrix with one row per
    string.}
  \item{aslist}{Boolean, return matrix (FALSE) or list of vectors
    (TRUE).}
}
\details{
  This function efficiently converts character strings containing
  characters into vectors of integers.  Its primary purpose is to allow
  translation of genotypes stored as character vectors, one character
  per genotype, to a factor-coded matrix. The equivalent code using
  \code{factor} is quite a bit slower, as shown by the last section of
  the example below.

  The \code{levels} argument should be a vector of 1-character strings.
  This vector is used to determine the translation.  The index of 
  matching characters provides the returned integer values. Characters
  not present in \code{levels} will be converted to NA's.
}
\value{
  If \code{aslist=TRUE}, the return value is a list of vectors. Each
  vector will contain the translation of the corresponding input string.

  If \code{aslist=FALSE} (the default), the return value will be a
  matrix. \code{byrow} controls whether each string is converted into
  a column (\code{byrow=FALSE}, the default) or row
  (\code{byrow=TRUE}).

  When \code{byrow=FALSE}, each element of the \code{source} vector is
  converted to a column, and the number of rows will be the number of
  characters in the longest element of the \code{source} vector.  Any
  shorter vectors will be padded with NA's.

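  When \code{byrow=TRUE}, each element of the \code{source} vector is
  converted to a row instead, so the number of columns will be the
  number of characters in the longest element of the \code{source}
  vector, with shorter elements again padded with NA's.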
}

\note{
  Only the first character of each element of \code{levels} is used.
  Any other characters will be ignored.
  }
\author{Gregory R. Warnes \email{warnes@bst.rochester.edu} and Nitin Jain \email{nitin.jain@pfizer.com}}
\seealso{ \code{\link{factor}},  \code{\link{as.factor}} }
\examples{
###
# Toy Genetics Example
###
# 'c' = 'homozygote common allele'
# 'h' = 'heterozygote'
# 'r' = 'homozygote rare allele'
# Genotypes are stored one character per genotype in each string.
marker.data <- c( m1='cchchrcr', m2='chccccrr')
marker.data

# The position of each character in the levels vector gives its integer
# code: 'c' -> 1, 'h' -> 2, 'r' -> 3
convert(marker.data, c('c','h','r'))

###
# simple test example
###
# 'g' and the blank are not present in levels, so they are expected to be
# converted to NA; the strings also differ in length.
source <- c(one='abcabcabc', two='abc','ggg',buckle='aaa',my='bbb',
            'shoe  '='bgb  ')
levels <- c('a','b','c','d')

# default: matrix with one column per string
convert(source,levels)
# aslist=TRUE: list of vectors, one per input string
convert(source,levels,aslist=TRUE)
# byrow=TRUE: matrix with one row per string
convert(source,levels,byrow=TRUE)


###
# compare efficiency with equivalent code using 'factor'
###
\dontrun{
# Build a single random string made of n lowercase letters.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned; sep is dropped because only collapse matters when a single
# vector is pasted.
makestr <- function(n)
  paste(sample(letters, size=n, replace=TRUE), collapse='')

# Evaluate 'expr' and return the elapsed wall-clock time in seconds.
# The units are fixed explicitly: the difference of two Sys.time() values
# chooses its own units (secs, mins, hours, ...), so stripping it to a bare
# number would make a 17 second run and a 4.7 minute run incomparable.
timeit <- function( expr )
  {
    start <- Sys.time()
    expr   # force the lazily evaluated argument; this is the timed work
    end <- Sys.time()
    return( as.numeric(difftime(end, start, units='secs')) )
  }

# Step 1: create a large set of character strings
# (100000 random strings of 1000 lowercase letters each)
x <- unlist(lapply(1:100000, function(i) makestr(1000)))

# Step 2: Time convert  (~17 sec on Intel Xeon 3.0 GHz, 32 GB RAM)
# The assignment to yn happens inside the timed expression, so the
# converted matrix remains available afterwards.
newtime <- timeit( yn <- convert(x, letters) )
newtime

# old method  (~4.7 min on Intel Xeon 3.0 GHz, 32 GB RAM)
# Splits every string into single characters, codes them with factor, and
# reshapes the codes into a matrix with one column per string.  All strings
# are assumed to have the same length as the first one.
oldmethod <- function(x)
  {
    yo <- factor(unlist(strsplit(x, split='')),levels=letters)
    attr(yo,'dim') <- c(nchar(x[1]), length(x))
    class(yo) <- 'matrix'
    yo   # return the coded matrix rather than the value of the assignment
  }

# time the factor-based method on the same set of strings
oldtime <- timeit( oldmethod(x) )
oldtime

# time difference (old method minus convert)
oldtime - newtime
}

}
\keyword{misc}