\name{lproks} \alias{lproks} \docType{data} \title{ Microbial genome projects at NCBI} \description{ Microbial genomes from Entrez genome project at NCBI. } \usage{data(lproks)} \format{ A genomes data frame with observations on the following 31 variables. \describe{ \item{\code{pid}}{ genome project id} \item{\code{name}}{taxonomy name} \item{\code{status}}{ sequencing status, Complete, Assembly, or In Progress genomes} \item{\code{released}}{released date, complete and WGS genomes only} \item{\code{refseq_pid}}{ RefSeq project id} \item{\code{taxid}}{taxonomy id} \item{\code{kingdom}}{kingdom} \item{\code{group}}{phylum or class} \item{\code{size}}{genome size (Mbp)} \item{\code{GC}}{percent GC content} \item{\code{chromosomes}}{number of chromosomes, complete genomes only} \item{\code{plasmids}}{number of plasmids, complete genomes only} \item{\code{modified}}{modified date, complete genomes only} \item{\code{genbank}}{comma-separated list of GenBank accession numbers} \item{\code{refseq}}{comma-separated list of RefSeq accession numbers} \item{\code{publication}}{comma-separated list of PubMed ids, complete genomes only} \item{\code{center}}{pipe-separated list of sequencing centers} \item{\code{contigs}}{number of genome contigs. 
For complete genomes, contigs are the sum of chromosomes and plasmids } \item{\code{cds}}{ number of coding sequences, WGS only } \item{\code{url}}{sequencing center url, WGS and In Progress genomes only } \item{\code{gram}}{gram stain } \item{\code{shape}}{ shape } \item{\code{arrange}}{ arrangement} \item{\code{endospore}}{ endospores } \item{\code{motility}}{ motility} \item{\code{salinity}}{ salinity} \item{\code{oxygen}}{ oxygen requirement} \item{\code{habitat}}{ habitat} \item{\code{temp}}{ temperature preference } \item{\code{range}}{ temperature range} \item{\code{pathogen}}{ pathogenic in host } \item{\code{disease}}{ disease} } } \details{ This table is constructed using all three tabs at \url{http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi}. Complete genomes and In Progress tabs are combined and then joined to the Organism Info tab. A few manual updates were also added: 725 missing released dates from GenBank assemblies were added, 178 complete genomes with assembly released dates were corrected (see \code{\link{complete}}), and genome size outliers were removed. The \code{update(genomes)} function downloads a recent copy of the table from NCBI. The number of new project IDs is reported, as well as the number of project IDs removed (which are typically Assembly genomes that are now available as a Complete sequence). This table is no longer supported by NCBI. See \url{http://www.ncbi.nlm.nih.gov/About/news/17Nov2011.html} for details. 
}
\source{ downloaded from \url{http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi} }
% \references{}
\examples{
data(lproks)
lproks
#single row (long format)
t(lproks[1,])
class(lproks)
## download stats
attributes(lproks)[c("stats", "date","url")]
summary(lproks)
## check for missing release dates
table2(!is.na(lproks$released), lproks$status, dnn=list("Released Date?", "Status"))
plot(lproks)
plotby(lproks, log='y', las=1)
## download recent table from NCBI
\dontrun{update(lproks)}
## Yersinia genomes
yp <- subset(lproks, name \%like\% 'Yersinia*')
yp
summary(yp)
plotby(yp, labels=TRUE, cex=.7, lbty='n')
}
\keyword{datasets}