\name{lproks} \alias{lproks} \docType{data} \title{ Microbial genome projects at NCBI} \description{ Microbial genomes from Entrez genome project at NCBI. } \usage{data(lproks)} \format{ A genomes data frame with observations on the following 31 variables. \describe{ \item{\code{pid}}{ genome project id} \item{\code{name}}{taxonomy name} \item{\code{status}}{ sequencing status, Complete, Assembly, or In Progress genomes} \item{\code{released}}{release date, complete and WGS genomes only} \item{\code{taxid}}{taxonomy id} \item{\code{kingdom}}{kingdom} \item{\code{group}}{phylum or class} \item{\code{size}}{genome size (Mbp)} \item{\code{GC}}{percent GC content} \item{\code{chromosomes}}{number of chromosomes, complete genomes only} \item{\code{plasmids}}{number of plasmids, complete genomes only} \item{\code{modified}}{modified date, complete genomes only} \item{\code{genbank}}{comma-separated list of GenBank accession numbers} \item{\code{refseq}}{comma-separated list of RefSeq accession numbers} \item{\code{publication}}{comma-separated list of PubMed ids, complete genomes only} \item{\code{center}}{pipe-separated list of sequencing centers} \item{\code{contigs}}{number of genome contigs. For complete genomes, contigs are the sum of chromosomes and plasmids } \item{\code{cds}}{ number of coding sequences, WGS only } \item{\code{url}}{sequencing center url, WGS and In Progress genomes only } \item{\code{gram}}{gram stain } \item{\code{shape}}{ shape } \item{\code{arrange}}{ arrangement} \item{\code{endospore}}{ endospores } \item{\code{motility}}{ motility} \item{\code{salinity}}{ salinity} \item{\code{oxygen}}{ oxygen requirement} \item{\code{habitat}}{ habitat} \item{\code{temp}}{ temperature preference } \item{\code{range}}{ temperature range} \item{\code{pathogen}}{ pathogenic in host } \item{\code{disease}}{ disease} } } \details{ This table is constructed using all three tabs at \url{http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi}. 
Complete genomes and In Progress tabs are combined and then joined to the Organism Info tab. Nearly all of the Assembly genomes released after Sept 2009 are missing release dates. If needed, these dates are available from Entrez Genomes (see the example in \code{\link{wgs}}). The \code{update(genomes)} function downloads a recent copy of the table from NCBI. The number of new project IDs is reported as well as the number of project IDs removed (which are typically Assembly genomes that are now available as a Complete sequence). Please note that NCBI is currently changing how prokaryotic genomes are managed and some changes to these tables are possible (see \url{http://www.ncbi.nlm.nih.gov/genomeprj} for details). } \source{ downloaded from \url{http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi} } % \references{} \examples{ data(lproks) lproks #single row (long format) t(lproks[1,]) class(lproks) ## download stats attributes(lproks)[c("stats", "date","url")] summary(lproks) ## many Assembly genomes are now missing release dates table2(!is.na(lproks$released), lproks$status, dnn=list("Released Date?", "Status")) plot(lproks) plotby(lproks, log='y', las=1) ## download recent table from NCBI \dontrun{update(lproks)} ## Yersinia genomes yp <- subset(lproks, name \%like\% 'Yersinia*') yp summary(yp) plotby(yp, labels=TRUE, cex=.5, lbty='n') } \keyword{datasets}