## ----include = FALSE------------------------------------------------------------------------------ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ------------------------------------------------------------------------------------------------- system.file("extdata/proteinGroups.txt", package = "proDA", mustWork = TRUE) ## ------------------------------------------------------------------------------------------------- full_data <- read.delim( system.file("extdata/proteinGroups.txt", package = "proDA", mustWork = TRUE), stringsAsFactors = FALSE ) head(colnames(full_data)) ## ------------------------------------------------------------------------------------------------- # I use a regular expression (regex) to select the intensity columns intensity_colnames <- grep("^LFQ\\.intensity\\.", colnames(full_data), value=TRUE) # Create matrix which only contains the intensity columns data <- as.matrix(full_data[, intensity_colnames]) colnames(data) <- sub("^LFQ\\.intensity\\.", "", intensity_colnames) # Code missing values explicitly as NA data[data == 0] <- NA # log transformation to account for mean-variance relation data <- log2(data) # Overview of data data[1:7, 1:6] # Set rownames after showing data, because they are so long rownames(data) <- full_data$Protein.IDs ## ------------------------------------------------------------------------------------------------- annotation_df <- data.frame( Condition = sub("\\.\\d+", "", sub("^LFQ\\.intensity\\.", "", intensity_colnames)), Replicate = as.numeric(sub("^LFQ\\.intensity\\.[[:alnum:]]+\\.", "", intensity_colnames)), stringsAsFactors = FALSE, row.names = colnames(data) ) head(annotation_df) ## ----eval=FALSE, include=TRUE--------------------------------------------------------------------- # # Not Run # library(proDA) # fit <- proDA(data, design= annotation_df$Condition, col_data = annotation_df) # test_diff(fit, contrast = CG1407 - S2R) # # End Not Run ## ----include=FALSE-------------------------------------------------------------------------------- library(SummarizedExperiment) library(MSnbase) ## ------------------------------------------------------------------------------------------------- library(SummarizedExperiment) se <- SummarizedExperiment(SimpleList(LFQ=data), colData=annotation_df) rowData(se) <- full_data[, c("Only.identified.by.site", "Reverse", "Potential.contaminant")] se ## ------------------------------------------------------------------------------------------------- library(MSnbase) fData <- AnnotatedDataFrame(full_data[, c("Only.identified.by.site", "Reverse", "Potential.contaminant")]) rownames(fData) <- rownames(data) ms <- MSnSet(data, pData=AnnotatedDataFrame(annotation_df), fData=fData) ms ## ----eval=FALSE, include=TRUE--------------------------------------------------------------------- # # Not Run # library(proDA) # fit <- proDA(se, design = ~ Condition - 1) # test_diff(fit, contrast = ConditionCG1407 - ConditionS2R) # # End Not Run ## ----include=FALSE-------------------------------------------------------------------------------- library(dplyr) library(stringr) library(readr) library(tidyr) library(tibble) ## ------------------------------------------------------------------------------------------------- library(dplyr) library(stringr) library(readr) library(tidyr) library(tibble) # Or short # library(tidyverse) ## ------------------------------------------------------------------------------------------------- # The read_tsv function works faster and more reliable than read.delim # But it sometimes needs help to identify the right type for each column, # because it looks only at the first 1,000 elements. # Here, I explicitly define the `Reverse` column as a character column full_data <- read_tsv( system.file("extdata/proteinGroups.txt", package = "proDA", mustWork = TRUE), col_types = cols(Reverse = col_character()) ) full_data ## ------------------------------------------------------------------------------------------------- # I explicitly call `dplyr::select()` because there is a naming conflict # between the tidyverse and BioConductor packages for `select()` function tidy_data <- full_data %>% dplyr::select(ProteinID=Protein.IDs, starts_with("LFQ.intensity.")) %>% gather(Sample, Intensity, starts_with("LFQ.intensity.")) %>% mutate(Condition = str_match(Sample, "LFQ\\.intensity\\.([[:alnum:]]+)\\.\\d+")[,2]) %>% mutate(Replicate = as.numeric(str_match(Sample, "LFQ\\.intensity\\.[[:alnum:]]+\\.(\\d+)")[,2])) %>% mutate(SampleName = paste0(Condition, ".", Replicate)) tidy_data ## ------------------------------------------------------------------------------------------------- data <- tidy_data %>% mutate(Intensity = ifelse(Intensity == 0, NA, log2(Intensity))) %>% dplyr::select(ProteinID, SampleName, Intensity) %>% spread(SampleName, Intensity) %>% column_to_rownames("ProteinID") %>% as.matrix() data[1:4, 1:7] annotation_df <- tidy_data %>% dplyr::select(SampleName, Condition, Replicate) %>% distinct() %>% arrange(Condition, Replicate) %>% as.data.frame() %>% column_to_rownames("SampleName") annotation_df ## ------------------------------------------------------------------------------------------------- library(SummarizedExperiment) se <- SummarizedExperiment(SimpleList(LFQ=data), colData=annotation_df) rowData(se) <- full_data[, c("Only.identified.by.site", "Reverse", "Potential.contaminant")] se ## ------------------------------------------------------------------------------------------------- library(MSnbase) fData <- AnnotatedDataFrame(full_data[, c("Only.identified.by.site", "Reverse", "Potential.contaminant")]) rownames(fData) <- rownames(data) ms <- MSnSet(data, pData=AnnotatedDataFrame(annotation_df), fData=fData) ms ## ----eval=FALSE, include=TRUE--------------------------------------------------------------------- # # Not Run # library(proDA) # fit <- proDA(se, design = ~ Condition - 1) # test_diff(fit, contrast = ConditionCG1407 - ConditionS2R) # # End Not Run ## ------------------------------------------------------------------------------------------------- sessionInfo()