## ----include=FALSE-------------------------------------------------------
library(knitr)
opts_chunk$set(
concordance=TRUE,
cache=TRUE
)

## ----results='hide'------------------------------------------------------
# creation of a directory for LEA analyses
dir.create("LEA_analyses")
# set the created directory as the working directory
setwd("LEA_analyses")

## ----results='hide'------------------------------------------------------
library(LEA)
# Creation a the genotypic file: "genotypes.lfmm"
# The data include 400 SNPs for 50 individuals.
data("tutorial")
# Write genotypes in the lfmm format
write.lfmm(tutorial.R, "genotypes.lfmm")
# Write genotypes in the geno format
write.geno(tutorial.R, "genotypes.geno")
# creation of an environment gradient file: gradient.env.
# The .env file contains a single ecological variable
# for each individual.
write.env(tutorial.C, "gradients.env")

## ----results='hide'------------------------------------------------------
# run of pca
# Available options, K (the number of PCs), 
#                    center and scale. 
# Create files: genotypes.eigenvalues - eigenvalues,        
#               genotypes.eigenvectors - eigenvectors,
#               genotypes.sdev - standard deviations,
#               genotypes.projections - projections,
# Create a pcaProject object: pc.
pc = pca("genotypes.lfmm", scale = TRUE)

## ----results='hide'------------------------------------------------------
# Perfom Tracy-Widom tests on all eigenvalues.
# create file: tuto.tracyWidom - tracy-widom test information.  
tw = tracy.widom(pc)

## ----results='asis'------------------------------------------------------
# display p-values for the Tracy-Widom tests (first 5 pcs). 
tw$pvalues[1:5]

## ----fig.width=4, fig.height=4, echo=TRUE--------------------------------
# plot the percentage of variance explained by each component
plot(tw$percentage)

## ----results='hide'------------------------------------------------------
# main options
# K = number of ancestral populations
# entropy = TRUE: computes the cross-entropy criterion, 
# CPU = 4 the number of CPUs.
project = NULL
project = snmf("genotypes.geno",
               K = 1:10, 
               entropy = TRUE, 
               repetitions = 10,
               project = "new")

## ----fig.width=4, fig.height=4, echo=TRUE--------------------------------
# plot cross-entropy criterion for all runs in the snmf project
plot(project, col = "blue", pch = 19, cex = 1.2)

## ----fig.width=10, fig.height=4, echo=TRUE-------------------------------
# select the best run for K = 4
best = which.min(cross.entropy(project, K = 4))
my.colors <- c("tomato", "lightblue", 
               "olivedrab", "gold")
barchart(project, K = 4, run = best,
        border = NA, space = 0, 
        col = my.colors, 
        xlab = "Individuals",
        ylab = "Ancestry proportions",
        main = "Ancestry matrix") -> bp
axis(1, at = 1:length(bp$order), 
     labels = bp$order, las=1, 
     cex.axis = .3)

## ----fig.width=8, fig.height=6, echo=TRUE, results='hide'----------------
# Population differentiation tests
p = snmf.pvalues(project, 
                 entropy = TRUE, 
                 ploidy = 2, 
                 K = 4)
pvalues = p$pvalues
par(mfrow = c(2,1))
hist(pvalues, col = "orange")
plot(-log10(pvalues), pch = 19, col = "blue", cex = .7)

## ------------------------------------------------------------------------
# creation of a genotypic matrix  with missing genotypes
dat = as.numeric(tutorial.R)
dat[sample(1:length(dat), 100)] <-  9
dat <- matrix(dat, nrow = 50, ncol = 400)
write.lfmm(dat, "genotypeM.lfmm")

## ----results='hide'------------------------------------------------------
project.missing = snmf("genotypeM.lfmm", K = 4, 
        entropy = TRUE, repetitions = 10,
        project = "new")

## ------------------------------------------------------------------------
# select the run with the lowest cross-entropy value
best = which.min(cross.entropy(project.missing, K = 4))

# Impute the missing genotypes
impute(project.missing, "genotypeM.lfmm", 
       method = 'mode', K = 4, run = best)

# Proportion of correct imputation results
dat.imp = read.lfmm("genotypeM.lfmm_imputed.lfmm")
mean( tutorial.R[dat == 9] == dat.imp[dat == 9] )

## ----results='hide'------------------------------------------------------
# main options: 
# K the number of latent factors
# Runs with K = 6 and 5 repetitions.
project = NULL
project = lfmm("genotypes.lfmm", 
               "gradients.env", 
                K = 6, 
                repetitions = 5, 
                project = "new")

## ------------------------------------------------------------------------
# compute adjusted p-values
p = lfmm.pvalues(project, K = 6)
pvalues = p$pvalues

## ----fig.width=8, fig.height=6, echo=TRUE--------------------------------
# GWAS significance test
par(mfrow = c(2,1))
hist(pvalues, col = "lightblue")
plot(-log10(pvalues), pch = 19, col = "blue", cex = .7)

## ------------------------------------------------------------------------
for (alpha in c(.05,.1,.15,.2)) {
    # expected FDR
    print(paste("Expected FDR:", alpha))
    L = length(pvalues)
 
    # return a list of candidates with expected FDR alpha.
    # Benjamini-Hochberg's algorithm:
    w = which(sort(pvalues) < alpha * (1:L) / L)
    candidates = order(pvalues)[w]

    # estimated FDR and True Positive Rate
    Lc = length(candidates)
    estimated.FDR = sum(candidates <= 350)/Lc
    print(paste("Observed FDR:", 
                round(estimated.FDR, digits = 2)))    
    estimated.TPR = sum(candidates > 350)/50
    print(paste("Estimated TPR:", 
                round(estimated.TPR, digits = 2)))  
}

## ----echo=FALSE, results='hide'------------------------------------------
# Copy of the pdf figures in the previous directory 
# for the creation of the vignette.
file.copy(list.files(".", pattern = ".pdf"), "..")