## ----echo=FALSE------------------------------------------------------------
library(specL)

exampledata <- ms1.p2069


## --------------------------------------------------------------------------

# moves the windows start and end to regions where no peaks are observed
.makenewfromto <- function( windfrom, empty , isfrom=TRUE){
  newfrom <- NULL
  for(from in windfrom){
    idx <- which.min(abs(from - empty))
    startmass <- 0
    if(isfrom){
      if(empty[idx] < from) {
        startmass <- empty[idx]
      } else {
        startmass <- empty[idx-1]
      }
    }else{
      if(empty[idx] > from) {
        startmass <- empty[idx]
      } else {
        startmass <- empty[idx+1]
      }
    }
    newfrom <- c(newfrom, round(startmass,digits=1))
  }
  return(newfrom)
}

.cdsw_compute_breaks <- 
  function(xx, nbins){
    q <- quantile(xx, seq(0, 1, length = nbins + 1))
    q[1] <- q[1] - 0.5
    q[length(q)] <- q[length(q)] + 0.5
    q <- round(q)
  }


cdsw <-
  function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) {
    if (class(x) == "psmSet") {
      x <- unlist(lapply(x, function(x) {
        x$pepmass
      }))
    } else if (class(x) == 'specLSet') {
      x <- unlist(lapply(x@ionlibrary, function(xx) {
        xx@q1
      }))
    }
    # x should be numeric
    if (class(x) != "numeric") {
      warning("can not compute quantils. 'x' is not numeric.")
      return (NULL)
    }
    
    x <- x[x > massrange[1] & x < massrange[2]]
    
    q <- FUN(xx=x, nbins=n)
    
    idx <- 1:n
    from <- q[idx] - overlap * 0.5
    to <- q[idx + 1] + overlap * 0.5
    width <- 0.5 * (to - from)
    mid <- from + width
    h <- hist(x, breaks = q, ...)
    data.frame(from, to, mid, width, counts = h$counts)
    
    
  }

## ----fig.retina=3, warning=FALSE-------------------------------------------


cdsw(exampledata, 
     freq=TRUE, 
     overlap = 0, 
     main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)

## --------------------------------------------------------------------------

.cdsw_objective <- function(splits, data){
  counts <- hist(data, breaks=splits,plot=FALSE)$counts
  nbins<-length(splits)-1
  optimumN <- length(data)/(length(splits)-1)
  optimumN<-rep(optimumN,nbins)
  
  score2 <-sqrt(sum((counts - optimumN)^2))
  score1 <- sum(abs(counts - round(optimumN)))
  return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN)))
}

.cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){
  
  difsp<-diff(splits)
  return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp))
}


## --------------------------------------------------------------------------



.cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){
  breaks <- nbins+1
  #xx <- x
  
  #xx<-xx[xx >=310 & xx<1250]
  # TODO(wew): there is something insconsitent with the nbins parameter
  qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins)))
  
  plot(1:breaks, qqs, type="b" , 
       sub=".cdsw_compute_sampling_breaks")
  legend("topleft", legend = c(paste("maxwindow = ", maxwindow),
                               paste("nbins = ", breaks) ))
  
  
  # equidistant spaced bins
  unif <- seq(min(xx),max(xx),length=(breaks))
  lines(1:breaks,unif,col=2,type="b")
  
  if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){
    warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n")
  }else{
    .cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)
  }
  
  mixeddata <- xx
  it_count <- 0
  error <- 0
  
  while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){
    it_count <- it_count + 1
    uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx))
    mixeddata<-c(mixeddata,uniformdata)
    
    qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins)))
    lines(1:breaks,qqs,type="l", col="#00DD00AA")
    error[it_count] <-.cdsw_objective(qqs, xx)$score1
    
  }
  
  lines(1:breaks,qqs,type="b", col="#FF1111AA")
  plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks")
  
  
  qqs <- as.numeric(sort(round(qqs)))
  qqs[1]  <- qqs[1] - 0.5
  qqs[length(qqs)]  <- qqs[length(qqs)] + 0.5
  
  round(qqs, 1)
}

## ----fig.height=8, fig.width=8, fig.retina=3-------------------------------

op <- par(mfrow=c(2,2))
par(mfrow=c(3,1))
wind <- cdsw(exampledata, 
             freq=TRUE,
             plot=TRUE,
             overlap = 0, 
             n=35,
             massrange = c(350,1250),
             sub='sampling based',
             main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)})

readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){
  res <- hist(ms1data, breaks=breaks)
  abline(v=wind$from,col=2,lty=2)
  abline(v=wind$to,col=3,lty=2)
  
  empty <- res$mids[which(res$counts < maxbin )]
  newfrom <- .makenewfromto(wind$from , empty)
  newto <- .makenewfromto(wind$to , empty , isfrom=FALSE )
  
  plot(res,xlim=c(1060,1065))
  abline(v = newfrom,lwd=0.5,col="red")
  abline(v = newto , lwd=0.5,col="green")
  plot(res,xlim=c(520,550))
  abline(v = newfrom,lwd=0.5,col="red")
  abline(v = newto , lwd=0.5,col="green")
  
  width <- (newto - newfrom) * 0.5
  mid <- (newfrom + newto)*0.5
  newCounts <- NULL
  for(i in 1:length(newfrom))
  {
    newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i]))
  }
  data.frame(newfrom, newto, mid, width, counts =newCounts)

}
readjustWindows(wind,exampledata)


## ----fig.height=8, fig.width=8, fig.retina=3-------------------------------

cdsw(exampledata, 
     freq=TRUE,
     plot=TRUE,
     n=35,
     overlap = 0, 
     sub='quantile based',
     main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)


## ----fig.height=10, fig.width=8, fig.retina=3, warning=FALSE---------------
op <-par(mfrow=c(4,3))
res <- lapply(c(75,150,300, 800), function(mws){
  cdsw(exampledata, 
       freq=TRUE,
       plot=TRUE,
       overlap = 0, 
       main = paste("max window size", mws), xlab='pepmass', 
       FUN=function(...){
         .cdsw_compute_sampling_breaks(...,maxwindow = mws)
       })
})

## ----fig.height=10, fig.width=8, fig.retina=3, warning=FALSE---------------
op <-par(mfrow=c(4,3))
res <- lapply(c(20,25,30, 40), function(nbins){
  cdsw(exampledata, 
       freq=TRUE,
       plot=TRUE,
       n=nbins,
       overlap = 0, 
       main = paste("nr bins", nbins), xlab='pepmass', 
       FUN=function(...){
         .cdsw_compute_sampling_breaks(...,maxwindow = 100)
       })
})

## ----sessionInfo, echo=FALSE-----------------------------------------------
sessionInfo()