%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Do not modify this file since it was automatically generated from: % % weightedMedian.R % % by the Rdoc compiler part of the R.oo package. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \name{weightedMedian} \alias{weightedMedian.default} \alias{weightedMedian} \encoding{latin1} \title{Weighted Median Value} \usage{\method{weightedMedian}{default}(x, w, na.rm=NA, interpolate=is.null(ties), ties=NULL, method=c("quick", "shell"), ...)} \description{ Computes a weighted median of a numeric vector. } \arguments{ \item{x}{a \code{\link[base]{numeric}} \code{\link[base]{vector}} containing the values whose weighted median is to be computed.} \item{w}{a vector of weights the same length as \code{x} giving the weights to use for each element of \code{x}. Negative weights are treated as zero weights. Default value is equal weight to all values.} \item{na.rm}{a logical value indicating whether \code{\link[base]{NA}} values in \code{x} should be stripped before the computation proceeds, or not. If \code{\link[base]{NA}}, no check at all for \code{\link[base]{NA}}s is done. Default value is \code{\link[base]{NA}} (for effiency).} \item{interpolate}{If \code{\link[base:logical]{TRUE}}, linear interpolation is used to get a consistant estimate of the weighted median.} \item{ties}{If \code{interpolate == FALSE}, a character string specifying how to solve ties between two \code{x}'s that are satisfying the weighted median criteria. Note that at most two values can satisfy the criteria. When \code{ties} is \code{"min"}, the smaller value of the two is returned and when it is \code{"max"}, the larger value is returned. If \code{ties} is \code{"mean"}, the mean of the two values is returned and if it is \code{"both"}, both values are returned. Finally, if \code{ties} is \code{"weighted"} (or \code{\link[base]{NULL}}) a weighted average of the two are returned, where the weights are weights of all values \code{x[i] <= x[k]} and \code{x[i] >= x[k]}, respectively.} \item{method}{If \code{"shell"}, then \code{order()} is used and when \code{method="quick"}, then internal \code{qsort()} is used.} \item{...}{Not used.} } \value{ Returns the weighted median. } \details{ For the \code{n} elements \code{x = c(x[1], x[2], ..., x[n])} with positive weights \code{w = c(w[1], w[2], ..., w[n])} such that \code{sum(w) = S}, the \emph{weighted median} is defined as the element \code{x[k]} for which the total weight of all elements \code{x[i] < x[k]} is less or equal to \code{S/2} and for which the total weight of all elements \code{x[i] > x[k]} is less or equal to \code{S/2} (c.f. [1]). If \code{w} is missing then all elements of \code{x} are given the same positive weight. If all weights are zero, \code{NA} is returned. If one or more weights are \code{Inf}, it is the same as these weights have the same weight and the others has zero. This makes things easier for cases where the weights are result of a division with zero. In this case \code{median()} is used internally. When all the weights are the same (after values with weight zero are excluded and \code{Inf}'s are taken care of), \code{median} is used internally. The weighted median solves the following optimization problem: \deqn{\alpha^* = \arg_\alpha \min \sum_{k=1}{K} w_k |x_k-\alpha|} where \eqn{x=(x_1,x_2,\ldots,x_K)} are scalars and \eqn{w=(w_1,w_2,\ldots,w_K)} are the corresponding "weights" for each individual \eqn{x} value. } \section{Benchmarks}{ When implementing this function speed has been highly prioritized and it also making use of the internal quick sort algorithm (from \R v1.5.0). The result is that \code{weightedMedian(x)} is about half as slow as \code{median(x)}. It is hard to say how much since it depends on the data set, but it is also hard to time it exactly since internal garbage collector etc might mess up the measurements. Initial test also indicates that \code{method="shell"}, which uses \code{order()} is slower than \code{method="quick"}, which uses internal \code{qsort()}. Non-weighted median can use partial sorting which is faster because all values do not have to be sorted. See examples below for some simple benchmarking tests. } \examples{ x <- 1:10 n <- length(x) m1 <- median(x) # 5.5 m2 <- weightedMedian(x) # 5.5 stopifnot(identical(m1, m2)) w <- rep(1, n) m1 <- weightedMedian(x, w) # 5.5 (default) m2 <- weightedMedian(x, ties="weighted") # 5.5 (default) m3 <- weightedMedian(x, ties="min") # 5 m4 <- weightedMedian(x, ties="max") # 6 stopifnot(identical(m1,m2)) # Pull the median towards zero w[1] <- 5 m1 <- weightedMedian(x, w) # 3.5 y <- c(rep(0,w[1]), x[-1]) # Only possible for integer weights m2 <- median(y) # 3.5 stopifnot(identical(m1,m2)) # Put even more weight on the zero w[1] <- 8.5 weightedMedian(x, w) # 2 # All weight on the first value w[1] <- Inf weightedMedian(x, w) # 1 # All weight on the last value w[1] <- 1 w[n] <- Inf weightedMedian(x, w) # 10 # All weights set to zero w <- rep(0, n) weightedMedian(x, w) # NA # Simple benchmarking bench <- function(N=1e5, K=10) { x <- rnorm(N) t <- c() gc() t[1] <- system.time(for (k in 1:K) median(x))[3] gc() t[2] <- system.time(for (k in 1:K) weightedMedian(x, method="quick"))[3] gc() t[3] <- system.time(for (k in 1:K) weightedMedian(x, method="shell"))[3] t <- t / t[1] t[4] <- t[2]/t[3] names(t) <- c("median", "wMed-quick", "wMed-shell", "quick/shell") t } print(bench(N= 5, K=1000)) print(bench(N=100, K=1000)) print(bench(N=1e3, K=100)) print(bench(N=1e5, K=10)) print(bench(N=1e6, K=1)) } \seealso{ \code{\link[stats]{median}}, \code{\link[base]{mean}}() and \code{\link[stats]{weighted.mean}}. } \references{ [1] T.H. Cormen, C.E. Leiserson, R.L. Rivest, Introduction to Algorithms, The MIT Press, Massachusetts Institute of Technology, 1989. } \author{ Henrik Bengtsson and Ola \enc{Hössjer}{Hossjer}, Centre for Mathematical Sciences, Lund University. Thanks to Roger Koenker, Econometrics, University of Illinois, for the initial ideas. } \keyword{univar} \keyword{robust}