# routines for quantization and one-dimensional clustering
# Tom Minka 11/30/01

eps <- 1e-15

# returns the elements of x not indexed by i
not <- function(x,i) {
  if(is.numeric(i)) return(x[-i])
  if(is.character(i)) return(x[setdiff(names(x),i)])
  if(is.logical(i)) return(x[!i])
}

##############################################################################

ncut <- function(x,b) {
  cut(x,b,labels=FALSE,include.lowest=TRUE)
}

# reduce data to n points, by local averaging
# use quantiles so size of each bin is equal
squash <- function(x,n=1000) {
  b <- floor(length(x)/n)
  rb <- rep(b,n)
  # spread the overflow broadly over the range
  overflow <- length(x) - n*b
  rb <- rb + c(0,diff(trunc(overflow/n*(1:n))))
  # f is 1 1 1 2 2 2 3 3 3 ...
  f <- rep(1:n, rb)
  f <- f[1:length(x)]
  x <- sort(x)
  tapply(x,f,mean)
}

plot.breaks <- function(x,b,scale=1) {
  hb <- break.equal(x,40)
  h <- hist(x,hb,freq=FALSE,col="bisque",main="",xlab="")
  top <- max(h$density)
  par(lwd=2)
  segments(b,rep(0,length(b)), b,rep(top*scale,length(b)), col="blue")
  par(lwd=1)
}

plot.ts.breaks <- function(x,b,scale=1) {
  plot(x,type="l")
  r <- range(x)*scale
  par(lwd=2)
  segments(b,rep(r[1],length(b)), b,rep(r[2],length(b)), col="blue")
  par(lwd=1)
}

plot.hclust.breaks <- function(hc,x,n=2:5) {
  hb <- break.equal(x,40)
  h <- hist(x,hb,freq=FALSE,col="bisque",main="",xlab="")
  top <- max(h$density)
  par(lwd=3)
  for(i in 1:length(n)) {
    q <- cutree(hc,n[i])
    b <- break.cut(x,q)
    scale <- (length(n)-i+1)/length(n)
    segments(b,rep(0,length(b)), b,rep(top*scale,length(b)), col="blue")
  }
  par(lwd=1)
}

# evenly-spaced breaks
break.equal <- function(x,n=2) {
  r <- range(x)
  seq(0,1,by=1/n)*diff(r) + r[1]
}

# this is "equal-frequency" binning
break.quantile <- function(x,n=2,plot=TRUE) {
  b <- as.numeric(quantile(x, seq(0,1,by=1/n)))
  if(plot) plot.breaks(x,b)
  b
}

# convert a vector of cluster clusters into a break vector
break.centers <- function(x,m) {
  m <- sort(m)
  n <- length(m)
  b <- (m[1:n-1]+m[2:n])/2
  r <- range(x)
  c(r[1]-eps,b,r[2])
}

library(mva)
break.kmeans <- function(x,n=2,plot=T) {
  km <- kmeans(x,n)
  m <- as.numeric(km$centers)
  ss <- sum(km$withinss)
  if(plot) cat("sum of squares =", format(ss), "\n")
  b <- break.centers(x,m)
  if(plot) plot.breaks(x,b)
  b
}

# divide the range of x into n pieces, based on the largest spaces
# returns a vector of breaks, suitable for input to "hist" or "cut"
# tends to overdivide low-density regions
break.diff <- function(x,n=2) {
  n <- n-1
  len <- length(x)
  x <- sort(x)
  diff <- x[2:len] - x[1:len-1]
  i <- order(-diff)
  mid <- (x[2:len] + x[1:len-1])/2
  b <- sort(mid[i[1:n]])

  split.screen(c(2,1))
  split.screen(c(1,2),screen=1)
  screen(4)
  par(mar=c(2,4,1,1))
  last <- n+10
  plot(diff[i[1:last]],ylab="diff")
  m <- (diff[i[n]] + diff[i[n+1]])/2
  #arrows(1,m, last,m, col="black",code=0,lty=2)
  segments(n+0.5, diff[i[last]], n+0.5, diff[i[1]], col="black",lty=2)

  screen(2)
  par(mar=c(2,4,1,1))
  plot(x)
  segments(0,b, len+1,b, col="blue",lty=2)

  screen(3)
  par(mar=c(2,4,1,1))
  hist(x,30,col="bisque",main="")
  segments(b,rep(0,n), b,rep(100,n), col="blue")
  close.screen(all = TRUE)

  c(x[1]-eps,b,x[len])
}

# convert a cut of x into breaks
# f is a factor which divides x into disjoint groups
break.cut <- function(x,f) {
  low <- sort(as.numeric(tapply(x,f,min)))
  high <- sort(as.numeric(tapply(x,f,max)))
  n <- length(high)
  b <- (low[2:n]+high[1:n-1])/2
  c(low[1]-eps,b,high[n])
}

scatter <- function(x) {
  sum((x - mean(x))^2)
}

# "ward" is best
# "ave","centroid" are good
# "single", "complete","median","mcq" are bad
break.hclust2 <- function(x,n=2,method="ward") {
  h <- hclust(dist(x)^2,method)
  q <- cutree(h,n)
  ss <- sum(tapply(x,q,scatter))
  cat("sum of squares =", format(ss), "\n")
  b <- break.cut(x,q)

  split.screen(c(2,1))
  plot.hclust.trace(h)
  screen(2)
  #plot.breaks(x,b)
  plot.hclust.breaks(h,x,n=2:n)
  close.screen(all=TRUE)
  b
}

# Create a hierarchy by Ward's method
# Merge clusters to minimize the sum of squares criterion
# x is a vector (one-dimensional clustering only)
# if x is a vector of batch means, n is the size of each batch
# and s is the sum of squares in each batch (only needed if same.var=F)
# if same.var=F, the adjacency constraint may give suboptimal results
sstree <- function(x,n=rep(1,length(x)),s=rep(1,length(x)),
                   sortx=TRUE,same.var=T) {
  # don't sort and it becomes time-series clustering
  if(sortx) {
    ord <- order(x)
    x <- sort(x)
  } else {
    ord <- 1:length(x)
  }
  height <- c()
  label <- -(1:length(x))
  label <- label[ord]
  merge <- matrix(0,length(x)-1,2)
  if(length(x) == 1) ks <- c()
  else ks <- 1:(length(x)-1)
  for(k in ks) {
    len <- length(x)
    # here we exploit the fact that the data is one-dimensional.
    # only clusters which are adjacent in sorted order need be considered
    # for merging.
    x1 <- x[1:len-1]
    x2 <- x[2:len]
    n1 <- n[1:len-1]
    n2 <- n[2:len]
    # merging cost
    cost <- n1*n2/(n1+n2)*(x1-x2)^2
    if(!same.var) {
      s1 <- s[1:len-1]
      s2 <- s[2:len]
      cost <- (n1+n2)*log((s1+s2+cost)/(n1+n2)) - n1*log(s1/n1) - n2*log(s2/n2)
    }
    j <- which.min(cost)

    if(!same.var) {
      s[j] <- s[j] + s[j+1] + 
              n[j]*n[j+1]/(n[j]+n[j+1])*(x[j]-x[j+1])^2
      s <- not(s,j+1)
    }

    x[j] <- (x[j]*n[j] + x[j+1]*n[j+1])/(n[j] + n[j+1])
    n[j] <- n[j] + n[j+1]
    x <- not(x,j+1)
    n <- not(n,j+1)

    height[k] <- min(cost)
    merge[k,1] <- label[j]
    merge[k,2] <- label[j+1]
    label[j] <- k
    label <- not(label,j+1)
  }
  hc <- list(merge=merge,height=cumsum(height),order=ord,labels=NULL,
             method="ward")
  class(hc) <- "hclust"
  hc
}

plot.hclust.trace <- function(h,k=1:10) {
  g <- rev(h$height)
  k <- k[k < length(g)]
  #g <- sqrt(g)
  if(h$method != "single") {
    g <- -diff(g)
  }
  plot(k,g[k],ylab="merging cost",xlab="clusters")
}

break.hclust <- function(x,n=2,plot=T) {
  h <- sstree(x)
  q <- cutree(h,n)
  ss <- sum(tapply(x,q,scatter))
  if(plot) cat("sum of squares =", format(ss), "\n")
  b <- break.cut(x,q)

  if(plot) {
    split.screen(c(1,2))
    plot.hclust.trace(h)
    screen(2)
    #plot.breaks(x,b)
    plot.hclust.breaks(h,x,n=2:n)
    close.screen(all=TRUE)
  }
  b
}

# convert a cut of x into a changepoint plot
# f is a factor which divides x into disjoint groups
plot.chpt <- function(x,f) {
  plot(x,type="l",xlab="time",ylab="level")
  i <- 1:length(x)
  m <- tapply(x,f,mean)
  low <- tapply(i,f,min)
  high <- tapply(i,f,max)
  segments(low, m, high, m, col="blue")
}

# Apply Ward's method to find changepoints in a time-series
# x is a vector
break.chpt <- function(x,n=2,trace=TRUE,same.var=T) {
  h <- sstree(x,sortx=FALSE,same.var=same.var)
  q <- cutree(h,n)
  ss <- sum(tapply(x,q,scatter))
  cat("sum of squares =", format(ss), "\n")

  if(length(h$height) < 2) trace<-F
  if(trace) {
    split.screen(c(2,1))
    plot.hclust.trace(h)
    screen(2)
  }
  plot.chpt(x,q)
  points(x)
  if(trace) close.screen(all=TRUE)
  c(0,which(diff(q)>0),length(x)+1)
}