# Functions for lab10 - maps # by Tom Minka # MINKA=~/R/minka/R # cat lab10e.r $MINKA/*.r > ../www/lab/lab10/lab10.r library(maps,lib.loc=getwd()) formals(map)$wrap = T par(mar = c(0,0,0,0.1)) # Routines for controlling aspect ratio # taken from library(lattice) banking <- function (dx, dy) { if (is.na(dx)) NA else { if (is.list(dx)) { dy <- dx[[2]] dx <- dx[[1]] } if (length(dx) != length(dy)) stop("Non matching lengths") id <- dx != 0 & dy != 0 if (any(id)) { r <- abs(dx[id]/dy[id]) median(r) } else 1 } } # from Splus banking <- function(dx, dy, iter = 100, tolerance = 0.5) { dx <- abs(dx) dy <- abs(dy) if(sum(is.na(dx)) != 0) { warning("infinite in x, no banking") dx[is.na(dx)] <- 0 } if(sum(is.na(dy)) != 0) { warning("infinity in y, no banking") dy[is.na(dy)] <- 0 } if(sum(dx) == 0 || sum(dy) == 0) return(NA) good <- dx > 0 dx <- dx[good] dy <- dy[good] # minka: fixed bug in original (dy/dx) mad <- median(dx/dy) # initial guess ar <- if(mad > 0) mad else 1 radians.to.angle <- 180/pi for(i in 1:iter) { distances <- sqrt(dx^2 + (ar * dy)^2) orientations <- atan(ar * dy, dx) avg <- (radians.to.angle * sum(orientations * distances))/ sum(distances) if(abs(45 - avg) < tolerance) break ar <- ar * (45/avg) } ar } banking.fcn <- function(dx,dy,asp) { # want result to be 45 i = (dx != 0) & (dy != 0) dx = dx[i] dy = dy[i] len = sqrt(dx^2 + (asp*dy)^2) radians.to.angle <- 180/pi orient = abs(atan(asp*dy,dx)*radians.to.angle) #hist(orient) sum(orient*len)/sum(len) } # this function is broken auto.aspect <- function(x,y) { # returns an aspect ratio in user coordinates, suitable for asp= # x and y are vectors # NA specifies a break in the curve, just as in lines() if(is.list(x)) { y = x[[2]] x = x[[1]] } dx = diff(x); dy = diff(y) i <- !is.na(dx) & !is.na(dy) dx <- dx[i]; dy <- dy[i] asp = banking(dx,dy) #print(banking.fcn(dx,dy,asp)) asp } # this version doesn't preserve margins properly set.aspect <- function(aspect,margins=T) { aspect <- c(1, aspect) din <- par("din") pin <- aspect*min(din/aspect) if(margins) { # mai is (bottom,left,top,right) m <- din-pin mai <- c(sum(par("mai")[c(2,4)]), sum(par("mai")[c(1,3)])) pin <- din - pmax(m, mai) } par(pin=pin) } reset.aspect <- function() { mai <- c(sum(par("mai")[c(2,4)]), sum(par("mai")[c(1,3)])) par(pin=par("din")-mai) } # Sets graphics parameters so the next plot has aspect ratio asp. # It is okay to run set.aspect repeatedly without resetting the graphics # parameters. # # To reset: # reset.aspect() # or: # opar <- set.aspect(a); par(opar) # # In R, it is also possible to force a given aspect ratio using # layout(1,width=1,height=aspect,respect=T) # and reset with # layout(1) # However, the aspect ratio it gives is not quite correct. # # Test: # set.aspect(2); plot(runif(100)); reset.aspect() # layout(1,width=1,height=2,respect=T); plot(runif(100)); layout(1) # set.aspect <- function(aspect,margins=T) { fin <- par("fin") a <- aspect*fin[1]/fin[2] # plt is (left,right,bottom,top) if(a > 1) plt <- c(0.5-1/a/2,0.5+1/a/2,0,1) else plt <- c(0,1,0.5-a/2,0.5+a/2) if(margins) { # add room for margins # mai is (bottom,left,top,right) mplt <- par("mai")[c(2,4,1,3)]/c(fin[1],fin[1],fin[2],fin[2]) mplt[c(2,4)] <- 1-mplt[c(2,4)] # compose plt and mplt plt[1:2] <- mplt[1] + (mplt[2]-mplt[1])*plt[1:2] plt[3:4] <- mplt[3] + (mplt[4]-mplt[3])*plt[3:4] } par(plt=plt) } reset.aspect <- function() { if(F) { fin <- par("fin") mplt <- par("mai")[c(2,4,1,3)]/c(fin[1],fin[1],fin[2],fin[2]) mplt[c(2,4)] <- 1-mplt[c(2,4)] par(plt=mplt) } else { # simpler way par(mar=par("mar")) } } # histograms with confidence intervals and bin merging # Tom Minka 9/19/01 bhist.stats <- function(x,b) { if(length(b)==1) { r <- range(x,na.rm=T) b <- seq(0,1,by=1/b)*diff(r) + r[1] } if(is.null(b)) { h <- hist(x, probability=TRUE, plot=FALSE) } else { h <- hist(x, probability=TRUE, plot=FALSE, breaks=b) #print("Using given breaks") } b <- h$breaks len <- length(b) wx <- b[2:len] - b[1:len-1] xm <- h$mids nx <- h$counts + wx/sum(wx)*8 n <- sum(nx) p <- nx/n list(p=p,n=n,nx=nx,xm=xm,wx=wx,h=h,b=b) } # b is starting number of bins # n is final number of bins # note: large b can cause problems, because of the greediness of merging merge.hist <- function(x,b=NULL,n=b,trace=T) { ss <- c() bn <- NULL repeat { bh <- bhist.stats(x,b) p <- bh$p; wx <- bh$wx b <- bh$b if(is.null(bn)) bn <- b # compute scores len <- length(p) p1 <- p[1:len-1] w1 <- wx[1:len-1] f1 <- p1/w1 p2 <- p[2:len] w2 <- wx[2:len] f2 <- p2/w2 fs <- (p1+p2)/(w1+w2) # s is difference in expected log-prob of future data #s <- p1*log(f1/fs) + p2*log(f2/fs) # needs a multiple-comparisons correction, as in CHAID # chi-square statistic s <- w1*(f1-fs)^2/fs + w2*(f2-fs)^2/fs #s <- p1*p2 #s <- (p1 - p2)^2 #s <- p1+p2 ss[len-1] <- min(s) j <- which.min(s) b <- not(b, j+1); # bn is the one they want if(length(b) == n+1) bn <- b if(length(b) <= 3) break } b <- bn if(trace) { split.screen(c(2,1)) plot(1:length(ss),ss^(0.25),xlab="bins",ylab="chi-sq") #plot(2:length(ss),-diff(ss^(0.25)),xlab="bins",ylab="chi-sq") screen(2) } bhist(x,b) title(paste(length(b)-1,"bins")) if(trace) close.screen(all = TRUE) b } bhist <- function(x,b=NULL,z=1.64,xlab="",main="") { bh <- bhist.stats(x,b) p <- bh$p; nx <- bh$nx; n <- bh$n; xm <- bh$xm; wx <- bh$wx; h <- bh$h if(FALSE) { # exact interval # z=1 means q=0.1586 f1 <- qbeta(q, nx, n-nx)/wx f2 <- qbeta(1-q, nx, n-nx)/wx } else { # approx interval s <- sqrt(p*(1-p)/n)*exp(-1/6/nx) f1 <- (p - z*s)/wx f2 <- (p + z*s)/wx } f <- p/wx h$counts <- nx h$intensities <- f h$density <- f plot.histogram(h,col="bisque",freq=FALSE,xlab=xlab,main=main) points(xm, f, col="blue", pch=18) arrows(xm, f1, xm, f2, code = 3, col = "green", angle = 75, length = .1) if(FALSE) { p <- exp(digamma(nx)-digamma(n)) s <- sqrt(1/nx + 1/n) s <- sqrt(trigamma(nx)+trigamma(n)) f1 <- exp(log(p) - z*s)/wx f2 <- exp(log(p) + z*s)/wx } if(FALSE) { # compare intervals s <- sqrt(p*(1-p)/n)*exp(-1/6/nx) f1 <- (p - s)/wx f2 <- (p + s)/wx arrows(xm+0.1, f1, xm+0.1, f2, code = 3, col = "red", angle = 75, length = .1) } } # routines for quantization and one-dimensional clustering # Tom Minka 1/3/02 # reduce data to n points, by local averaging # use quantiles so size of each bin is equal squash <- function(x,n=1000) { b <- floor(length(x)/n) rb <- rep(b,n) # spread the overflow broadly over the range overflow <- length(x) - n*b rb <- rb + c(0,diff(trunc(overflow/n*(1:n)))) # f is 1 1 1 2 2 2 3 3 3 ... f <- rep(1:n, rb) f <- f[1:length(x)] x <- sort(x) tapply(x,f,mean) } plot.breaks <- function(b) { r <- par("usr")[3:4] segments(b,rep(r[1],length(b)), b,rep(r[2],length(b)), col="blue", lwd=2) } hist.hclust <- function(hc,x,k=2:5) { hb <- break.equal(x,40) h <- hist(x,hb,freq=FALSE,col="bisque",main="",xlab="") top <- par("usr")[4] # for cutree library(mva) for(i in 1:length(k)) { q <- cutree(hc,k[i]) b <- break.cut(x,q) scale <- (length(k)-i+1)/length(k) segments(b,rep(0,length(b)), b,rep(top*scale,length(b)), col="blue",lwd=3) } } # evenly-spaced breaks break.equal <- function(x,n=2) { r <- range(x,na.rm=T) seq(0,1,by=1/n)*diff(r) + r[1] } # this is "equal-frequency" binning break.quantile <- function(x,n=2,plot=F,pretty=F) { probs = seq(0,1,len=n+1) b <- as.numeric(quantile(unique(x), probs, na.rm=T)) if(pretty) { for(i in 1:length(b)) { for(d in 1:4) { b.new = signif(b[i],d) p.new = mean(x <= b.new, na.rm=T) if(probs[i] == 0 || probs[i] == 1) { ok = (p.new == probs[i]) } else { ok = (abs(p.new - probs[i]) < 0.1/n) } if(ok) { b[i] = b.new break } } } } if(plot) { hb <- break.equal(x,40) h <- hist(x,hb,freq=FALSE,col="bisque",main="",xlab="") plot.breaks(b) } b } # convert a vector of cluster centers into a break vector break.centers <- function(x,m) { m <- sort(m) n <- length(m) b <- (m[1:n-1]+m[2:n])/2 r <- range(x) c(r[1]-eps,b,r[2]) } break.kmeans <- function(x,n=2,plot=T) { library(mva) km <- kmeans(x,n) m <- as.numeric(km$centers) ss <- sum(km$withinss) if(plot) cat("sum of squares =", format(ss), "\n") b <- break.centers(x,m) if(plot) { hb <- break.equal(x,40) h <- hist(x,hb,freq=FALSE,col="bisque",main="",xlab="") plot.breaks(b) } b } # divide the range of x into n pieces, based on the largest spaces # returns a vector of breaks, suitable for input to "hist" or "cut" # tends to overdivide low-density regions break.diff <- function(x,n=2) { n <- n-1 len <- length(x) x <- sort(x) diff <- x[2:len] - x[1:len-1] i <- order(-diff) mid <- (x[2:len] + x[1:len-1])/2 b <- sort(mid[i[1:n]]) split.screen(c(2,1)) split.screen(c(1,2),screen=1) screen(4) par(mar=c(2,4,1,1)) last <- n+10 plot(diff[i[1:last]],ylab="diff") m <- (diff[i[n]] + diff[i[n+1]])/2 #arrows(1,m, last,m, col="black",code=0,lty=2) segments(n+0.5, diff[i[last]], n+0.5, diff[i[1]], col="black",lty=2) screen(2) par(mar=c(2,4,1,1)) plot(x) segments(0,b, len+1,b, col="blue",lty=2) screen(3) par(mar=c(2,4,1,1)) hist(x,30,col="bisque",main="") segments(b,rep(0,n), b,rep(100,n), col="blue") close.screen(all = TRUE) c(x[1]-eps,b,x[len]) } # convert a cut of x into breaks # f is a factor which divides x into disjoint groups # this is only possible for certain kinds of cuts break.cut <- function(x,f) { low <- sort(as.numeric(tapply(x,f,min))) high <- sort(as.numeric(tapply(x,f,max))) n <- length(high) b <- (low[2:n]+high[1:n-1])/2 c(low[1]-eps,b,high[n]) } scatter <- function(x) { if(is.null(dim(x))) sum((x - mean(x))^2) else sum((x - rep.row(mean(x),nrow(x)))^2) } sum.of.squares <- function(x,q) { n <- as.vector(table(q)) nx <- as.data.frame(x) nx <- nx[sapply(nx,is.numeric)] sum(sapply(split(nx,q),scatter)) } # Create a hierarchy by Ward's method ward <- function(x,n=rep(1,length(x)),s=rep(1,length(x)), sortx=TRUE,same.var=T) { if(is.list(x)) { n <- sapply(x,length) s <- sapply(x,scatter) #if(!same.var) s <- s + 0.1 x <- sapply(x,mean) } # don't sort and it becomes time-series clustering if(sortx) { ord <- order(x) x <- x[ord] n <- n[ord] s <- s[ord] } else { ord <- 1:length(x) } height <- c() label <- -(1:length(x)) label <- label[ord] merge <- matrix(0,length(x)-1,2) if(length(x) == 1) ks <- c() else ks <- 1:(length(x)-1) for(k in ks) { len <- length(x) # here we exploit the fact that the data is one-dimensional. # only clusters which are adjacent in sorted order need be considered # for merging. x1 <- x[1:len-1] x2 <- x[2:len] n1 <- n[1:len-1] n2 <- n[2:len] # merging cost cost <- n1*n2/(n1+n2)*(x1-x2)^2 if(!same.var) { s1 <- s[1:len-1] s2 <- s[2:len] cost <- (n1+n2)*log((s1+s2+cost)/(n1+n2)) - n1*log(s1/n1) - n2*log(s2/n2) } j <- which.min(cost) if(!same.var) { s[j] <- s[j] + s[j+1] + n[j]*n[j+1]/(n[j]+n[j+1])*(x[j]-x[j+1])^2 s <- not(s,j+1) } x[j] <- (x[j]*n[j] + x[j+1]*n[j+1])/(n[j] + n[j+1]) n[j] <- n[j] + n[j+1] x <- not(x,j+1) n <- not(n,j+1) height[k] <- min(cost) merge[k,1] <- label[j] merge[k,2] <- label[j+1] label[j] <- k label <- not(label,j+1) } hc <- list(merge=merge,height=cumsum(height),order=ord,labels=NULL, method="ward") class(hc) <- "hclust" hc } plot.hclust.trace <- function(h,k=1:10) { g <- c(rev(h$height),0) k <- k[k < length(g)] #g <- sqrt(g) if(h$method != "single") { g <- -diff(g) } plot(k,g[k],ylab="merging cost",xlab="clusters") } break.ward <- function(x,n=2,plot=T) { h <- ward(x) library(mva) q <- cutree(h,n) ss <- sum(tapply(x,q,scatter)) if(plot) cat("sum of squares =", format(ss), "\n") b <- break.cut(x,q) if(plot) { split.screen(c(2,1)) plot.hclust.trace(h) screen(2) hist.hclust(h,x,k=2:n) close.screen(all=TRUE) } b } # "ward" is best # "ave","centroid" are good # "single", "complete","median","mcq" are bad break.hclust <- function(x,n=2,method="ward",plot=T) { library(mva) h <- hclust(dist(x)^2,method) q <- cutree(h,n) ss <- sum(tapply(x,q,scatter)) cat("sum of squares =", format(ss), "\n") b <- break.cut(x,q) if(plot) { split.screen(c(2,1)) plot.hclust.trace(h) screen(2) hist.hclust(h,x,k=2:n) close.screen(all=TRUE) } b } name.clusters <- function(f,y) { # given cluster memberships f and a factor y, returns the majority level # in each cluster levels(y)[apply(table(f,y),1,which.max)] } ############################################################################# # time series changepoints make.ar <- function(n,a,e,start) { k = length(a) if(missing(start)) { x = e*rnorm(k)/rnorm(k) } else { x = start } for(i in (k+1):n) { x[i] = sum(a*x[(i-1):(i-k)]) + e*rnorm(1)/rnorm(1) } x } lag = function(x,k=1) { c(rep(NA,k),x[1:(length(x)-k)]) } # convert a cut of x into a changepoint plot plot.segments.ts <- function(x,b,lwd=2,...) { plot(x,type="l",xlab="time",ylab="level") f <- cut(time(x),b,include.lowest=T) m <- tapply(as.numeric(x),f,mean) low <- tapply(time(x),f,min) high <- tapply(time(x),f,max) segments(low, m, high, m, col="blue",lwd=lwd,...) } # Apply Ward's method to find changepoints in a time-series break.ts <- function(x,n=2,trace=T,same.var=T,...) { h <- ward(as.numeric(x),sortx=F,same.var=same.var) library(mva) q <- cutree(h,n) b <- break.cut(time(x),q) #b <- c(0,which(diff(q)>0),length(x)+1) ss <- sum(tapply(x,q,scatter)) cat("sum of squares =", format(ss), "\n") if(length(h$height) < 2) trace <- F if(trace) { split.screen(c(2,1)) plot.hclust.trace(h) screen(2) } plot.segments.ts(x,b,...) #points(x) if(trace) close.screen(all=TRUE) b } ############################################################################# boxplot.hclust <- function(hc,x,k=2:5,col="bisque",...) { x <- x[hc$order] boxplot(x,col=col,...) library(mva) r <- range(x) opar <- par(lwd=3) on.exit(par(opar)) for(i in 1:length(k)) { if(k[i] > length(hc$height)) q <- 1:(length(hc$height)+1) else q <- cutree(hc,k[i])[hc$order] b <- which(diff(q)!=0)+0.5 b <- c(0,b,length(x)+1) scale <- (length(k)-i+1)/length(k) segments(b,rep(r[1],length(b)), b,rep(r[2]*scale,length(b)), col="blue") } } # breaks a factor into nbins bins in order to preserve the prediction of x merge.factor <- function(f,x,n,same.var=T,trace=T,xlab=NA,ylab=NA) { library(mva) if(is.na(xlab)) xlab <- deparse(substitute(f)) if(is.na(ylab)) ylab <- deparse(substitute(x)) ord <- is.ordered(f) if(!ord) f <- sort.levels(f,x,fun=mean) # drop unused categories f <- factor(f) h <- ward(split(x,f),sortx=!ord,same.var=same.var) if(n > length(h$height)) q <- 1:(length(h$height)+1) else q <- cutree(h,n) nam <- tapply(levels(f),q,function(x) merge.names(x,ordered=ord)) of <- f levels(f) <- nam[q] if(same.var) { ss <- sum(tapply(x,f,scatter)) cat("sum of squares =", format(ss), "\n") } else { n <- tapply(x,f,length) ss <- sum(n*log(tapply(x,f,scatter)/n)) cat("sum of log-variance =", format(ss), "\n") } if(trace) { split.screen(c(2,1)) on.exit(close.screen(all=TRUE)) plot.hclust.trace(h) screen(2) } boxplot.hclust(h,split(x,of),xlab=xlab,ylab=ylab) f } # Routines for manipulating colors and making color plots # Tom Minka # intended for white background gray.colors <- function(n) gray(rev(0:(n-1))/n) default.colors <- function(n) ((0:(n-1))%%6+1) # good for objects which are being outlined (e.g. pies) default.colors.w <- function(n) c(8,seq(len=n-1)) # these are from # http://www.personal.psu.edu/faculty/c/a/cab38/ColorBrewerBeta2.html OrRd.colors <- function(n) { if(n == 7) c("#fef0d9","#fdd49e","#fdbb84","#fc8d59","#ef6548","#d7301f","#99000d") else if(n == 6) c("#fef0d9","#fdd49e","#fdbb84","#fc8d59","#e34a33","#b2000d") else if(n == 5) c("#fef0d9","#fdcc8a","#fc8d59","#e34a33","#b2000d") else if(n == 4) c("#fef0d9","#fdcc8a","#fc8d59","#d7301f") else if(n == 3) c("#fee8c8","#fdbb84","#e34a33") else if(n == 2) c("#fdbb84","#e34a33") else stop("need a different number of levels") } YlGnBu.colors <- function(n) { if(n == 8) c("#ffffd9","#edf8b0","#c7e9b4","#7fcdbb","#41b6c3","#1d91c0","#225ea8","#0c2c83") else if(n == 7) c("#ffffcc","#c7e9b4","#7fcdbb","#41b6c3","#1d91c0","#386cb0","#0c2c83") else if(n == 6) c("#ffffcc","#c7e9b4","#7fcdbb","#41b6c3","#2c7fb8","#253494") else if(n == 5) c("#ffffcc","#a1dab4","#41b6c3","#2c7fb8","#253494") else if(n == 4) c("#ffffcc","#a1dab4","#41b6c3","#386cb0") else if(n == 3) c("#edf8b0","#7fcdbb","#2c7fb8") else if(n == 2) c("yellow","blue") else stop("need a different number of levels") } # diverging BrBg.colors <- function(n) { if(n == 7) c("#8c510a","#d8b365","#f6e8c3","#f5f5f5","#c7eae5","#5ab4ac","#1665e") else if(n == 6) c("#8c510a","#d8b365","#f6e8c3","#c7eae5","#5ab4ac","#1665e") else if(n == 5) c("#a6611a","#dfc27d","#f5f5f5","#80cdc1","#18571") else if(n == 4) c("#a6611a","#dfc27d","#80cdc1","#18571") else if(n == 3) c("#d8b365","#f5f5f5","#5ab4ac") else if(n == 2) c("#d8b365","#5ab4ac") else stop("need a different number of levels") } RYB.colors <- function(n) { if(n == 7) c("#d73d29","#fc8d59","#fee090","#ffffbf","#e0f3f8","#91bfdb","#4575b4") else if(n == 6) c("#d73d29","#fc8d59","#fee090","#e0f3f8","#91bfdb","#4575b4") else if(n == 5) c("#ca373b","#fdae61","#ffffbf","#abd9e9","#2c7bb6") else if(n == 4) c("#ca373b","#fdae61","#abd9e9","#2c7bb6") else if(n == 3) c("#fc8d59","#ffffbf","#91bfdb") else if(n == 2) c("red","blue") else stop("need a different number of levels") } # sequential WYRK YR.colors <- function(k,lt=0.97,dk=0.03) { r <- c(lt, rep(lt,k), rep(lt,k), seq(lt,dk,len=k+1)[-1]) g <- c(lt, rep(lt,k), seq(lt,dk,len=k+1)[-1], rep(dk,k)) b <- c(lt, seq(lt,dk,len=k+1)[-1], rep(dk,k), rep(dk,k)) i <- seq(1,length(r),len=k) rgb(r[i],g[i],b[i]) } # diverging diverging.colors <- function(n,hmax=0,hmin=(0.5+hmax)%%1,v=1) { if(n %% 2 == 1) { # odd case k <- n%/%2 x <- seq(1,0,len=k+1) c(hsv(h = hmin, s = x, v), hsv(h = hmax, s = rev(x[1:k]), v)) } else { # even case k <- n%/%2 x <- seq(1,-1,len=n)[1:k] c(hsv(h = hmin, s = x, v), hsv(h = hmax, s = rev(x), v)) } } RC.colors <- diverging.colors GM.colors <- function(n) diverging.colors(n,2/6) # returns a new vector of darker colors darker <- function(col,scale=0.8) { crgb <- col2rgb(col)/255*scale if(T) { # handle black specially i <- which(apply(crgb,2,sum)==0) if(length(i)>0) { crgb[,i] <- rep.col(c(1,1,1)/10,length(i)) } } rgb(crgb[1,],crgb[2,],crgb[3,]) } # note: pch 21 looks like pch 1, but is actually a filled circle #plot(y,pch=21,bg=hsv(6/12,1,1)) color.key <- function(col,pch,names,breaks,digits=2,cex=0.75) { if(is.null(col)) col <- rep(1,length(pch)) rx <- range(par("usr")[1:2]) ry <- range(par("usr")[3:4]) if(T && !missing(pch) && !is.null(pch)) { # key is par("csi")*cex inches ry[1] <- ry[2] - diff(ry)/par("pin")[2]*par("csi")*cex } else { # key is 0.06 inches high ry[1] <- ry[2] - diff(ry)/par("pin")[2]*0.06 } if(!missing(names)) n <- length(names) else if(!missing(breaks)) n <- length(breaks)-1 else n <- length(col) i <- seq(0,1,len=n+1) x1 <- rx[1]+i[-length(i)]*diff(rx) x2 <- rx[1]+i[-1]*diff(rx) if(n == 0) stop("n == 0") col <- rep(col,ceiling(n/length(col)))[1:n] rect(x1, ry[1], x2, ry[2], col=col, border=NA) if(!missing(pch) && !is.null(pch)) points((x1+x2)/2, rep(mean(ry),n), col=8, pch=pch, cex=0.75) if(!missing(names)) mtext(names,at=(x1+x2)/2,cex=cex) if(!missing(breaks)) { x <- rx[1] + i*diff(rx) names <- prettyNum(formatC(breaks,format="fg",digits=digits),big.mark=",") #names = format(breaks,digits=digits,trim=T) mtext(names[1],at=x[1],cex=cex,adj=0) mtext(names[n+1],at=x[n+1],cex=cex,adj=1) mtext(names[2:n],at=x[2:n],cex=cex,adj=0.5) } } # extends r=c(min,max) by 4%, to mimic plotting limits extend.pct <- function(r,pct=0.04) { m <- diff(r)*pct if(m == 0) m <- 0.432*abs(r[1]) if(m == 0) m <- 1.08 c(r[1]-m,r[2]+m) } extend.inches <- function(r,upper=0,lower=0) { # extends the given range by a specified number of inches of each side. # upper and lower are actually inches/plot.size # constraints: # (r.new[2] - r[2])/diff(r.new) = upper # (r[1] - r.new[1])/diff(r.new) = lower r.new <- r a = lower/(1-upper) r.new[1] <- (r[1] + a*r[2])/(1-a) r.new[2] <- (r[2] - upper*r.new[1])/(1-upper) r.new } # returns the minimum user limits for a plotting region of size pin # which would enclose the given labels at the given positions # with the justification adj at rotation srt. range.text <- function(x,y,labels,cex=NULL,adj=NULL,srt=0, pin=par("pin")) { lab.w <- strwidth(labels,"inches",cex=cex) lab.h <- strheight(labels,"inches",cex=cex) # bounding box of text, columns are (x1,y1)(x2,y1)(x2,y2)(x1,y2) n <- length(labels) ix <- seq(1,8,by=2) iy <- seq(2,8,by=2) hull <- array(0,c(n,8)) hull[,c(1,7,2,4)] <- rep(0,n) hull[,c(3,5,6,8)] <- rep(1,n) # put adj point at origin and scale if(is.null(adj)) adj <- c(0.5,0.5) if(length(adj) == 1) adj <- c(adj,0.5) hull[,ix] <- (hull[,ix] - adj[1])*rep.col(lab.w,4) hull[,iy] <- (hull[,iy] - adj[2])*rep.col(lab.h,4) # rotate srt <- srt/180*pi cr <- cos(srt) sr <- sin(srt) R <- array(c(cr,-sr,sr,cr),c(2,2)) R <- diag(4) %x% R hull <- hull %*% R bbox <- list() bbox$left <- apply(hull[,ix],1,min)/pin[1] bbox$right <- apply(hull[,ix],1,max)/pin[1] bbox$bottom <- apply(hull[,iy],1,min)/pin[2] bbox$top <- apply(hull[,iy],1,max)/pin[2] # solve constraints xmid <- mean(range(x)) ymid <- mean(range(y)) xlim <- c() # these come from the constraints # (x-xmin)/(xmax-xmin) + left > 0 # (x-xmin)/(xmax-xmin) + right < 1 # where xmax is fixed at 2*xmid - xmin min1 <- min((x + 2*bbox$left*xmid)/(1+2*bbox$left)) min2 <- min((2*(1-bbox$right)*xmid - x)/(1-2*bbox$right)) xlim[1] <- min(min1,min2) xlim[2] <- 2*xmid - xlim[1] ylim <- c() min1 <- min((y + 2*bbox$bottom*ymid)/(1+2*bbox$bottom)) min2 <- min((2*(1-bbox$top)*ymid - y)/(1-2*bbox$top)) ylim[1] <- min(min1,min2) ylim[2] <- 2*ymid - ylim[1] c(xlim,ylim) } range.aspect <- function(asp,lim=par("usr"),pin=par("pin")) { # returns c(xlim,ylim), strictly bigger than lim, which satisfy asp # simulates the effect of plot.window(asp) xlim <- lim[1:2] ylim <- lim[3:4] dx <- diff(xlim)/pin[1] dy <- diff(ylim)/pin[2] if(dy > asp*dx) { dx <- dy/asp*pin[1] xlim <- mean(xlim) + c(-dx/2,dx/2) } else { dy <- asp*dx*pin[2] ylim <- mean(ylim) + c(-dy/2,dy/2) } c(xlim,ylim) } default.pch <- c(1,41,4,20,35,38,2,17) sequential.pch <- c(20,41,4,1,38,35,11,19) sequential.pch.paper <- c(45,41,4,1,38,35,11,19) color.plot <- function(object, ...) UseMethod("color.plot") color.plot.formula <- function(formula,data=parent.frame(),...) { x <- model.frame.default(formula,data,na.action=na.pass) color.plot.data.frame(x,...) } color.plot.data.frame <- function(x,z,zlab,xlab=NULL,ylab=NULL,labels=F,...) { if(missing(z)) { pred <- predictor.vars(x) resp <- response.var(x) z <- x[[resp]] if(missing(zlab)) zlab <- resp } else { pred <- colnames(x) if(missing(zlab)) zlab <- deparse(substitute(z)) } if(length(pred) < 2) stop("must have two predictors to plot") if(is.logical(labels)) { labels <- if(labels) rownames(x) else NULL } if(is.null(xlab)) xlab <- pred[1] if(is.null(ylab)) ylab <- pred[2] color.plot.default(x[[pred[1]]],x[[pred[2]]],z,labels=labels, xlab=xlab,ylab=ylab,zlab=zlab,...) } color.plot.default <- function(x,y,z,labels=NULL,data=parent.frame(), xlab,ylab,zlab, xlim=NULL,ylim=NULL, axes=T,key=T,add=F,nlevels=4,pretty=T, color.palette=NULL,col, pch.palette=NULL,pch, jitter=F,digits=2,mar,bg=NULL, cex=par("cex"),yaxt="s",...) { # continous responses are automatically quantized into nlevels levels, with # colors given by col. # jitter is only a suggestion. jittering is only done when necessary. if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) if(missing(zlab)) zlab <- deparse(substitute(z)) x <- eval(substitute(x),data) y <- eval(substitute(y),data) z <- eval(substitute(z),data) # treat character as factor if(mode(x) == "character") x <- factor(x,levels=unique(x)) if(mode(y) == "character") y <- factor(y,levels=unique(x)) if(is.numeric(z)) { b <- break.quantile(z,nlevels,pretty=pretty) f <- rcut(z,b) if(!any(is.na(z)) && any(is.na(f))) stop("internal error: cut introduced NAs") if(is.null(color.palette)) { color.palette = YlGnBu.colors #color.palette = YR.colors if(is.null(bg)) bg = grey(0.5) } if(is.null(pch.palette)) { pch.palette = if(mode(color.palette) == "function") 16 else sequential.pch } } else { f <- factor(z) if(is.null(color.palette)) color.palette = default.colors if(is.null(pch.palette)) { pch.palette = if(mode(color.palette) == "function") 16 else default.pch } } nlevels <- length(levels(f)) if(nlevels == 0) stop("0 levels in factor") # lev.col is the color of each level # lev.pch is the symbol of each level # if there are too few colors, recycle them and change pch lev <- 1:nlevels # col is the color of the individual points q <- as.numeric(f) if(missing(col)) { if(mode(color.palette) == "function") color.palette <- color.palette(nlevels) lev.col <- color.palette[((lev-1) %% length(color.palette))+1] col <- lev.col[q] } else { lev.col <- NULL } if(missing(pch)) { lev.pch <- pch.palette[(((lev-1) %/% length(color.palette)) %% length(pch.palette))+1] pch <- lev.pch[q] } else { lev.pch <- NULL } # setup plot window if(!add) { if(missing(mar)) { # mar is c(bottom, left, top, right) if(axes) { mar = auto.mar() if(yaxt=="n") mar[2] = 0 mar[3] = if(key) 2 else 1 } else { mar <- c(0.05,0,if(key) 1 else 0,0.05) } } # change mar permanently so we can add things on top par(mar=mar) if(!is.null(bg) && bg != par("bg")) { opar <- par(bg=bg) on.exit(par(opar)) } } # xlim,ylim needed for jittering and label placement if(add) { xlim <- par("usr")[1:2] ylim <- par("usr")[3:4] } else { if(is.null(labels)) { if(is.null(xlim)) xlim <- range(as.numeric(x),na.rm=T) if(is.null(ylim)) ylim <- range(as.numeric(y),na.rm=T) if(key) { # make room at the top of the plot for the key if(length(unique(lev.pch)) == 1) { a = 0.06/par("pin")[2] } else { a = par("csi")*cex/par("pin")[2] } ylim = extend.inches(ylim,a) } } else { plot.new() lab.lim <- range.text(x,y,labels,cex) # make room for labels if(is.null(xlim)) xlim <- lab.lim[1:2] if(is.null(ylim)) ylim <- lab.lim[3:4] } } if(jitter) { # this test needs to be less strict. need to consider actual figure size. if(length(levels(factor(x))) < length(x)) { cat("jittering",xlab,"\n") jitter1 <- (runif(length(x))-0.5)*diff(extend.pct(xlim))/100 x <- x+jitter1 } if(length(levels(factor(y))) < length(y)) { cat("jittering",ylab,"\n") jitter2 <- (runif(length(y))-0.5)*diff(extend.pct(ylim))/100 y <- y+jitter2 } } if(add) { if(!is.null(labels)) { return(text(x, y, labels=labels, col=col, cex=cex, ...)) } else { return(points(x, y, pch=pch, col=col, cex=cex, ...)) } } # from here on, add=F if(!is.null(labels)) { plot.default(x, y, xlim=xlim, ylim=ylim, type="n", xlab=xlab,ylab=ylab,main="",axes=axes,yaxt=yaxt,...) text(x, y, labels=labels, col=col, cex=cex, ...) } else { plot.default(x, y, xlim=xlim, ylim=ylim, xlab=xlab,ylab=ylab,main="", pch=pch,col=col,axes=axes,yaxt=yaxt,cex=cex,...) } if(key) { # don't draw symbols in the key if there is only one type if(length(unique(lev.pch)) == 1) lev.pch <- NULL if(is.numeric(z)) color.key(lev.col,lev.pch,breaks=b,digits=digits) else color.key(lev.col,lev.pch,levels(f)) if(axes) title(zlab,line=1) } else if(axes) title(zlab) } # tests: # color.plot(runif(10),runif(10),c(rep("a",5),rep("b",5))) # color.plot(runif(10),runif(10),c(rep("a",5),rep("b",5)),rep("foo",10)) # this part same as color.plot text.plot <- function(object,...) UseMethod("text.plot") text.plot.formula <- function(formula,data=parent.frame(),...) { x <- model.frame.default(formula,data,na.action=na.pass) text.plot.data.frame(x,...) } text.plot.data.frame <- function(x,labels,xlab,ylab,...) { pred <- predictor.vars(x)[1] resp <- response.var(x) x1 <- x[[pred]] x2 <- x[[resp]] if(missing(labels)) labels <- rownames(x) if(missing(xlab)) xlab = pred if(missing(ylab)) ylab = resp text.plot.default(x1,x2,labels,xlab=xlab,ylab=ylab,...) } text.plot.default <- function(x,y,labels,xlab,ylab,xlim=NULL,ylim=NULL, main="",asp=NULL, cex=par("cex"),adj=NULL,srt=0,axes=T,...) { if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) plot.new() lim <- range.text(x,y,labels,cex,adj,srt) # make room for labels if(is.null(xlim)) xlim <- lim[1:2] if(is.null(ylim)) ylim <- lim[3:4] if(F) { plot(x,y,xlab=xlab,ylab=ylab,xlim=xlim,ylim=ylim,axes=axes, main=main,...,type="n") } else { plot.window(xlim,ylim,asp=asp) if(axes) { axis(1); axis(2); box() } title(main,xlab=xlab,ylab=ylab) } text(x,y,labels=labels,cex=cex,adj=adj,srt=srt,...) } # test: text.plot(runif(10),runif(10),rep("a",10)) # Plot a regression surface. # # clip specifies a polygon in (x,y) over which the surface is to be defined. # possible values are F (no clipping), T (clip to the convex hull of the # data), a list(x,y), or a matrix with two columns. # # if fill=T, z values outside zlim will not be plotted, # i.e. the corresponding regions will be white. # the default zlim is the range of the data. # color.plot.loess <- function(object,x,res=50,fill=F,add=fill,equal=F, drawlabels=F, levels=NULL,nlevels=if(is.null(levels)) 4 else length(levels), col, clip=T, zlim, bg=grey(0.5), lwd=2, color.palette=YlGnBu.colors,...) { if(missing(x)) x <- model.frame(object) else x <- model.frame(terms(object),x) pred <- predictor.vars(object) if(length(pred) == 1) return(plot.loess(object,x,add=add,lwd=lwd,...)) if(mode(color.palette) == "function") { color.palette <- color.palette(nlevels) } if(!add && !fill) color.plot.data.frame(x,levels=levels,nlevels=nlevels, color.palette=color.palette,bg=bg,...) # use the par options set by color.plot.data.frame # x is only used to get plotting range resp <- response.var(object) pred <- predictor.vars(object) # this is ugly, but otherwise recursively calling color.plot.data.frame is hard if(is.null(...$xlim)) { if(add || !fill) xlim <- range(par("usr")[1:2]) else xlim <- range(x[[pred[1]]]) } else xlim <- ...$xlim x1 <- seq(xlim[1],xlim[2],length=res) if(is.null(...$ylim)) { if(add || !fill) ylim <- range(par("usr")[3:4]) else ylim <- range(x[[pred[2]]]) } else ylim <- ...$ylim x2 <- seq(ylim[1],ylim[2],length=res) xt <- expand.grid(x1,x2) names(xt) <- pred[1:2] z <- predict(object,xt) if(clip != F && length(pred) >= 2) { if(clip == T) { i <- chull(x[pred]) clip <- x[pred][i,] } z[!in.polygon(clip,xt)] <- NA } if(missing(zlim)) { zlim <- range(x[[resp]]) z[z < zlim[1]] <- zlim[1] z[z > zlim[2]] <- zlim[2] } dim(z) <- c(length(x1),length(x2)) if(is.null(levels)) { if(equal) levels <- seq(zlim[1],zlim[2],len=nlevels+1) else { if(T) { r <- unique(x[[resp]]) #if(!missing(zlim)) { r <- r[r >= zlim[1] & r <= zlim[2]] r <- c(r,zlim) #} } levels <- break.quantile(r,nlevels,pretty=T) } } if(fill) { filled.contour(x1,x2,z,levels=levels, col=color.palette,xlab=pred[1],ylab=pred[2],main=resp,...) if(!add) color.plot.data.frame(x,col=1,add=T,...) } else { color.palette <- c(color.palette,color.palette[length(color.palette)]) contour(x1,x2,z,add=T,col=color.palette,levels=levels,drawlabels=drawlabels,lwd=lwd,...) } } color.plot.glm <- function(object,data,jitter=T,add=F,se=F,col=3,lwd=2,...) { if(missing(data)) data = model.frame(object) if(!add) color.plot.data.frame(data) # use the par options set by color.plot.data.frame w <- coef(object) z <- log(0.75/0.25) if(length(w) == 3) { abline(-w[1]/w[3],-w[2]/w[3],col=col,lwd=lwd,...) if(se) { abline((-z-w[1])/w[3],-w[2]/w[3],col=col,lty=2,lwd=lwd,...) abline((z-w[1])/w[3],-w[2]/w[3],col=col,lty=2,lwd=lwd,...) } } else { stop("use color.plot.knn") # two predictors with an interaction term # boundary is an ellipse xlim <- par("usr")[1:2] x <- seq(xlim[1],xlim[2],length=50) y <- (0-w[1]-w[2]*x)/(w[3]+w[4]*x) i <- (w[3]+w[4]*x < 0) lines(x[i],y[i],col=col,...) lines(x[!i],y[!i],col=col,...) if(se) { for(q in c(-z,z)) { y <- (q-w[1]-w[2]*x)/(w[3]+w[4]*x) lines(x[i],y[i],col=col,lty=2,...) lines(x[!i],y[!i],col=col,lty=2,...) } } } } color.plot.multinom <- color.plot.glm # Plot classification boundaries. # If pclass is given, the probability of being in that class is plotted. # # levels and col are only used if pclass != NULL # color.plot.knn <- function(object,x,pclass=NULL,fill=F,res=50, levels=c(0.5),add=F,axes=T,lwd=2, color.palette=default.colors,col=3,drawlabels=F,...) { resp <- response.var(object) pred <- predictor.vars(object) if(missing(x)) x <- model.frame(object) else if(length(pred) == 1) { # take an unused variable from x pred[2] = setdiff(colnames(x),c(resp,pred))[1] fmla = formula(paste(resp,"~",paste(pred,collapse="+"))) x = model.frame(fmla,x) } else x <- model.frame(terms(object),x) if(!add && !fill) color.plot.data.frame(x,color.palette=color.palette,...) # use the par options set by color.plot.data.frame # x is only used to get plotting range # this is ugly, but otherwise recursively calling color.plot.data.frame is hard if(is.null(...$xlim)) { if(add || !fill) xlim <- range(par("usr")[1:2]) else xlim <- range(x[[pred[1]]]) } else xlim <- ...$xlim x1 <- seq(xlim[1],xlim[2],length=res) # allow > 2 for derived features if(length(pred) >= 2) { if(is.null(...$ylim)) { if(add || !fill) ylim <- range(par("usr")[3:4]) else ylim <- range(x[[pred[2]]]) } else ylim <- ...$ylim x2 <- seq(ylim[1],ylim[2],length=res) xt <- expand.grid(x1,x2) names(xt) <- pred[1:2] } else { xt <- data.frame(x1) } prob.plot <- !is.null(pclass) if(prob.plot) { # z is the probability of a class if(inherits(object,"glm") || inherits(object,"gam") || inherits(object,"nnet")) { if(inherits(object,"glm") || inherits(object,"gam")) { z <- predict(object,xt,type="response") } else { z = predict(object,xt) } y <- model.frame(object)[[resp]] if(is.character(pclass)) pclass <- pmatch(pclass, levels(y)) if(pclass == 1) { z <- 1-z lev <- levels(y)[1] main <- paste("Pr(",resp," = ",lev,")",sep="") } else { if(pclass > length(levels(y))) stop("pclass exceeds the number of classes") if(length(levels(y)) == 2) { lev <- levels(y)[2] main <- paste("Pr(",resp," = ",lev,")",sep="") } else { lev <- levels(y)[1] main <- paste("Pr(",resp," != ",lev,")",sep="") } } } else { z <- predict(object,xt,type="vector") if(is.character(pclass)) pclass <- pmatch(pclass, colnames(z)) lev <- colnames(z)[pclass] main <- paste("Pr(",resp," = ",lev,")",sep="") z <- z[,pclass] } } else { # z is the class if(inherits(object,"glm") || inherits(object,"gam")) { p <- predict(object,xt,type="response") y <- model.frame(object)[[resp]] z <- factor(levels(y)[as.numeric(p > 0.5)+1], levels=levels(y)) } else { z <- factor(predict(object,xt,type="class")) } } dim(z) <- c(length(x1),length(x2)) if(fill) { if(prob.plot) { filled.contour(x1,x2,z, plot.axes={ title(main=main,xlab=pred[1],ylab=pred[2]);axis(1);axis(2); if(!add) color.plot.data.frame(x,col=1,add=T,...) },...) } else { lev <- levels(z) if(is.function(color.palette)) col <- color.palette(length(lev)) else col <- color.palette actual.lev <- levels(factor(z)) icol <- col[lev %in% actual.lev] z <- as.numeric(z) dim(z) <- c(length(x1),length(x2)) colorbar <- (length(col) > 1) if(!add) { # change mar permanently so we can add things on top if(axes) { mar = auto.mar() mar[3] = if(colorbar) 2 else 1 par(mar=mar) } else { par(mar=c(0.05,0,0,0.1)) } image(x1,x2,z,col=icol,main="",xlab=pred[1],ylab=pred[2]) # assume a light color scheme (else should be lighter) dcol <- darker(col) color.plot.data.frame(x,add=T,col=dcol,...) } else image(x1,x2,z,col=icol,add=T) if(colorbar) { color.key(col,lev) if(!add) title(resp,line=1) } else if(!add) title(resp,line=0) } } else { # not filled if(prob.plot) { cat("Plotting contour for",main,"=",levels,"\n") contour(x1,x2,z,add=T, levels=levels,col=col,lwd=lwd, drawlabels=drawlabels,...) } else { nlevels <- length(levels(z)) z <- as.numeric(z) dim(z) <- c(length(x1),length(x2)) levels <- 0.5+(1:(nlevels-1)) contour(x1,x2,z,add=T,levels=levels,col=col,lwd=lwd, drawlabels=drawlabels,...) } } } ############################################################################## # bug fix # minka: added `key' and `main' arguments filled.contour <- function (x = seq(0, 1, len = nrow(z)), y = seq(0, 1, len = ncol(z)), z, xlim = range(x, finite=TRUE), ylim = range(y, finite=TRUE), zlim = range(z, finite=TRUE), levels = pretty(zlim, nlevels), nlevels = 20, color.palette = cm.colors, col = color.palette(length(levels) - 1), plot.title, plot.axes, key.title, key.axes, key=T, asp = NA, xaxs="i", yaxs="i", las = 1, axes = TRUE, main="", ...) { if (missing(z)) { if (!missing(x)) { if (is.list(x)) { z <- x$z y <- x$y x <- x$x } else { z <- x x <- seq(0, 1, len = nrow(z)) } } else stop("no `z' matrix specified") } else if (is.list(x)) { y <- x$y x <- x$x } if (any(diff(x) <= 0) || any(diff(y) <= 0)) stop("increasing x and y values expected") if(key) { mar.orig <- (par.orig <- par(c("mar","las","mfrow")))$mar on.exit(par(par.orig)) par(las = las) w <- (3 + mar.orig[2]) * par('csi') * 2.54 layout(matrix(c(2, 1), nc=2), widths=c(1, lcm(w))) ## Plot the `plot key' (scale): mar <- mar.orig mar[4] <- mar[2] mar[2] <- 1 par(mar = mar) plot.new() plot.window(xlim=c(0,1), ylim=range(levels), xaxs="i", yaxs="i") rect(0, levels[-length(levels)], 1, levels[-1], col = col) if (missing(key.axes)) { if (axes) axis(4) } else key.axes box() if (!missing(key.title)) key.title } ## Plot contour-image:: if(key) { mar <- mar.orig mar[4] <- 1 par(mar=mar) } plot.new() plot.window(xlim, ylim, "", xaxs=xaxs, yaxs=yaxs, asp=asp) if (!is.matrix(z) || nrow(z) <= 1 || ncol(z) <= 1) stop("no proper `z' matrix specified") if (!is.double(z)) storage.mode(z) <- "double" .Internal(filledcontour(as.double(x), as.double(y), z, as.double(levels), col = col)) if (axes) { if (missing(plot.axes)) { # minka: no title here axis(1) axis(2) } else plot.axes if (missing(plot.title)) title(main=main,...) else plot.title } box() invisible() } ############################################################################## rgb2name <- function(r,g,b) { if(missing(g)) { r <- as.matrix(r) if(ncol(r) != 3) { if(nrow(r) == 3) r <- t(r) else stop("matrix must have 3 rows or 3 columns") } } else { r <- array(c(r,g,b),c(length(r),3)) } if(max(r) > 1) r <- r/255 cc <- colors() # remove duplicates #cc <- setdiff(cc, c("gray100","grey100")) crgb <- col2rgb(cc)/255 # distance matrix d <- sqdist(r,t(crgb)) cc[apply(d, 1, which.min)] } col2hsv <- function(col) { rgb2hsv(col2rgb(col)) } rgb2hsv <- function(r,g,b) { if(missing(g)) { if(nrow(r) != 3) { if(ncol(r) == 3) r <- t(r) else stop("matrix must have 3 rows or 3 columns") } b <- r[3,] g <- r[2,] r <- r[1,] } n <- length(r) lim <- array(0,c(2,n)) lim[1,] <- pmin(r,g,b) lim[2,] <- pmax(r,g,b) rang <- lim[2,]-lim[1,] v <- lim[2,] s <- v i <- (lim[2,] > 0) s[!i] <- 0 s[i] <- rang[i]/lim[2,i] h <- v h[s == 0] <- 0 i <- (s > 0) & (r == lim[2,]) h[i] <- (g[i] - b[i])/rang[i]/6 i <- (s > 0) & (g == lim[2,]) h[i] <- (2 + (b[i]-r[i])/rang[i])/6 i <- (s > 0) & (b == lim[2,]) h[i] <- (4 + (r[i]-g[i])/rang[i])/6 i <- (h < 0) h[i] <- h[i] + 1 t(array(c(h,s,v),c(n,3),list(NULL,c("hue","saturation","value")))) } test.col2hsv <- function() { h <- col2hsv(palette()) print(palette()) print(sapply(1:ncol(h), function(i) hsv(h[1,i],h[2,i],h[3,i]))) } hsv2cone <- function(h,s,v) { if(missing(s)) { if(nrow(h) != 3) { if(ncol(h) == 3) h <- t(h) else stop("matrix must have 3 rows or 3 columns") } v <- h[3,] s <- h[2,] h <- h[1,] } x <- s*v*cos(h*2*pi) y <- s*v*sin(h*2*pi) z <- v n <- length(x) t(array(c(x,y,z),c(n,3),list(NULL,c("x","y","z")))) } cone2hsv <- function(x,y,z) { if(missing(y)) return(cone2hsv(x[1,],x[2,],x[3,])) v <- z h <- (atan2(y,x)/2/pi + 1) %% 1 s <- sqrt(x^2 + y^2)/v i <- which(s > 1 | is.na(s)) if(length(i) > 0) s[i] <- 0 r <- rbind(h,s,v) rownames(r) <- c("hue","saturation","value") r } # this is only for routines used by minka library rank.stable <- function(x) { # handles ties better than "rank" n <- length(x) r <- rep(0,n) r[order(x)] <- 1:n r } shuffle <- function(x) { if(length(dim(x)) == 2) { x[order(runif(nrow(x))),order(runif(ncol(x)))] } else { x[order(runif(length(x)))] } } named.list <- function(...,names.) { lst <- list(...) names(lst) = names. lst } named.numeric <- function(length,names.) { x <- numeric(length) names(x) = names. x } as.named.vector <- function(a) { if(is.matrix(a) || is.data.frame(a)) { x = a[,1] names(x) = rownames(a) x } else if(is.list(a)) { unlist(a) } else stop("don't know what to do") } # companions to xinch,yinch inchx = function(x,warn.log=T) { if (warn.log && par("xlog")) warning("x log scale: inchx() is non-sense") x / diff(par("usr")[1:2]) * par("pin")[1] } inchy = function (y, warn.log = TRUE) { if (warn.log && par("ylog")) warning("y log scale: inchy() is non-sense") y / diff(par("usr")[3:4]) * par("pin")[2] } inch.symbol <- function(pch=par("pch"),cex=par("cex")) { # returns the height in inches of the given plotting symbol # with current graphics settings. # this is for pch="H" inch = par("csi")*cex/1.5 if(pch <= 20) inch = inch/2 inch } format.bytes = function(x) { # returns byte sizes in pretty form # example: format.bytes(c(2,1030,2e6)) # based on code by Fang Chen round.to.half = function(x) round(x*2)/2 s = as.character(x) names(s) = names(x) kilo = 2^10; mega = 2^20 i = (x >= mega) s[i] = paste(round.to.half(x[i]/mega),"M",sep="") i = (x >= kilo) & (x < mega) s[i] = paste(round(x[i]/kilo),"k",sep="") s } ls.sizes <- function(name=parent.frame(), pretty = T, top = 10, ...) { # lists the `top' biggest objects in the given environment # suggested by Fang Chen x <- sapply(ls(name,...),function(x) object.size(get(x))) if(top > length(x)) top = length(x); x = rev(rev(sort(x))[1:top]) if(pretty) x = format.bytes(x) x <- as.data.frame(x) colnames(x) <- "bytes" x } # smallest number such that 1+eps > 1 eps <- 1e-15 # returns the elements of x not indexed by i not <- function(x,i) { if(is.numeric(i)) return(x[-i]) if(is.character(i)) return(x[setdiff(names(x),i)]) if(is.logical(i)) return(x[!i]) } sort.levels <- function(f,x,fun=median) { if(length(x) == length(f)) x <- tapply(x,f,fun) if(length(x) != length(levels(f))) stop("wrong number of values to sort by") factor(f,levels=levels(f)[order(x)]) } # n is a vector of names # i is a vector of indices # merges names n[i] into a composite name # returns a shorter vector of names merge.names <- function(n,i=1:length(n)) { if(is.dim.ordered(n)) { first <- n[i[1]] last <- n[i[length(i)]] # what kind of range notation should we use? if(F) { split <- "\\.\\."; sep <- ".." } else { split <- "-"; sep <- "-" } s1 <- unlist(strsplit(first,split)) s2 <- unlist(strsplit(last,split)) # work around a bug in strsplit if(length(s2) == 1) s2 <- c("",s2) n[i] <- paste(s1[[1]], s2[[2]], sep=sep) } else { n[i] <- paste(n[i],collapse=",") } a <- attributes(n) n <- n[-i[-1]] attributes(n) <- a n } which2 <- function(x) { i <- which(x)-1 j <- floor(i/nrow(x)) i <- i - j*nrow(x) cbind(i+1,j+1) } which.max2 <- function(x) { i <- which.max(x)-1 j <- floor(i/nrow(x)) i <- i - j*nrow(x) c(i+1,j+1) } which.min2 <- function(x) { i <- which.min(x)-1 j <- floor(i/nrow(x)) i <- i - j*nrow(x) c(i+1,j+1) } zip <- function(a,b,fun="*") { fun <- match.fun(fun) r <- lapply(1:length(a), function(i) fun(a[[i]],b[[i]])) names(r) <- names(a) r } accumulate <- function(lst,fun="+") { fun <- match.fun(fun) r <- lst[[1]] for(i in 2:length(lst)) { #r <- r + lst[[i]] r <- fun(r,lst[[i]]) } r } # returns split(x,f) where f is a random factor # if p is a single number then f has two factors in the proportion (p,1-p) rsplit <- function(x,p) { if(is.data.frame(x)) n <- nrow(x) else n <- length(x) n1 <- ceiling(p*n) i <- c(rep(1,n1),rep(2,n-n1)) split(x,sample(i)) } is.dim.ordered <- function(d) { # dimensions are ordered by default a <- attr(d,"ordered") is.null(a) || a } dim.ordered <- function(x) { dn = dimnames(x) # never return NULL if(is.null(dn)) dn = dim(x) sapply(dn,is.dim.ordered) } set.dim.ordered <- function(x,d=seq(dim(x)),v=T) { for(i in d) { attr(dimnames(x)[[i]],"ordered") <- v } x } set.dim.unordered <- function(...) set.dim.ordered(...,v=F) # convert from interval to range notation interval2range <- function(x) { x <- unlist(strsplit(x,",")) x[1] <- substr(x[1],2,nchar(x[1])) x[1] <- format(as.numeric(x[1])) if(x[1] == "-Inf") x[1] <- "." x[2] <- substr(x[2],1,nchar(x[2])-1) x[2] <- format(as.numeric(x[2])) if(x[2] == "Inf") x[2] <- "." paste(x[1],x[2],sep="-") } # used by color.plot and predict.plot given rcut <- function(x,b) { f <- cut(x,b,include.lowest=T) levels(f) <- sapply(levels(f),interval2range) f } gradient <- function(x) { n <- length(x) g <- (diff(x[1:(n-1)])+diff(x[2:n]))/2 g = c(x[2]-x[1],g,x[n]-x[n-1]) names(g) = names(x) g } scale.range <- function(x) { (x-min(x,na.rm=T))/diff(range(x,na.rm=T)) } cut.quantile <- function(x,n=3,suffix="") { #if(missing(suffix)) suffix = deparse(substitute(x)) #x[is.na(x)] <- 0 b <- quantile(unique(x),probs=seq(0,1,len=n+1),na.rm=T) if(length(suffix) == 1) { if(n == 5) labels <- c("low","med.low","med","med.high","high") else if(n == 4) labels <- c("low","med.low","med.high","high") else if(n == 3) labels <- c("low","med","high") else if(n == 2) labels <- c("low","high") else labels <- paste("cut",1:n,sep="") labels = paste(labels,suffix) } else labels = suffix cut(x,b,include=T,labels=labels) } indicators.factor <- function(y) { # convert a factor into a matrix of indicators # result is level by case r <- array(0,c(length(levels(y)),length(y)),list(levels(y),names(y))) for(lev in levels(y)) r[lev,y==lev] <- 1 r } reorder.rows <- function(y,i) { # indexes a matrix while preserving attributes if(is.na(nrow(y))) { y[i] } else { a <- attributes(y); y <- y[i,] if(!is.null(a$dimnames[[1]])) a$dimnames[[1]] <- a$dimnames[[1]][i] attributes(y) <- a y } } reorder.cols <- function(y,i) { # indexes a matrix while preserving attributes if(is.na(ncol(y))) { y[i] } else { a <- attributes(y); y <- y[,i] if(!is.null(a$dimnames[[2]])) a$dimnames[[2]] <- a$dimnames[[2]][i] attributes(y) <- a y } } ############################################################################## # Graphics tools auto.layout <- function(len) { layout <- c(1,len) din <- par("din") asp <- din[2]/din[1] if(asp > 1) { layout[2] <- round(sqrt(len/asp)) layout[1] <- ceiling(len/layout[2]) } else { layout[1] = round(sqrt(len*asp)) layout[2] = ceiling(len/layout[1]) } layout } auto.legend <- function(labels, col=1, lty, pch, text.col, ...) { # barplot could use an auto.legend usr <- par("usr") top <- usr[4] left <- usr[2]-max(strwidth(labels))-xinch(0.04) y <- cumsum(strheight(labels)+yinch(0.04)) if(missing(lty) && missing(pch)) { text.only <- T if(missing(text.col)) text.col <- col } for(i in 1:length(labels)) { text(left,top-y[i],labels[i],col=text.col[i],adj=0,...) } } auto.mar <- function(main="",xlab="x",ylab="y", xaxt=par("xaxt"),yaxt=par("yaxt"),las=par("las"),...) { mar = c(4.5,4,0,0.1) if(!is.null(main) && main != "") mar[3] = 1 if(is.null(xlab) || xlab == "") { if(xaxt == "n") mar[1] = 0.05 else mar[1] = 2.5 } if(is.null(ylab) || ylab == "") { if(yaxt == "n") mar[2] = 0 else mar[2] = 2 } if(las == 3 || las == 2) mar[1] = mar[1]*1.5 if(las == 1 || las == 2) mar[2] = mar[2]*1.5 mar } ############################################################################## # matlab compatibility rep.mat <- function(x,n,m) { array(rep(x,n*m),c(length(x)*n,m)) } rep.col <- function(x,n) { rep.mat(x,1,n) } rep.row <- function(x,n) { t(rep.col(x,n)) } eye <- function(n) { diag(rep(1,n)) } norm <- function(x) { sqrt(sum(x^2)) } # behaves like matlab "sum" dim.sum <- function(x,dim) { apply(x,setdiff(1:length(dim(x)),dim),sum) } scale.cols <- function(x,s) { x * rep.row(s,nrow(x)) } scale.rows <- function(x,s) { x * rep.col(s,ncol(x)) } ############################################################################## # data.frames hist.data.frame <- function(x,breaks,layout,...) { if(missing(layout)) layout <- auto.layout(length(x)) opar <- par(mfrow=layout) on.exit(par(opar)) for(i in names(x)) { if(missing(breaks)) breaks <- max(2,nclass.Sturges(x[[i]])) hist(x[[i]],xlab=i,main="",breaks=breaks,...) } } # sort the rows of a data frame (df) according to column f # f can also be a vector with length(f)==nrow(df) sort.data.frame <- function(df,f=ncol(df),...) { if(length(f) == 1) f <- df[[f]] else if(length(f) != nrow(df)) stop("length of sort vector doesn't match nrows") df[order(f,...),,drop=F] } sort.cells <- function(x) { sort.data.frame(as.data.frame(x)) } empty.data.frame <- function(col.names) { if(missing(col.names)) y <- NULL else { y <- sapply(col.names,function(x) NULL) } structure(y,class="data.frame") } rbind.extend <- function(df,df2) { # like rbind, but allows different numbers of columns (NAs inserted) x <- list() col.names <- unique(c(names(df),names(df2))) for(j in 1:length(col.names)) { k <- col.names[j] # must check in names first to avoid abbreviation match if(k %in% names(df)) v1 <- df[[k]] else v1 <- factor(rep(NA,nrow(df))) if(k %in% names(df2)) v2 <- df2[[k]] else v2 <- factor(rep(NA,nrow(df2))) # requires cfactor # force dispatch on second argument too x[[j]] <- cfactor(v1,v2) } names(x) <- col.names row.names <- make.unique(c(rownames(df),rownames(df2))) attr(x,"row.names") <- row.names class(x) <- "data.frame" x } rbind.extend <- function(df,df2) { if(nrow(df) == 0) return(df2) if(nrow(df2) == 0) return(df) not.1 <- setdiff(names(df2),names(df)) not.2 <- setdiff(names(df),names(df2)) if(length(not.1) > 0) df[not.1] <- rep(NA,nrow(df)) if(length(not.2) > 0) df2[not.2] <- rep(NA,nrow(df2)) col.names <- unique(c(names(df),names(df2))) x <- rbind(df[col.names],df2[col.names]) row.names <- make.unique(c(rownames(df),rownames(df2))) attr(x,"row.names") <- row.names x } cbind.extend <- function(df,df2) { # like cbind.data.frame, but pads with NA until rownames match if(is.null(df) || nrow(df) == 0) return(df2) if(is.null(df2) || nrow(df2) == 0) return(df) not.1 = setdiff(rownames(df2),rownames(df)) not.2 = setdiff(rownames(df),rownames(df2)) if(length(not.1) > 0) df[not.1,] = NA if(length(not.2) > 0) df2[not.2,] = NA r = cbind(df,df2[rownames(df),]) names(r) = c(names(df),names(df2)) r } # returns the name of the response variable # works for formulas, model frames, and fitted models response.var <- function(object) { if(is.null(object)) return(NULL) if(inherits(object, "terms")) { a <- attributes(object) if(!a$response) return(character(0)) return(as.character(a$variables[2])) } if(is.null(attr(object,"terms"))) { if(is.data.frame(object)) { # shortcut to avoid make.names return(names(object)[length(object)]) } if(is.table(object)) return(response.var(terms(object))) #if(is.list(object) || is.vector(object) || is.array(object)) return(NULL) } response.var(terms(object)) } # returns only the predictors which appear as bare terms predictor.vars <- function(object) { if(inherits(object, "terms")) { a <- attributes(object) pred = a$term.labels[a$order == 1] # drop I() terms pred = pred[substring(pred,1,2) != "I("] return(pred) } if(inherits(object,"data.frame") && is.null(attr(object,"terms"))) { # shortcut to avoid make.names return(names(object)[-length(object)]) } predictor.vars(terms(object)) } # returns all terms on the rhs, including higher-order terms predictor.terms <- function(object) { attr(terms(object),"term.labels") } as.data.frame.col <- function(x,n="V1") { frame = as.data.frame(as.matrix(x)) names(frame) = n frame } as.data.frame.row <- function(x,row.name="") { extra.args <- list(check.names=F,row.names=row.name) do.call("data.frame",append(as.list(x),extra.args)) } nocheck.data.frame <- function(...) { # must be done this way to prevent check.names (bug in data.frame) do.call("data.frame",append(as.list(...),list(check.names=F))) } apply.df <- function(x,fun) { y = lapply(x,fun) if(length(y[[1]]) == length(x[[1]])) nocheck.data.frame(y,row.names=rownames(x),check.names=F) else nocheck.data.frame(y,check.names=F) } ############################################################################## # bug fixes aggregate.data.frame <- function(x, by, FUN, ...) { if(!is.data.frame(x)) x <- as.data.frame(x) if(!is.list(by)) stop("`by' must be a list") if(is.null(names(by))) names(by) <- paste("Group", seq(along = by), sep = ".") else { nam <- names(by) ind <- which(nchar(nam) == 0) names(by)[ind] <- paste("Group", ind, sep = ".") } y <- lapply(x, tapply, by, FUN, ..., simplify = FALSE) if(any(sapply(unlist(y, recursive = FALSE), length) > 1)) stop("`FUN' must always return a scalar") z <- y[[1]] d <- dim(z) w <- list() for (i in seq(along = d)) { j <- rep(rep(seq(1 : d[i]), prod(d[seq(length = i - 1)]) * rep(1, d[i])), prod(d[seq(from = i + 1, length = length(d) - i)])) # minka: preserve original levels f <- factor(dimnames(z)[[i]][j],levels=levels(by[[i]])) w[[names(by)[i]]] = f } #w <- w[which(!unlist(lapply(z, is.null))), ] y <- data.frame(w, lapply(y, unlist, use.names = FALSE)) #names(y) <- c(names(by), names(x)) y } # concatenate factors (should be built in) # unfortunately "sort" requires the original c(), which drops labels cfactor <- function(f1,f2=NULL) { if(length(f1) == 0) return(f2) else if(length(f2) == 0) return(f1) if(!is.factor(f1) || !is.factor(f2)) return(c(f1,f2)) all.levs <- unique(c(levels(f1),levels(f2))) factor(c(as.character(f1),as.character(f2)),levels=all.levs) } # desiderata for make.unique: # 1. if A is unique, make.unique(c(A,B)) preserves A # 2. make.unique(c(A,B)) = make.unique(c(make.unique(A),B)) # internal version # does not satisfy desideratum #2 # make.unique(c("a","a","a")) != make.unique(c(make.unique(c("a","a")),"a")) make.unique <- function(names) { # names is character vector if(mode(names) != "character") stop("names must be a character vector") while(any(dups <- duplicated(names))) { names[dups] <- paste(names[dups], seq(length = sum(dups)), sep = "") } names } # satifies both desiderata # make.unique(c("a","a","a.2","a")) == # make.unique(c(make.unique(c("a","a")),"a.2","a")) make.unique <- function(names,sep=".") { # names is character vector if(mode(names) != "character") stop("names must be a character vector") repeat { i <- which(duplicated(names)) if(length(i) == 0) break # loop duplicates for(j in i) { cnt <- 2 repeat { newnam <- paste(names[j],cnt,sep=sep) # compare to previous elements only if(!any(is.element(newnam,names[1:j]))) break cnt <- cnt + 1 } names[j] <- newnam } } names } make.names <- function(names, unique=F) { names <- .Internal(make.names(as.character(names))) # minka: change keywords i <- is.element(names, c("for","while","repeat","if","else","function")) if(any(i)) names[i] <- paste(names[i],".",sep="") if(unique) names <- make.unique(names) names } # does NOT throw an error if no terms terms.default <- function(x,...) x$terms terms.data.frame <- function(x,env=parent.frame(),...) { fmla <- attr(x,"terms") if(is.null(fmla)) { # minka: assume the last column is the response nm <- make.names(names(x)) if(length(nm) > 1) { lhs <- nm[length(nm)] rhs <- nm[-length(nm)] } else { lhs <- NULL rhs <- nm } fmla <- terms(formula(paste(lhs,"~",paste(rhs,collapse="+")),env=env,...)) } fmla } formula.data.frame <- function(x,env=parent.frame(),...) { formula(terms(x,env=env),...) } terms.table <- function(x,...) { terms.data.frame(as.data.frame(x),...) } formula.default <- function (x,env=parent.frame(), ...) { if (!is.null(x$formula)) eval(x$formula) else if (!is.null(x$call$formula)) eval(x$call$formula) # minka: always return formula, not terms else if (!is.null(x$terms)) formula.terms(x$terms) else if (!is.null(attr(x, "formula"))) attr(x, "formula") else {form<-switch(mode(x), NULL = structure(NULL, class = "formula"), character = formula(eval(parse(text = x)[[1]])), call = eval(x), stop("invalid formula")) environment(form)<-env form } } update.default <- function (object, formula., ..., evaluate = TRUE) { call <- object$call if (is.null(call)) stop("need an object with call component") extras <- match.call(expand.dots = FALSE)$... if (!missing(formula.)) call$formula <- update.formula(formula(object), formula.) if(length(extras) > 0) { existing <- !is.na(match(names(extras), names(call))) ## do these individually to allow NULL to remove entries. for (a in names(extras)[existing]) call[[a]] <- extras[[a]] if(any(!existing)) { call <- c(as.list(call), extras[!existing]) call <- as.call(call) } } if(evaluate) { # minka: use environment of formula instead of parent.frame env<-environment(call$formula) if (is.null(env)) env<-parent.frame() eval(call,env) } else call } # should be built into R sum.data.frame <- function(x,...) sapply(x,function(y) sum(y,...)) mean.data.frame <- function(x,...) sapply(x,function(y) mean(y,...)) plot.default <- function(x, y=NULL, type="p", xlim=NULL, ylim=NULL, log="", main=NULL, sub=NULL, xlab=NULL, ylab=NULL, ann=par("ann"), axes=TRUE, frame.plot=axes, panel.first=NULL, panel.last=NULL, col=par("col"), bg=NA, pch=par("pch"), cex = 1, lty=par("lty"), lab=par("lab"), lwd=par("lwd"), asp=NA, ...) { xlabel <- if (!missing(x)) deparse(substitute(x)) ylabel <- if (!missing(y)) deparse(substitute(y)) # minka # treat character as factor if(mode(y) == "character") y <- factor(y,levels=unique(x)) if(mode(x) == "character") x <- factor(x,levels=unique(x)) # this works even if x and y are factors xy <- xy.coords(x, y, xlabel, ylabel, log) if(all(is.na(xy$x))) xy$x <- 1:length(x) xlab <- if (is.null(xlab)) xy$xlab else xlab ylab <- if (is.null(ylab)) xy$ylab else ylab xlim <- if (is.null(xlim)) range(xy$x[is.finite(xy$x)]) else xlim ylim <- if (is.null(ylim)) range(xy$y[is.finite(xy$y)]) else ylim plot.new() plot.window(xlim, ylim, log, asp, ...) panel.first plot.xy(xy, type, col=col, pch=pch, cex=cex, bg=bg, lty=lty, lwd=lwd, ...) panel.last if (axes) { # minka if(is.factor(x)) { lev <- levels(x) axis(1, at=1:length(lev), labels=lev, ...) } else axis(1, ...) # minka if(is.factor(y)) { lev <- levels(y) axis(2, at=1:length(lev), labels=lev, ...) } else axis(2, ...) } if (frame.plot) box(...) if (ann) title(main=main, sub=sub, xlab=xlab, ylab=ylab, ...) invisible() } model.frame.multinom <- function(object) { oc <- object$call oc[[1]] <- NULL do.call("model.frame",as.list(oc)) } model.frame.glm <- function (object, data, na.action, ...) { if (is.null(object$model)) { fcall <- object$call fcall$method <- "model.frame" #fcall[[1]] <- as.name("glm") # minka if (!is.null(object$terms)) env <- environment(object$terms) else env <- environment(fcall$formula) if (is.null(env)) env <- parent.frame() eval(fcall, env) } else object$model } model.frame.nnet <- model.frame.glm # missing from modreg model.frame.loess <- model.frame.glm barplot.default <- function(height, width = 1, space = NULL, names.arg = NULL, legend.text = NULL, beside = FALSE, horiz = FALSE, density = NULL, angle = 45, col = heat.colors(NR), border = par("fg"), main = NULL, sub = NULL, xlab = NULL, ylab = NULL, xlim = NULL, ylim = NULL, xpd = TRUE, axes = TRUE, axisnames = TRUE, cex.axis = par("cex.axis"), cex.names = par("cex.axis"), inside = TRUE, plot = TRUE, ...) { if (!missing(inside)) .NotYetUsed("inside", error = FALSE) if (!missing(border)) .NotYetUsed("border", error = FALSE) if (missing(space)) space <- if (is.matrix(height) && beside) c(0, 1) else 0.2 space <- space * mean(width) if (plot && axisnames && missing(names.arg)) names.arg <- if(is.matrix(height)) colnames(height) else names(height) if (is.vector(height)) { height <- cbind(height) beside <- TRUE } else if (is.array(height) && (length(dim(height)) == 1)) { height <- rbind(height) beside <- TRUE } else if (!is.matrix(height)) stop("`height' must be a vector or a matrix") if(is.logical(legend.text)) { if(legend.text && is.matrix(height)) legend.text <- rownames(height) else legend.text <- NULL } NR <- nrow(height) NC <- ncol(height) if (beside) { if (length(space) == 2) space <- rep(c(space[2], rep(space[1], NR - 1)), NC) width <- rep(width, length = NR * NC) } else { width <- rep(width, length = NC) height <- rbind(0, apply(height, 2, cumsum)) } delta <- width / 2 w.r <- cumsum(space + width) w.m <- w.r - delta w.l <- w.m - delta if (horiz) { if (missing(xlim)) xlim <- range(-0.01 * height, height, na.rm=TRUE) if (missing(ylim)) ylim <- c(min(w.l), max(w.r)) } else { if (missing(xlim)) xlim <- c(min(w.l), max(w.r)) if (missing(ylim)) ylim <- range(-0.01 * height, height, na.rm=TRUE) } if (beside) w.m <- matrix(w.m, nc = NC) if(plot) { ##-------- Plotting : # minka: removed xaxs="i",xpd nonsense plot.new() plot.window(xlim, ylim, ...) # Beware : angle and density are passed using R scoping rules xyrect <- function(x1,y1, x2,y2, horizontal = TRUE, ...) { if(horizontal) rect(x1,y1, x2,y2, angle = angle, density = density, ...) else rect(y1,x1, y2,x2, angle = angle, density = density, ...) } if (beside) { xyrect(0, w.l, c(height), w.r, horizontal=horiz, col = col) } else { for (i in 1:NC) { xyrect(height[1:NR, i], w.l[i], height[-1, i], w.r[i], horizontal=horiz, col = col) } } if (axisnames && !is.null(names.arg)) { # specified or from {col}names at.l <- if (length(names.arg) != length(w.m)) { if (length(names.arg) == NC) # i.e. beside (!) colMeans(w.m) else stop("incorrect number of names") } else w.m axis(if(horiz) 2 else 1, at = at.l, labels = names.arg, lty = 0, cex.axis = cex.names, ...) } if(!is.null(legend.text)) { legend.col <- rep(col, length = length(legend.text)) if((horiz & beside) || (!horiz & !beside)){ legend.text <- rev(legend.text) legend.col <- rev(legend.col) density <- rev(density) angle <- rev(angle) } xy <- par("usr") legend(xy[2] - xinch(0.1), xy[4] - yinch(0.1), legend = legend.text, angle = angle, density = density, fill = legend.col, xjust = 1, yjust = 1) } title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...) if(axes) axis(if(horiz) 1 else 2, cex.axis = cex.axis, ...) invisible(w.m) } else w.m } # Routines for contingency tables # Tom Minka barplot.table <- function(m,col=default.colors.w(nrow(m)), xlab,main, ylab=response.var(m),legend.text=T,...) { nam <- names(dimnames(m)) opar <- par(mar=auto.mar(main=nam[1])) on.exit(par(opar)) if(missing(xlab)) xlab = nam[2] if(missing(main)) main = nam[1] barplot.default(m,legend.text=legend.text,col=col,..., xlab=xlab,ylab=ylab,main=main) } flipud <- function(m) { m[nrow(m):1,] } fliplr <- function(m) { m[,ncol(m):1] } image.table <- function(m,col=YR.colors(64),bg=gray(0.5),...) { m <- flipud(m) b = break.quantile(as.vector(m),length(col)) #b = break.equal(as.vector(m),length(col)) opar = par(bg=bg) on.exit(par(opar)) image.default(t(m),axes=F,col=col,breaks=b,...) box() # axis 1 is horizontal if(is.null(colnames(m))) colnames(m) = 1:ncol(m) if(is.null(rownames(m))) rownames(m) = 1:nrow(m) axis(1,at=seq(0,1,len=ncol(m)),labels=colnames(m)) # axis 2 is vertical axis(2,at=seq(0,1,len=nrow(m)),labels=rownames(m)) nam <- names(dimnames(m)) title(xlab=nam[2],ylab=nam[1]) } if(!exists('pie.default')) { pie.default = pie pie = function(x,...) UseMethod("pie") } # draws a pie for each column pie.table <- function(m,layout,col=default.colors.w(nrow(m)),...) { if(length(dim(m)) == 1) return(pie.default(m,col=col,...)) if(missing(layout)) layout <- auto.layout(ncol(m)) opar <- par(mfrow=layout,mar=c(1,1,1,0)*2) on.exit(par(opar)) nam <- names(dimnames(m)) for(i in 1:ncol(m)) { pie(m[,i],col=col,...) title(paste(nam[1],"\n",nam[2],"=",colnames(m)[i])) } } ############################################################################## # only for 2D tables right now sort.table <- function(x,k=NULL) { if(!is.null(k)) { if(k == 1) { # sort rows only n <- margin.table(x,k)+1e-15 v2 <- 1:ncol(x) v1 <- drop(x %*% v2)/n return(x[order(v1),]) } if(k == 2) { # sort cols only n <- margin.table(x,k)+1e-15 v1 <- 1:nrow(x) v2 <- drop(v1 %*% x)/n return(x[,order(v2)]) } } n1 <- margin.table(x,1) n2 <- margin.table(x,2) n <- sum(n1) if(n == 0) return(x) # iterate v2 <- rnorm(ncol(x)) iter <- 1 repeat { old.v2 <- v2 # update rows # add eps in case there are zeros v1 <- drop(x %*% v2)/(n1+eps) # shift to zero mean v1 <- v1 - sum(v1*n1)/n # scale to unit sd v1 <- v1/sqrt(sum(v1^2*n1)/n) # update cols v2 <- drop(v1 %*% x)/(n2+eps) if(max(abs(v2 - old.v2)) < 1e-8) break iter <- iter + 1 } # remove sign ambiguity nz <- which(n1 > 0)[1] s <- sign(v1[nz]) v1 <- v1*s v2 <- v2*s # empty rows/cols get placed at end z1 <- which(n1 == 0) z2 <- which(n2 == 0) v1[z1] <- 1e10 v2[z2] <- 1e10 i1 <- order(v1) i2 <- order(v2) y <- x[i1,i2] class(y) <- class(x) y #list(v1=v1,v2=v2) } indep.fit <- function(x) { ## Compute the expected cell counts under the independence model. loglin(x, as.list(1:length(dim(x))), fit = TRUE, print = FALSE)$fit } # returns top k associations in table x # uses sort.cells, sort.data.frame, rbind.extend mine.associations <- function(x,top=10,targets=NULL,z=1) { if(!inherits(x,"class")) x <- as.table(x) dm <- dim(x) nd <- length(dm) dn <- attr(dimnames(x),"names") if(is.null(targets)) targets <- 1:nd else if(is.character(targets)) { targets <- pmatch(targets,dn) if(any(is.na(targets))) stop("unrecognized target") } not.targets <- setdiff(1:nd, targets) # create a data frame with 0 rows res <- empty.data.frame(c("Lift",dn)) # loop rows for(i in targets) { predictors <- c(not.targets, targets[targets > i]) for(j in predictors) { y <- margin.table(x, c(i,j)) df <- sort.cells((y - z*sqrt(y))/(indep.fit(y)+eps)) names(df)[3] <- "Lift" # take top k lifts if(nrow(df) > top) df <- df[nrow(df)+1-(top:1),] # extend cols for(v in 1:nd) { if(v != i && v != j) df[[dn[v]]] <- factor(rep(NA,nrow(df))) } res <- rbind.extend(res,df) # ensure unique rownames #rownames(res) <- nrow(res):1 } } res <- sort.data.frame(res,1) if(nrow(res) > top) res <- res[nrow(res)+1-(top:1),] rownames(res) <- nrow(res):1 res } ############################################################################## # merging # Returns Pearson's chi-square statistic for testing whether two samples come # from the same population. # n1 and n2 are count vectors (same length) homogeneity <- function(n1,n2) { # most time is spent in this function so it must be efficient if(sum(n1)==0 || sum(n2)==0) return(0) n12 <- n1+n2 e1 <- n12*sum(n1)/sum(n12) e2 <- n12*sum(n2)/sum(n12) # chisq approx sum((n1 - e1)^2/(e1+eps)) + sum((n2 - e2)^2/(e2+eps)) } js.divergence <- function(n1,n2) { if(sum(n1)==0 || sum(n2)==0) return(0) n12 <- n1+n2 p1 <- n1/(sum(n1)+eps) p2 <- n2/(sum(n2)+eps) p12 <- n12/(sum(n12)+eps) i <- (p1 > 0) g <- sum(p1[i]*log(p1[i]/p12[i]))*sum(n1) i <- (p2 > 0) g <- g + sum(p2[i]*log(p2[i]/p12[i]))*sum(n2) g } # returns a matrix of merging costs for the values along dimension v merge.table.cost <- function(x,v,cost=homogeneity) { ordered <- dim.ordered(x)[[v]] # make v the first dimension p <- 1:length(dim(x)) p[1] <- v p[v] <- 1 x <- aperm(x,p) d <- dim(x) # collapse all other dimensions dim(x) <- c(d[1], prod(d[2:length(d)])) if(ordered) { # compute cost between all adjacent values s <- 0 for(i in 1:(d[1]-1)) { s[i] <- cost(x[i,],x[i+1,]) } } else { # compute cost between all value pairs s <- array(0,c(d[1],d[1])) for(i in 1:nrow(s)) { for(j in 1:ncol(s)) { if(i == j) s[i,j] <- Inf else s[i,j] <- cost(x[i,],x[j,]) } } } s } test.merge.table.cost <- function(cost=homogeneity) { x <- array(c(0,0,1,0,1,1),c(3,2)) i <- rep(1:3,rep(2,3)) x <- x[i,] merge.table.cost(x,1,cost) } # x is a table # returns a new table with values i and j along dimension v merged merge.table.cells <- function(x,v,i,j) { p <- 1:length(dim(x)) p[1] <- v p[v] <- 1 x <- aperm(x,p) dn <- dimnames(x) d <- dim(x) dim(x) <- c(d[1], prod(d[2:length(d)])) x[i,] <- x[i,]+x[j,] x <- x[-j,] d[1] <- d[1] - 1 dim(x) <- d dn[[1]] <- merge.names(dn[[1]],c(i,j)) dimnames(x) <- dn # aperm removes the class attribute x <- aperm(x,p) class(x) <- "table" x } # cost is the merging cost function to use (default homogeneity) # trace is the max number of costs to report in the merging trace # # if trees are desired, returns list(table=x,trees=t) # x is the merged table # t is a list of length(ds) trees, or one tree if length(ds)==1 merge.table <- function(x, bins=rep(2,length(ds)), ds=1:length(dim(x)), cost=homogeneity, trace=12) { if(length(bins) != length(ds)) { stop("must specify the desired number of bins for each dimension being merged") } dn <- attr(dimnames(x),"names") if(is.character(ds)) { # convert ds to integers if(is.null(dn)) stop("table doesn't have category names") ds <- pmatch(ds, dn) if(any(is.na(ds))) stop("unrecognized variable") } d <- dim(x) if(F) { # initialize trees hs <- list() for(k in 1:length(ds)) { dk <- d[ds[k]] hs[[k]] <- list(merge=matrix(0,dk-1,2),height=numeric(0), order=1:dk,labels=-(1:dk),var=dn[k]) } } # which.dim[i] is the dimension which was merged at step i which.dim <- numeric(0) costs <- numeric(0) nbins <- character(0) # main loop dim.cost <- array(0,c(length(ds),1)) dim.merge <- list() iter <- 1 # has desired table been found? desired <- F repeat { d <- dim(x) # score all dimensions for(k in 1:length(ds)) { dk <- d[ds[k]] # can we make this dimension any smaller? if(dk > bins[k]) { s <- merge.table.cost(x,ds[k],cost) if(is.null(dim(s))) { # ordered case # s is a vector dim.cost[k] <- min(s) i <- which.min(s) dim.merge[[k]] <- c(i,i+1) } else { # unordered case # s is a matrix dim.cost[k] <- min(s) dim.merge[[k]] <- which.min2(s) } #cat("dim",ds[k],":",dim.cost[k],"\n") } else { dim.cost[k] <- Inf } } k <- which.min(dim.cost) if(length(k) == 0) { if(!desired) { # desired table size has been reached. desired <- T cat("total cost =",sum(costs),"\n") if(trace) { # if a full trace is wanted, keep going until 1 bin. desired.x <- x bins <- rep(1,length(ds)) } else break } else { x <- desired.x break } } else { # merge i <- dim.merge[[k]][1] j <- dim.merge[[k]][2] #print(paste("merging value", i, "of dimension",ds[k])) if(!desired) { cat("merging",dn[ds[k]],"=",dimnames(x)[[ds[k]]][i], "and",dimnames(x)[[ds[k]]][j],"\n") } x <- merge.table.cells(x,ds[k],i,j) # record which.dim[iter] <- k costs[iter] <- dim.cost[k] nbins[iter] <- paste(dim(x)[ds],collapse="x") if(F) { len <- length(hs[[k]]$height) hs[[k]]$height[len+1] <- dim.cost[k] hs[[k]]$merge[len+1,1] <- hs[[k]]$labels[i] hs[[k]]$merge[len+1,2] <- hs[[k]]$labels[j] hs[[k]]$labels[i] <- len+1 hs[[k]]$labels <- hs[[k]]$labels[-j] } iter <- iter + 1 } } #cat("merge order:", dn[ds[which.dim]], "\n") if(F) { for(k in 1:length(hc)) { hs[[k]]$height <- cumsum(hs[[k]]$height) hs[[k]]$labels <- NULL } } if(trace) { # show trace of merging cost if(length(costs) > trace) { costs <- rev(costs) costs <- costs[1:trace] costs <- rev(costs) nbins <- rev(nbins) nbins <- nbins[1:trace] nbins <- rev(nbins) } par(mai=c(1,1,0.5,0.25)) dotchart(costs,labels=nbins,xlab="merging cost",ylab="nbins") } x } # Geometry functions # rest are in maps # result is d[i,j] = squared Euclidean distance betw x[i,] and y[j,] sqdist <- function(x,y=x) { x <- as.matrix(x) y <- as.matrix(y) xmag <- rowSums(x * x) ymag <- rowSums(y * y) rep.row(ymag, nrow(x)) + rep.col(xmag, nrow(y)) - 2*(x %*% t(y)) } nearest.points <- function(a,b) { # for each point (row) in matrix A, returns the index of the closest point # in matrix B, as well as the distance to that closest point. # if B is omitted, it is assumed to be the same as A, but where a # point cannot match itself. b.which = numeric(nrow(a)) b.dist = numeric(nrow(a)) block.a = 100 i = 1 while(i <= nrow(a)) { ind = i:min(i+block.a,nrow(a)) if(missing(b)) { d = sqdist(a[ind,],a) d[(ind-1)*length(ind) + ind] = Inf } else { d = sqdist(a[ind,],b) } b.which[ind] = apply(d,1,which.min) b.dist[ind] = d[ind + (b.which[ind]-1)*nrow(d)] i = i + block.a } list(which=b.which,dist=b.dist) } distances <- function(x,fun) { if(missing(fun)) sqdist(x) else { n <- nrow(x) d <- array(NA,c(n,n),list(rownames(x),rownames(x))) for(i in 1:n) { for(j in 1:n) { d[i,j] <- fun(x[i,],x[j,]) } } d } } as.matrix.polygon <- function(p) { if(is.list(p) && !is.data.frame(p)) p <- cbind(p$x,p$y) p } as.list.polygon <- function(p) { if(is.matrix(p)) p <- data.frame(p,c("x","y")) p } # samples n points uniformly in p using rejection from bounding box # result is a matrix polygon.sample <- function(p,n) { r <- polygon.range(p) x <- NULL total.ok <- 0.9 total.tried <- 1 while(n > 0) { # m is the number of points to sample in order to have n in the polygon pct <- total.ok/total.tried m <- n + qnbinom(0.9, n, pct) xt <- cbind(runif(m,r$x[1],r$x[2]), runif(m,r$y[1],r$y[2])) # add the points which fell in the polygon to our list ok <- which(in.polygon(p,xt)) n.ok <- length(ok) if(length(ok) > n) ok <- ok[1:n] x <- rbind(x,xt[ok,]) #cat("got",n.ok,"points using pct=",pct,"\n") # update percentage and try again n <- n - n.ok total.ok <- total.ok + n.ok total.tried <- total.tried + nrow(xt) } x } # returns T if x is inside the polygon with given vertices # poly is a matrix where each row is a vertex # polygon is closed automatically # or a list of x and y (converted to a matrix) in.polygon <- function(poly,x) { poly <- as.matrix.polygon(poly) if(is.list(x) && !is.data.frame(x)) x <- cbind(x$x,x$y) if(is.vector(x)) in.polygon1(poly,x) num.above <- rep(0,nrow(x)) start <- 1 end <- nrow(poly) for(i in 1:nrow(poly)) { x1 <- poly[i,1] if(is.na(x1)) start <- i+1 else { if(i == nrow(poly) || is.na(poly[i+1,1])) { i2 <- start } else { i2 <- i+1 } x2 <- poly[i2,1] y1 <- poly[i,2] y2 <- poly[i2,2] xmax <- max(x1,x2) xmin <- min(x1,x2) # xi indexes the points within the line extent xi <- which((x[,1]>=xmin) & (x[,1]<=xmax)) # a is the height of the line at x[xi,1] a <- (x[xi,1]-x1)*(y2-y1)/(x2-x1) + y1 vert <- (x1==x2) a[vert] <- y1[vert] # j indexes the points below the line j <- (a>x[xi,2]) num.above[xi[j]] <- num.above[xi[j]] + 1 } } (num.above %% 2) == 1 } in.polygon1 <- function(poly,x) { x1 <- poly[,1] x2 <- c(poly[2:nrow(poly),1],poly[1,1]) y1 <- poly[,2] y2 <- c(poly[2:nrow(poly),2],poly[1,2]) breaks <- which(is.na(x1)) if(length(breaks) > 0) { starts <- c(1,breaks+1) ends <- c(breaks-1,nrow(poly)) x2[ends] <- x1[starts]; y2[ends] <- y1[starts] x1 <- x1[-breaks]; x2 <- x2[-breaks]; y1 <- y1[-breaks]; y2 <- y2[-breaks] } vert <- which(x1==x2) # a is the height of the lines at x[1] a <- (x[1]-x1)*(y2-y1)/(x2-x1) + y1 a[vert] <- y1[vert] xmin <- pmin(x1,x2) xmax <- pmax(x1,x2) # i indexes the lines above x i <- (a>x[2]) & (x[1] >= xmin) & (x[1] <= xmax) (sum(i) %% 2) == 1 } test.in.polygon <- function() { n <- 100 x <- cbind(runif(n),runif(n)) p <- cbind(runif(7),runif(7)) p <- p[chull(p),] # close the polygon p <- rbind(p,p[1,]) plot(x) points(p,col=2) polygon(p,border=2) if(F) { for(i in 1:nrow(x)) { if(in.polygon1(p,x[i,])) points(x[i,,drop=F],col=3) } } else { points(x[in.polygon(p,x),],col=3) } } # Routines for manipulating directed graphs # Tom Minka 10/16/01 # returns a graph with n nodes and no edges # n is an integer or a vector of node names graph <- function(n) { if(length(n) == 1) n <- 1:n # a graph is a list of vectors g <- sapply(n, function(x) numeric(0)) names(g) <- as.character(n) g } # adds an edge from i to j # returns new graph add.edge <- function(g,i,j) { # i and j can be names or numbers if(is.character(j)) j <- pmatch(j,names(g)) # node vector contains node indices, not names # this allows node names to be changed by the user via names(g) g[[i]] <- union(g[[i]],j) g } has.edge <- function(g,i,j) { if(is.character(j)) j <- pmatch(j,names(g)) j %in% g[[i]] } from <- function(g,i) { # i is a name or index # result is indices g[[i]] } is.leaf <- function(g,i) { length(g[[i]]) == 0 } leaves <- function(g,i=1) { # returns a vector of leaf indices child <- from(g,i) if(length(child) == 0) { i } else { r <- c() for(j in child) { r <- c(r,leaves(g,j)) } r } } # returns a directed graph with edges from parents to children as.graph.tree <- function(tr) { frame <- tr$frame g <- graph(rownames(frame)) p <- parents.tree(tr) for(i in 1:nrow(frame)) { if(!is.na(p[i])) g <- add.edge(g,p[i],i) } g } plot.graph.tree <- function(tr,digits=1,abbrev=F,...) { g <- as.graph.tree(tr) frame <- tr$frame labels <- as.character(frame$var) i <- (labels == "") labels[i] <- paste(response.var(tr),"=",format(frame$yval[i],digits=digits)) if(!is.null(frame$yprob)) { f = indicators.factor(frame$yval[i]) p = rowSums(frame$yprob[i,] * t(f)) labels[i] = paste(labels[i]," (",as.character(signif(100*p,2)),"%)",sep="") } if(T) { labels[i] <- paste(paste("(",as.character(frame$n[i]),")",sep=""), labels[i]) } #labels[i] <- "?" #labels[1:length(labels)] <- "?" p <- parents.tree(tr) left <- is.left.child(tr) xlevels <- attr(tr,"xlevels") if(abbrev) xlevels <- lapply(xlevels, abbreviate, minlength=abbrev) # clean up the split labels for(i in 1:nrow(frame)) { splits <- frame$splits[i,] splits1 <- substring(splits,1,1) splits2 <- substring(splits,2) if(splits1[1] == "<") { # numeric split x <- format(as.numeric(splits2),digits=digits) frame$splits[i,] <- paste(splits1,x,sep="") } else if(splits[1] != "") { # categorical split lev <- xlevels[[labels[[i]]]] for(j in 1:2) { nc <- nchar(splits2[j]) sh <- substring(splits2[j],1:nc,1:nc) frame$splits[i,j] <- paste(lev[match(sh,letters)],collapse=",") } } } branch.labels <- c() for(i in 1:length(g)) { if(!is.na(p[i])) { branch.labels[[i]] <- if(left[i]) as.character(frame$splits[p[i],1]) else as.character(frame$splits[p[i],2]) } } #branch.labels[1:length(branch.labels)] <- "?" plot.graph(g,labels=labels,branch.labels=branch.labels,...) } root.of.graph <- function(g) { # the root is the first node with no parents # result is an index, not a name setdiff(1:length(g),accumulate(g,union))[1] } sort.graph <- function(g) { depths <- node.depths(g,root.of.graph(g)) ord <- order(depths) n <- length(g) ranks <- rank.stable(depths) for(i in 1:length(g)) { g[[i]] <- ranks[g[[i]]] } g[ord] } as.graph.hclust <- function(hc) { n.leaves <- length(hc$labels) n.internal <- nrow(hc$merge) g <- graph(n.internal + n.leaves) i.internal <- 1:n.internal i.leaves <- n.internal + (1:n.leaves) names(g)[i.leaves] <- hc$labels for(i in 1:nrow(hc$merge)) { for(k in 1:2) { j <- hc$merge[i,k] if(j < 0) j <- hc$labels[-j] else j <- as.character(j) g <- add.edge(g,i,j) } } sort.graph(g) } node.depths <- function(g,i=1,depth=0,depths=c()) { child <- from(g,i) depths[[i]] <- depth for(j in child) { depths <- node.depths(g,j,depth+1,depths) } depths } bbox.graph <- function(g,i=1,heights=NULL,labels=NULL,branch.labels=NULL, space=c(1,1),inch.usr, cex=par("cex"),srt=90,orient=90,bbox=list()) { # returns c(w.usr,w.inch,h.usr,h.inch) # the meaning of "height" depends on the orientation of the plot # space is in user coordinates orient.h <- ((orient == 0) || (orient == 180)) srt.h <- (srt == 0) if(missing(inch.usr)) { pin <- par("pin") usr <- par("usr") inch.usr <- c(pin[1]/diff(usr[1:2]),pin[2]/diff(usr[3:4])) if(orient.h) inch.usr <- inch.usr[2:1] } lab <- labels[[i]] if(!is.null(lab)) { if(orient.h == srt.h) { if(orient < 135) lab <- paste(lab,"") else lab <- paste("",lab) } if(srt.h == orient.h) { lab.width <- strheight(lab, units="inches", cex=cex) lab.height <- strwidth(lab, units="inches", cex=cex) } else { lab.width <- strwidth(lab, units="inches", cex=cex) lab.height <- strheight(lab, units="inches", cex=cex) } } child <- from(g,i) if(length(child) == 0) { # leaf w.usr <- 0 w.inch <- 0 h.usr <- 0 h.inch <- 0 if(!is.null(lab)) { w.inch <- max(w.inch,lab.width) h.inch <- max(h.inch,lab.height) } } else { # not leaf w.usr <- 0 w.inch <- 0 h.usr <- 0 h.inch <- 0 height <- 0 if(!is.null(lab)) { h.inch <- lab.height height <- h.usr*inch.usr[2]+h.inch } for(j in child) { bbox <- bbox.graph(g,j,heights,labels,branch.labels,space,inch.usr, cex,srt,orient,bbox) w.usr <- w.usr + bbox[[j]][1] w.inch <- w.inch + bbox[[j]][2] h <- c(bbox[[j]][3]+space[2],bbox[[j]][4]) if(h[1]*inch.usr[2]+h[2] > height) { max.height <- h height <- h[1]*inch.usr[2]+h[2] } } w.usr <- w.usr + space[1]*(length(child)-1) h.usr <- max.height[1] h.inch <- max.height[2] } bbox[[i]] <- c(w.usr,w.inch,h.usr,h.inch) bbox } label.adj <- function(orient,srt) { if(orient == 270) { if(srt == 90) adj <- c(0,0.5) else adj <- c(0.5,-0.2) } else if(orient == 90) { if(srt == 90) adj <- c(1,0.5) else adj <- c(0.5,1) } else if(orient == 180) { if(srt == 90) adj <- c(0.5,1.2) else adj <- c(0,0.5) } else { if(srt == 90) adj <- c(0.5,-0.2) else adj <- c(1,0.5) } adj } draw.graph <- function(g,i=1,x=0,y=0, heights=NULL,labels=NULL,branch.labels=NULL, space=c(1,1), usr=par("usr"),inch.usr, cex=par("cex"),srt=90,orient=90,bbox,...) { orient.h <- ((orient == 0) || (orient == 180)) srt.h <- (srt == 0) if(missing(inch.usr)) { pin <- par("pin") inch.usr <- c(pin[1]/diff(usr[1:2]),pin[2]/diff(usr[3:4])) if(orient.h) inch.usr <- inch.usr[2:1] } if(missing(bbox)) { print(1) bbox <- bbox.graph(g,i,heights,labels,branch.labels, space,inch.usr,cex,srt,orient) } child <- from(g,i) lab <- labels[[i]] if(!is.null(lab)) { if(orient.h == srt.h) { if(orient < 135) lab <- paste(lab,"") else lab <- paste("",lab) } if(srt.h == orient.h) { lab.width <- strheight(lab, units="inches", cex=cex) lab.height <- strwidth(lab, units="inches", cex=cex) } else { lab.width <- strwidth(lab, units="inches", cex=cex) lab.height <- strheight(lab, units="inches", cex=cex) } } # label adjustment adj <- label.adj(orient,srt) # branch label adjustment branch.adj <- label.adj(if(orient.h) 270 else 0, srt) if(length(child) == 0) { # leaf if(!is.null(lab)) { if(orient.h) { text(y,x,lab,cex=cex,srt=srt,adj=adj,...) } else { text(x,y,lab,cex=cex,srt=srt,adj=adj,...) } } } else { # not leaf # xc is position of child w <- (bbox[[i]][1]*inch.usr[1] + bbox[[i]][2])/abs(inch.usr[1]) xc <- x - w/2 yc <- y - space[2] if(!is.null(lab)) { # draw node label if(length(child) == 2) { j <- child[1] w1 <- (bbox[[j]][1]*inch.usr[1] + bbox[[j]][2])/abs(inch.usr[1]) x.lab <- xc + w1/2 + w/4 + space[1]/4 # check for collision between node label and child if(lab.height > space[2]*abs(inch.usr[2])) { # place between the children x.lab <- xc + w1 + space[1]/2 } if(orient.h) { text(y,x.lab,lab,cex=cex,srt=srt,adj=adj,...) } else { text(x.lab,y,lab,cex=cex,srt=srt,adj=adj,...) } } else stop("don't know how to label with >2 children") } for(j in child) { w <- (bbox[[j]][1]*inch.usr[1] + bbox[[j]][2])/abs(inch.usr[1]) xc <- xc + w/2 # draw line to child if(orient.h) { segments(y,x,y,xc,...) segments(y,xc,yc,xc,...) } else { segments(x,y,xc,y,...) segments(xc,y,xc,yc,...) } # draw branch label lab <- branch.labels[[j]] if(!is.null(lab)) { y.lab <- (y+yc)/2 if(orient.h) { text(y.lab,xc,lab,cex=cex,srt=srt,adj=branch.adj,...) } else { text(xc,y.lab,lab,cex=cex,srt=srt,adj=branch.adj,...) } } #cat("node",i,"child",j,"at yc=",yc,"\n") draw.graph(g,j,xc,yc,heights,labels,branch.labels,space=space, usr,inch.usr,cex,srt,orient,bbox,...) xc <- xc + w/2 + space[1] } } } plot.graph <- function(g,labels=NULL,branch.labels=NULL,orient=180,srt=0, main="",...) { if(!match(srt,c(0,90))) stop("srt must be 0 or 90") if(!match(orient,c(0,90,180,270))) stop("orient must be 0, 90, 180, or 270") opar <- par(mar=c(0.05,0,if(main=="") 0 else 1,0.05)) on.exit(par(opar)) plot.new() pin <- par("pin") if(orient == 0 || orient == 180) pin <- pin[2:1] bbox <- bbox.graph(g,labels=labels,branch.labels=branch.labels, orient=orient,srt=srt,...) #print(bbox) w.usr <- bbox[[1]][1] w.inch <- bbox[[1]][2] h.usr <- bbox[[1]][3] h.inch <- bbox[[1]][4] w.frac <- w.inch/pin[1] w <- w.usr/(1 - w.frac) h.frac <- h.inch/pin[2] h <- h.usr/(1 - h.frac) if(orient == 270) { usr <- c(-w/2,w/2,0,-h) } else if(orient == 90) { usr <- c(-w/2,w/2,-h,0) } else if(orient == 180) { usr <- c(0,-h,-w/2,w/2) } else { usr <- c(-h,0,-w/2,w/2) } plot.window(usr[1:2],usr[3:4]) title(main) space <- c(sign(w),sign(h)) draw.graph(g,labels=labels,branch.labels=branch.labels, bbox=bbox,orient=orient,srt=srt,space=space, usr=usr,...) #rect(usr[1],usr[3],usr[2],usr[4]) } odds.ratios <- function(x) { # returns the log odds-ratio for all cells in the table n <- sum(x) cs <- rep.row(colSums(x),nrow(x)) rs <- rep.col(rowSums(x),ncol(x)) log(x/(rs-x)/(cs-x)*(n-rs-cs+x)) } odds.ratios.se <- function(x) { # returns the standard error of each log odds-ratio n <- sum(x) cs <- rep.row(colSums(x),nrow(x)) rs <- rep.col(rowSums(x),ncol(x)) sqrt(1/x + 1/(rs-x) + 1/(cs-x) + 1/(n-rs-cs+x)) } discriminate.node <- function(g,i=1,x) { k <- names(g)[leaves(g,i)] if(length(k) == 0) return("") not.k <- setdiff(rownames(x),k) if(length(not.k) == 0) return("") if(!is.character(i)) i <- names(g)[i] not.i <- paste("NOT",i) xp <- array(0,c(2,ncol(x)),list(c(i,not.i),colnames(x))) xp[1,] <- colSums(x[k,,drop=F]) xp[2,] <- colSums(x[not.k,,drop=F]) s <- score.features(xp,type="odds") f <- colnames(x)[which.max(s)] #print(subtable(xp,1,f)) f } discriminate.node <- function(g,i=1,x) { child <- from(g,i) if(length(child) < 2) return("") leaf1 <- names(g)[leaves(g,child[1])] leaf2 <- names(g)[leaves(g,child[2])] leaf3 <- setdiff(rownames(x),c(leaf1,leaf2)) if(length(leaf3) == 0) return("") xp <- array(0,c(3,ncol(x)),list(c(1,2,3),colnames(x))) xp[1,] <- colSums(x[leaf1,,drop=F]) xp[2,] <- colSums(x[leaf2,,drop=F]) xp[3,] <- colSums(x[leaf3,,drop=F]) xp <- xp + 1e-10*rep.col(rowSums(xp),ncol(xp)) s12 <- abs(odds.ratios(xp[1:2,])[1,]) # no abs here s13 <- odds.ratios(xp[c(1,3),])[1,] s23 <- odds.ratios(xp[2:3,])[1,] s12.se <- odds.ratios.se(xp[1:2,])[1,] s13.se <- odds.ratios.se(xp[c(1,3),])[1,] s23.se <- odds.ratios.se(xp[2:3,])[1,] #s <- pmin(s12/s13,s12/s23) #s = s12 - pmin(s13,s23) s = -pmin(s13,s23) a = 0 s = log(exp(-(s13-s13.se*a))+exp(-(s23-s23.se*a))) #s = s12+s12.se - pmin(s13+s13.se,s23+s23.se) f <- colnames(x)[which.min(s)] if(F) { cat(f," ",xp[,f],"\n") ord = order(s)[1:5] for(i in 1:length(ord)) { j = colnames(x)[ord[i]] cat(" ",j," ",xp[,j]," ",s[j],s12[j],s13[j],s23[j],"\n") if(f == "abstraction") { print(subtable(xp,1,j)) } } } f } ############################################################################## # knn # use these to preprocess categorical predictors for knn code.factor <- function(x,y) { if(!is.factor(x)) return(x) if(is.factor(y)) { tab = table(x,y) p = tab[,2]/(tab[,1]+tab[,2]) } else { p = as.vector(as.matrix(prototypes(y,x))) #p = p - mean(p) names(p) = levels(x) } print(round(sort(p),2)) p[x] } code.factors <- function(x,type=c("effect","indicator")) { # returns a new data frame where all categorical predictors are numerically # coded. resp = response.var(x) type = match.arg(type) if(type == "indicator") { cbind(data.frame(model.matrix(terms(x),x)[,-1]),x[resp]) } else { y = x[[resp]] pred = predictor.vars(x) for(k in pred) { if(is.factor(x[[k]])) cat(k,":\n") x[[k]] = code.factor(x[[k]],y) } x } } knn.model <- function(formula,data=NULL,k=1) { # works for Splus too library(class) if(is.null(data)) data <- formula else data <- model.frame(formula,data) pred <- predictor.vars(data) if(!all(sapply(data[pred],is.numeric))) stop("all predictors must be numeric") object <- list(terms=terms(data),data=data,k=k) class(object) <- "knn" object$scale <- sd(data[pred]) object$scale[object$scale == 0] = 1 resp <- response.var(data) p <- table(data[[resp]]) object$tie.breaker <- levels(data[[resp]])[which.max(p)] object } print.knn <- function(object) { cat("nearest neighbor classifier, k =",format(object$k),"\n") pred <- predictor.vars(object) cat(nrow(object$data), "examples in", length(pred),"dimensions\n") } model.frame.knn <- function(object) { object$data } predict.knn <- function(object,test,k,type=c("class","vector")) { train <- object$data resp <- response.var(object) pred <- predictor.vars(object) s <- object$scale # subtracting mean is irrelevant x <- scale(train[pred],center=F,scale=s) xt <- scale(test[pred],center=F,scale=s) if(missing(k)) k <- object$k type <- match.arg(type) r <- knn(x,xt,train[[resp]],k=k,l=floor(k/2+1),prob=(type=="vector")) r[is.na(r)] <- object$tie.breaker if(type == "vector") { p <- attr(r,"prob") lev <- levels(train[[resp]]) v <- array(0,c(length(r),length(lev)),list(rownames(test),lev)) for(j in 1:length(lev)) { i <- (r == lev[j]) v[i,j] <- p[i] v[i,-j] <- 1-p[i] } r <- v } r } misclass.knn <- function(object,data,rate=F) { if(missing(data)) data <- model.frame(object) resp <- response.var(object) r = sum(predict(object,data) != data[[resp]]) if(rate) r/nrow(data) else r } # should use deviance, not misclass best.k.knn <- function(object,ks=1:50) { # find k with best cross-validation performance # uses leave-one-out only train <- object$data resp <- response.var(object) pred <- predictor.vars(object) s <- object$scale x <- scale(train[pred],center=F,scale=s) max.k = max(table(train[[resp]])) ks = ks[ks <= max.k] r <- c() for(i in 1:length(ks)) { y <- knn.cv(x,train[[resp]],k=ks[i],l=floor(ks[i]/2+1)) y[is.na(y)] <- object$tie.breaker r[i] <- sum(y != train[[resp]]) } plot(ks,r,xlab="k",ylab="misclass",type="o") i <- min(which(r==min(r))) cat("best k is",ks[i],"\n") object$k <- ks[i] object } test.k.knn <- function(object,test,ks=1:50) { # find k with fewest test errors train <- object$data resp <- response.var(object) pred <- predictor.vars(object) s <- object$scale x <- scale(train[pred],center=F,scale=s) xt = scale(test[pred],center=F,scale=s) max.k = max(table(train[[resp]])) ks = ks[ks <= max.k] r = c() for(i in 1:length(ks)) { y = knn(x,xt,train[[resp]],k=ks[i],l=floor(ks[i]/2+1)) y[is.na(y)] <- object$tie.breaker r[i] = sum(y != test[[resp]]) } plot(ks,r,xlab="k",ylab="misclass",type="o") i <- which.min(r) cat("best k on test is",ks[i],"\n") object$k <- ks[i] object names(r) = ks r } # score predictors via leave-one-out # uses sort.data.frame drop1.knn <- function(object) { train <- object$data resp <- response.var(object) pred <- predictor.vars(object) s <- object$scale x <- scale(train[pred],center=F,scale=s) x <- data.frame(x) yt <- train[[resp]] # score original model y <- knn.cv(x,yt,k=object$k,l=floor(object$k/2+1)) y[is.na(y)] <- object$tie.breaker e <- sum(y != yt) res <- data.frame(k=object$k,misclass=e,row.names="") # score each reduced model ks = 1:20 for(i in 1:length(pred)) { e = c() for(j in 1:length(ks)) { k = ks[j] y <- knn.cv(not(x,pred[i]),yt,k=k,l=floor(k/2+1)) y[is.na(y)] <- object$tie.breaker e[j] <- sum(y != yt) } k = ks[which.min(e)] e = min(e) res <- rbind(res, data.frame(k=k,misclass=e,row.names=paste("-",pred[i]))) } sort.data.frame(res) } # remove predictors, using drop1.knn step.knn <- function(object) { resp <- response.var(object) repeat { a <- drop1.knn(object) i <- which(a$misclass == min(a$misclass)) i <- i[rownames(a)[i] != ""] if(length(i) == 0) { print(a) break } i = i[1] pred <- predictor.vars(object) omit <- substring(rownames(a)[i], 3) cat("dropping",omit,"\n") pred <- pred[!(pred %in% omit)] object$data <- not(object$data, omit) object$scale <- not(object$scale, omit) object$k = a$k[i] object$terms <- terms(formula(paste(resp,"~",paste(pred,collapse="+")))) } object } reduce.knn <- function(object) { train <- object$data resp <- response.var(object) pred <- predictor.vars(object) s <- object$scale x <- scale(train[pred],center=F,scale=s) y <- train[[resp]] if(object$k == 1) { if(F) keep <- condense(x,y,trace=F) else keep <- 1:nrow(x) keep <- reduce.nn(x,keep,y) } else { keep <- multiedit(x,y,object$k) } object$data <- object$data[keep,] if(object$k > nrow(object$data)) { object$k <- nrow(object$data)-1 cat("changing k to",object$k,"\n") } object } proto.knn <- function(object,np=1) { # reduce the examples to np cluster centers in each class library(mva) train <- object$data resp <- response.var(object) pred <- predictor.vars(object) s <- object$scale x <- scale(train[pred],center=F,scale=s) y <- train[[resp]] nc = nlevels(y) if(length(np) == 1) { np = rep(np,nc) if(F) { p = table(y) p = p/min(p) np = round(np*p) print(np) } } if(is.null(names(np))) names(np) = levels(y) r = list() for(lev in levels(y)) { x.lev = x[y==lev,] if(np[[lev]] == 1) { r[[lev]] = as.data.frame.row(mean.data.frame(x.lev),"1") } else if(np[[lev]] == nrow(x.lev)) { r[[lev]] = x.lev } else { cl = kmeans(x.lev,np[[lev]]) r[[lev]] = as.data.frame(cl$centers) } rownames(r[[lev]]) = paste(lev,rownames(r[[lev]]),sep=".") } x = scale(rbind(r[[1]],r[[2]]),center=F,scale=1/s) y = rep(levels(y),times=sapply(r,nrow)) object$data = cbind(y,x) names(object$data)[1] = resp object } # Useful functions for making maps # convert census COB file into map object # if centers=T, returns a data frame with columns x and y # note: names may have duplicates (multi-polygon tracts) read.cob <- function(pfile,pafile=NULL,centers=F) { s <- scan(pfile,"",quiet=T) # remove last two ENDs s <- s[-length(s)] s <- s[-length(s)] breaks <- which(s == "END") index <- s[c(1,breaks+1)] if(is.null(pafile)) { nam <- index } else { spa <- scan(pafile,"",quiet=T) nam <- spa[seq(2,length(spa),by=2)] names(nam) <- spa[seq(1,length(spa),by=2)] # order names to match the p file # tracts with bogus index will have name "NA" nam <- nam[index] if(centers) { # make names unique dup <- which(duplicated(nam)) for(i in dup) { if(!is.na(nam[i])) { j <- which(nam == nam[i]) nam[j] <- paste(nam[j],as.character(1:length(j)),sep=":") } } } } if(centers) { # keep only first two coordinates of each block x <- as.numeric(s[c(2,breaks+2)]) y <- as.numeric(s[c(3,breaks+3)]) i <- !is.na(nam) data.frame(x=x[i],y=y[i],row.names=nam[i]) } else { # change END to NA,NA s[c(breaks,breaks+1)] <- "NA" # remove first two coordinates of each block i <- rep(T,length(s)) i[1:3] <- F i[c(breaks+2, breaks+3)] <- F s <- as.numeric(s[i]) list(x=s[seq(1,length(s),by=2)],y=s[seq(2,length(s),by=2)],names=nam) } } # convert a FIPS code into a comma-separated region name fips.split = function(x) { sapply(x, function(s) paste(substring(s,1,5),substring(s,6),sep=",")) } # choropleth map map.vector <- function(m,x,main="",nlevels=5,key=F, col=color.palette(nlevels),color.palette=YlGnBu.colors, bg=gray(0.5),mar,...) { if(is.data.frame(x)) { df <- x; x <- df[[1]]; names(x) <- rownames(df) if(missing(main)) main <- colnames(df)[1] } if(length(setdiff(names(x),m$names)) > 0) { warning("some regions have data but do not appear on the map") } #print(setdiff(m$names,names(x))) j <- map.match(m,names(x),warn=F) x <- as.numeric(x) b <- break.quantile(x,nlevels,pretty=T) a <- cut(x,b,include=T) if(missing(mar)) { mar = c(0,0,0,0.1) mar[3] = if(key) 2 else 1 } opar = par(mar=mar) on.exit(par(opar)) map(m,fill=T,color=col[as.numeric(a)[j]],res=0,bg=bg,mar=mar,...) if(key) { color.key(col,breaks=b) title(main,line=1) } else title(main) } mosaicplot <- function(x, ...) UseMethod("mosaicplot") ### Changes by MM: ## - NULL instead of NA for default arguments, etc [R / S convention] ## - plotting at end; cosmetic ## - mosaic.cell(): ### Changes by KH: ## Shading of boxes to visualize deviations from independence by ## displaying sign and magnitude of the standardized residuals. ### Changes by minka: ## Correct label placement and margins. # Changes for Splus: # mosaic.cell must be external, needs shade,rev,rot arguments. # main cannot be NULL # adj cannot be a vector # draw border of polygon separately, to get line style. mosaicplot.default <- function(X, main = "", xlab = NULL, ylab = NULL, sort = NULL, space = NULL, dir = NULL, color = FALSE, shade = FALSE, margin = NULL, type = c("pearson", "deviance", "FT"), rev.y = F, cex=1, rotate = F, ...) { # allocate space for the first column of X, then recurse. # relies on (1,1000) user coordinates. # X is a matrix containing indices and data. # space[1] is the percentage spacing. # maxdim[1] is the cardinality. # label[[1]] are the values. # (x1,y1)(x2,y2) are the corners of the plotting region. # label.x is the position of row labels # label.y is the position of column labels mosaic.cell <- function(X, x1, y1, x2, y2, space, dir, color, label.x, label.y, maxdim, currlev, label, shade) { # p is the column containing counts p <- ncol(X) - 2 if (dir[1] == "v") { # split here on the X-axis. xdim <- maxdim[1] # XP are the column widths XP <- rep(0, xdim) for (i in 1:xdim) { XP[i] <- sum(X[X[,1]==i,p]) / sum(X[,p]) } white <- space[1] * (x2 - x1) / max(1, xdim-1) # (x.l,x.r)[i] is the drawing region for column i x.l <- x1 x.r <- x1 + (1 - space[1]) * XP[1] * (x2 - x1) if (xdim > 1) { for (i in 2:xdim) { x.l <- c(x.l, x.r[i-1] + white) x.r <- c(x.r, x.r[i-1] + white + (1 - space[1]) * XP[i] * (x2 - x1)) } } if (label.y > 0) { this.lab <- if (is.null(label[[1]][1])) { paste(rep(as.character(currlev), length(currlev)), as.character(1:xdim), sep=".") } else label[[1]] # minka text(x= x.l + (x.r - x.l) / 2, y= label.y, srt=0, adj=c(.5,1), cex=cex, this.lab) h <- max(strheight(this.lab,cex=cex)) label.y <- label.y - h } if (p > 2) { # recursive call. for (i in 1:xdim) { if (XP[i] > 0) { mosaic.cell(as.matrix(X[X[,1]==i, -1]), x.l[i], y1, x.r[i], y2, space[-1],dir[-1], color, (i==1)*label.x, label.y, maxdim[-1], currlev+1, label[-1], shade) } else { segments(rep(x.l[i],3), y1+(y2-y1)*c(0,2,4)/5, rep(x.l[i],3), y1+(y2-y1)*c(1,3,5)/5) } } } else { # ncol(X) <= 1 : final split polygon and segments. for (i in 1:xdim) { if (XP[i] > 0) { polygon(c(x.l[i], x.r[i], x.r[i], x.l[i]), c(y1, y1, y2, y2), lty = if(shade) X[i, p+1] else 1, col = if(shade) { color[X[i, p+2]] } else color[i]) if(F) { segments(c(rep(x.l[i],3),x.r[i]), c(y1,y1,y2,y2), c(x.r[i],x.l[i],x.r[i],x.r[i]), c(y1,y2,y2,y1)) } } else { segments(rep(x.l[i],3), y1+(y2-y1)*c(0,2,4)/5, rep(x.l[i],3), y1+(y2-y1)*c(1,3,5)/5) } } } } else { ## dir[1] - "horizontal" : split here on the Y-axis. ydim <- maxdim[1] # YP are the row heights YP <- rep(0, ydim) for (j in 1:ydim) { YP[j] <- sum(X[X[,1]==j,p]) / sum(X[,p]) } if(rev.y) YP <- rev(YP) white <- space[1] * (y2 - y1) / max(1, ydim - 1) # (y.b,y.t)[i] is the drawing region for row i y.b <- y2 - (1 - space[1]) * YP[1] * (y2 - y1) y.t <- y2 if (ydim > 1) { for (j in 2:ydim) { y.b <- c(y.b, y.b[j-1] - white - (1 - space[1]) * YP[j] * (y2 - y1)) y.t <- c(y.t, y.b[j-1] - white) } } if (label.x > 0) { this.lab <- if (is.null(label[[1]][1])) { paste(rep(as.character(currlev), length(currlev)), as.character(1:ydim), sep=".") } else label[[1]] # minka if(rev.y) this.lab <- rev(this.lab) if(rotate) { text(x= label.x, y= y.b + (y.t - y.b) / 2, srt=90, adj=c(.5,1), cex=cex, this.lab) h <- max(strheight(this.lab,cex=cex,units="inch")) h = h/par("pin")[1]*diff(par("usr")[1:2]) label.x <- label.x + h } else { w <- max(strwidth(this.lab,cex=cex)) label.x <- label.x + w text(x= label.x, y= y.b + (y.t - y.b) / 2, srt=0, adj=1, cex=cex, this.lab) } #if(label.x > left.limit) label.x <- 0 } if (p > 2) { # recursive call. for (j in 1:ydim) { if (YP[j] > 0) { mosaic.cell(as.matrix(X[X[,1]==j,2:(p+2)]), x1, y.b[j], x2, y.t[j], space[-1], dir[-1], color, label.x, (j==1)*label.y, maxdim[-1], currlev+1, label[-1], shade) } else { segments(x1+(x2-x1)*c(0,2,4)/5, rep(y.b[j],3), x1+(x2-x1)*c(1,3,5)/5, rep(y.b[j],3)) } } } else { # ncol(X) <= 1: final split polygon and segments. for (j in 1:ydim) { if (YP[j] > 0) { polygon(c(x1,x2,x2,x1), c(y.b[j],y.b[j],y.t[j],y.t[j]), lty = if(shade) X[j, p+1] else 1, col = if(shade) { color[X[j, p+2]] } else color[j]) if(F) { segments(c(x1,x1,x1,x2), c(y.b[j],y.b[j],y.t[j],y.t[j]), c(x2,x1,x2,x2), c(y.b[j],y.t[j],y.t[j],y.b[j])) } } else { segments(x1+(x2-x1)*c(0,2,4)/5, rep(y.b[j],3), x1+(x2-x1)*c(1,3,5)/5, rep(y.b[j],3)) } } } } } ##-- Begin main function if(is.null(dim(X))) X <- as.array(X) else if(is.data.frame(X)) X <- data.matrix(X) dimd <- length(dX <- dim(X)) if(dimd == 0 || any(dX == 0)) stop("`X' must not have 0 dimensionality") ##-- Set up `Ind' matrix : to contain indices and data Ind <- 1:dX[1] if(dimd > 1) { Ind <- rep(Ind, prod(dX[2:dimd])) for (i in 2:dimd) { Ind <- cbind(Ind, c(matrix(1:dX[i], byrow=TRUE, nr = prod(dX[1:(i-1)]), nc = prod(dX[i:dimd])))) } } Ind <- cbind(Ind, c(X)) ## Ok, now the columns of `Ind' are the cell indices (which could ## also have been created by `expand.grid()' and the corresponding ## cell counts. We add two more columns for dealing with *EXTENDED* ## mosaic plots which are produced unless `shade' is FALSE, which ## currently is the default. These columns have NAs for the simple ## case. Otherwise, they specify the line type (1 for positive and ## 2 for negative residuals) and color (by giving the index in the ## color vector which ranges from the ``most negative'' to the ## ``most positive'' residuals. if(is.logical(shade) && !shade) { Ind <- cbind(Ind, NA, NA) } else { if(is.logical(shade)) shades <- c(2, 4) else { if(any(shade <= 0) || length(shade) > 5) stop("invalid shade specification") shades <- sort(shade) shade <- T } breaks <- c(-Inf, - rev(shades), 0, shades, Inf) if(T) { color <- c(hsv(0, # red s = seq(1, to = 0, length = length(shades) + 1)), hsv(4/6, # blue s = seq(0, to = 1, length = length(shades) + 1))) } else { # for S color <- c(6, 6, 0, # red 0, 5, 5) # blue } if(is.null(margin)) margin <- as.list(1:dimd) ## Fit the loglinear model. E <- loglin(X, margin, fit = TRUE, print = FALSE)$fit ## Compute the residuals. type <- match.arg(type) residuals <- switch(type, pearson = (X - E) / sqrt(E), deviance = { tmp <- 2 * (X * log(ifelse(X==0, 1, X/E)) - (X-E)) tmp <- sqrt(pmax(tmp, 0)) ifelse(X > E, tmp, -tmp) }, FT = sqrt(X) + sqrt(X + 1) - sqrt(4 * E + 1)) ## And add the information to the data matrix. Ind <- cbind(Ind, c(1 + (residuals < 0)), as.numeric(cut(residuals, breaks))) } ## The next four may all be NULL: label <- dimnames(X) nam.dn <- names(label) if(is.null(space)) space <- rep(10, length=dimd) if(is.null(dir)) dir <- rep(c("v","h"), length=dimd) if(is.null(xlab)) xlab <- paste(nam.dn[dir=="v"],collapse=" / ") if(is.null(ylab)) ylab <- paste(nam.dn[dir=="h"],collapse=" / ") if (!is.null(sort)) { if(length(sort) != dimd) stop("length(sort) doesn't conform to dim(X)") ## Sort columns. Ind[,1:dimd] <- Ind[,sort] space <- space[sort] dir <- dir[sort] label <- label[sort] } ncolors <- length(tabulate(Ind[,dimd])) if(!shade && ((is.null(color) || length(color) != ncolors))) { color <- if (is.null(color) || !color[1]) rep(0, ncolors) else 2:(ncolors+1) } ##-- Plotting # leave space for labels, but not tick marks opar <- par(mar = c(1.1,1,0,0.05), mgp = c(0, 0, 0)) #opar <- par(mar = c(0.1,0,0,0.05), mgp = c(1,1,0)) on.exit(par(opar)) plot.new() if(!shade) { # make the limits (1,1000) exactly par(usr=c(1,1000,1,1000)) } else { ## This code is extremely ugly, and certainly can be improved. ## In the case of extended displays, we also need to provide a ## legend for the shading and outline patterns. The code works ## o.k. with integer breaks in `shade'; rounding to two 2 digits ## will not be good enough if `shade' has length 5. pin <- par("pin") rtxt <- "Standardized\nResiduals:" ## Compute cex so that the rotated legend text does not take up ## more than 1/12 of the of the plot region horizontally and not ## more than 1/4 vertically. rtxtCex <- min(1, pin[1] / (strheight(rtxt, units = "i") * 12), pin[2] / (strwidth (rtxt, units = "i") / 4)) rtxtWidth <- 0.1 # unconditionally ... ## We put the legend to the right of the third axis. # this is same as par(usr=) par(usr=c(1, 1000 * (1.1 + rtxtWidth), 1, 1000)) rtxtHeight <- strwidth(rtxt, units = "i", cex = rtxtCex) / pin[2] text(1000 * (1.05 + 0.5 * rtxtWidth), 0, labels = rtxt, adj = c(0, 0.25), srt = 90, cex = rtxtCex) ## `len' is the number of positive or negative intervals of ## residuals (so overall, there are `2 * len') len <- length(shade) + 1 ## `bh' is the height of each box in the legend (including the ## separating whitespace bh <- 0.95 * (0.95 - rtxtHeight) / (2 * len) x.l <- 1000 * 1.05 x.r <- 1000 * (1.05 + 0.7 * rtxtWidth) y.t <- 1000 * rev(seq(from = 0.95, by = - bh, length = 2 * len)) y.b <- y.t - 1000 * 0.8 * bh ltype <- c(rep(2, len), rep(1, len)) for(i in 1 : (2 * len)) { polygon(c(x.l, x.r, x.r, x.l), c(y.b[i], y.b[i], y.t[i], y.t[i]), col = color[i], lty = ltype[i]) } brks <- round(breaks, 2) y.m <- y.b + 1000 * 0.4 * bh text(1000 * (1.05 + rtxtWidth), y.m, c(paste("<", brks[2], sep = ""), paste(brks[2 : (2 * len - 1)], brks[3 : (2 * len)], sep = ":"), paste(">", brks[2 * len], sep = "")), srt = 90, cex = cex) } if (!is.null(main) || !is.null(xlab) || !is.null(ylab)) title(main, xlab=xlab, ylab=ylab, ...) # compute margins top.margin = right.margin = 0 for(k in which(dir == "v")) { top.margin = top.margin + max(strheight(label[[k]],cex=cex)) # right.margin must accommodate half of the last label # (in the worst case) w = strwidth(label[[k]],cex=cex) right.margin = max(right.margin,w[length(w)]/2) } # 0.05 in between text and plot top.margin = top.margin + 0.05/par("pin")[2]*diff(par("usr")[3:4]) left.margin = 0 for(k in which(dir == "h")) { if(rotate) { h = max(strheight(label[[k]],cex=cex,units="inch")) h = h/par("pin")[1]*diff(par("usr")[1:2]) left.margin = left.margin + max(h) } else { w = strwidth(label[[k]],cex=cex) left.margin = left.margin + max(w) } } left.margin = left.margin + 0.05/par("pin")[1]*diff(par("usr")[1:2]) # x2 is not exactly 1000 to accommodate labels which run off the end mosaic.cell(Ind, x1=left.margin, y1=1, x2=1000-right.margin, y2=1000-top.margin, space/100, dir, color, par("usr")[1], par("usr")[4], maxdim= apply(as.matrix(Ind[,1:dimd]), 2, max), currlev= 1, label, shade) } # extra routines for multivariate analysis # Tom Minka 12/3/01 eigen2.top <- function(A,B,n=1,x,tol=1e-10) { d <- nrow(A) if(missing(x)) x <- array(rnorm(d),c(d,1)) U <- chol(B) for(iter in 1:100) { x0 <- x Ax0 <- A %*% x0 x <- backsolve(U,Ax0,transpose=T) x <- backsolve(U,x) x <- x/sqrt(sum(x*x)) if(max(abs(x - x0)/max(abs(x))) < tol) break } e <- c() for(i in 1:ncol(x)) { xAx <- t(x[,i]) %*% A %*% x[,i] xBx <- t(x[,i]) %*% B %*% x[,i] e[i] <- xAx/xBx } list(values=e,vectors=x) } solve.chol <- function(a,b) { ch <- chol(a) backsolve(ch,backsolve(ch,b,transpose=T)) } # returns x which solves a*x = b*x*eigenvalues # k is desired number of eigenvectors (optional) eigen2 <- function(a,b,orth=NULL,k) { # solve(b,a) is same as inv(b)*a #iba <- solve.chol(b,a) iba <- solve(b,a) if(is.null(orth)) e <- La.eigen(iba,symmetric=F) else { if(is.vector(orth)) orth <- array(orth,c(length(orth),1)) n <- nrow(orth) orth.k <- ncol(orth) orth.m <- eye(n) - (orth %*% t(orth)) x <- orth.m %*% iba %*% orth.m e <- La.eigen(x,symmetric=F) # remove zero eigenvalues from result e$vectors <- e$vectors[,1:(ncol(x)-orth.k),drop=F] e$values <- e$values[1:(ncol(x)-orth.k)] } if(!missing(k)) { e$vectors <- e$vectors[,1:k,drop=F] e$values <- e$values[1:k,drop=F] } # remove trivial imaginary parts e$vectors <- Re(e$vectors) e$values <- Re(e$values) e } ml.cov <- function(x) { n <- nrow(x) v <- cov(x) if(n == 1) v <- array(0,dim(v)) # avoid singularity a <- max(eigen(v,only=T)$values) (v*(n-1) + eye(ncol(x))*a*1e-5)/n } ############################################################################# project <- function(x,w) { pred <- rownames(w) xw <- data.frame(data.matrix(x[pred]) %*% w) other <- setdiff(colnames(x),pred) if(length(other) > 0) cbind(xw, x[other]) else xw } standardize.projection <- function(w) { # break symmetries by making early dimensions positive s = (nrow(w):1) %*% w for(j in 1:ncol(w)) { if(s[j] < 0) w[,j] <- -w[,j] } #print(t(w)%*%w) w } pca <- function(x,k=1) { x <- as.data.frame(x) x <- data.matrix(x[sapply(x,is.numeric)]) s <- svd(x) cat("R-squared =",format(sum(s$d[1:k]^2)/sum(s$d^2)),"\n") w <- s$v[,1:k] if(k == 1) dim(w) <- c(length(w),1) rownames(w) <- colnames(x) colnames(w) <- paste("h",1:ncol(w),sep="") standardize.projection(w) } pca2 <- function(x,k=1) { # PCA vis Roweis's EM algorithm # takes O(dnk) time x <- as.data.frame(x) x <- data.matrix(x[sapply(x,is.numeric)]) x <- t(x) d <- nrow(x) n <- ncol(x) w <- array(rnorm(d*k),c(d,k)) # EM for(iter in 1:100) { old.w <- w if(d >= n) { # h is k by n # flops is kdk + kdn + kkn + dnk + knk + kkn = O(dnk) h <- solve.chol(t(w) %*% w, t(w) %*% x) w <- x %*% t(solve.chol(h %*% t(h), h)) } else { # flops is kdk + kdn + kkd + knd + knk + kkd = O(dnk) h <- solve.chol(t(w) %*% w, t(w)) %*% x w <- t(solve.chol(h %*% t(h), h %*% t(x))) } if(max(abs(w - old.w)) < 1e-5) break } if(iter == 100) warning("not enough iterations") # postprocessing w <- qr.Q(qr(w)) wx <- t(w) %*% x s <- svd(wx) cat("R-squared =",format(sum(s$d^2)/sum(x*x)),"\n") w <- w %*% s$u rownames(w) <- rownames(x) colnames(w) <- paste("h",1:ncol(w),sep="") standardize.projection(w) } projection <- function(x,y,k=1,given=NULL,type=c("mv","m","v","r","lm"),...) { if(ncol(x) > 500) stop("data has too many dimensions") if(missing(y)) { resp <- response.var(x) y <- x[[resp]] pred <- predictor.vars(x) x <- x[pred] } if(missing(type)) { if(is.factor(y)) type <- "mv" else type <- "r" cat("Projection type",type,"\n") } else type <- match.arg(type) if(type == "lm") { w <- projection.lm(x,y,k,given,...) } else if(type == "r") { w <- projection.reg(x,y,k,given,...) } else { xs <- split(x,y) vs <- lapply(xs,ml.cov) ns <- lapply(xs,nrow) n <- nrow(x) va <- ml.cov(x) fun <- switch(type, m=projection.m.stats, v=projection.cov.stats, mv=projection.mv.stats) w <- fun(va,vs,ns,k,given,...) } rownames(w) <- names(x) colnames(w) <- paste("h",1:ncol(w),sep="") standardize.projection(w) } # uses linear regression to get projection projection.lm <- function(x,y,k=1,given=NULL) { vxy <- cov(x,y) vxx <- cov(x) e <- eigen2(vxy %*% t(vxy),vxx,given) w <- e$vectors[,1:k] if(k == 1) dim(w) <- c(length(w),1) if(is.null(given)) w else cbind(given,w) } # returns a projection which separates x values leading to different y's # treats it as a classification problem with n classes projection.stats.reg <- function(x,y,span=NULL,...) { # estimate covariance matrix about each point if(is.null(span)) { span <- min(c(5*ncol(x),length(y)/2)) #cat("projection.reg: using nearest ",span," responses\n") } else { span <- span*length(y) } vs <- list() for(i in 1:length(y)) { d <- abs(y - y[i]) j <- order(d) # include all points whose distance is within that of last neighbor j <- which(d <= d[j[span]] + eps) #cat(i,"has",length(j),"neighbors\n") vs[[i]] <- ml.cov(x[j,]) } ns <- as.list(rep(1,length(y))) va <- ml.cov(x) list(va,vs,ns) } projection.reg <- function(x,y,k=1,given=NULL,...) { s <- projection.stats.reg(x,y,...) va <- s[[1]] vs <- s[[2]] ns <- s[[3]] projection.mv.stats(va,vs,ns,k,given) } # returns a projection which separates the class means projection.m.stats <- function(va,vs,ns,k=1,given=NULL) { n <- accumulate(ns) vw <- accumulate(zip(vs,ns))/n e <- eigen2(va,vw,given) i <- rev(order(e$values))[1:k] w <- e$vectors[,i] if(k == 1) dim(w) <- c(length(w),1) if(is.null(given)) w else cbind(given,w) } # returns a projection which separates the class covariances projection.cov.stats <- function(va,vs,ns,k=1,given=NULL) { if(length(vs) > 2) { n <- accumulate(ns) va <- accumulate(zip(vs,ns))/n return(projection.mv.stats(va,vs,ns,k,given)) } v1 <- vs[[1]] v2 <- vs[[2]] n <- accumulate(ns) a <- ns[[1]]/n if(T) { tim <- system.time(e <- eigen2(v1,v2,given))[3] #cat("time to compute",nrow(v1),"eigenvectors:",tim,"\n") } else { tim1 <- system.time(e1 <- eigen2.top(v1,v2,k))[3] tim2 <- system.time(e2 <- eigen2.top(v2,v1,k))[3] #cat("time to compute",2*k,"eigenvectors:",tim1+tim2,"\n") e <- list(values=c(e1$values,1/e2$values),vectors=cbind(e1$vectors,e2$vectors)) } lambda <- abs(e$values) J <- a*log(lambda) - log(a*lambda + 1-a) # keep eigenvectors with smallest J i <- order(J)[1:k] w <- e$vectors[,i] if(k == 1) dim(w) <- c(length(w),1) if(is.null(given)) w else cbind(given,w) } score.mv <- function(w,va,vs,ns) { n <- accumulate(ns) J <- logdet(t(w) %*% va %*% w) for(i in 1:length(vs)) { J <- J - ns[[i]]/n*logdet(t(w) %*% vs[[i]] %*% w) } J } score.m <- function(w,va,vs,ns) { n <- accumulate(ns) J <- logdet(t(w) %*% va %*% w) avg.v <- array(0,dim(va)) for(i in 1:length(vs)) { avg.v <- avg.v + (ns[[i]]/n)*vs[[i]] } J - logdet(t(w) %*% avg.v %*% w) } vec <- function(x) { if(is.list(x)) array(unlist(x),c(prod(dim(x[[1]])),length(x))) else array(as.vector(x),c(length(x),1)) } # returns a projection which separates the class means and covariances projection.mv.stats <- function(va,vs,ns,k=1,given=NULL) { n <- accumulate(ns) if(k > 1) { # initialize with greedy solution # find projections one at a time w <- projection.mv.stats(va,vs,ns,k=1,given) for(i in 1:(k-1)) { w <- projection.mv.stats(va,vs,ns,k=1,given=w) } return(w) } else { # initialize with best of m,v w1 <- projection.m.stats(va,vs,ns,k=1,given) w1 <- w1[,ncol(w1),drop=F] J1 <- score.mv(w1,va,vs,ns) if(length(vs) == 2) { w2 <- projection.cov.stats(va,vs,ns,k=1,given) w2 <- w2[,ncol(w2),drop=F] J2 <- score.mv(w2,va,vs,ns) } else J2 <- 0 if(J2 > J1) w <- w2 else w <- w1 } ns <- unlist(ns) d <- nrow(va) if(k == 1) vec.vs <- vec(vs) # EM optimization for(iter in 1:100) { oldw <- w vb <- array(0,dim(va)) if(k > 1) { # slow for(i in 1:length(vs)) { wv <- det(t(w) %*% vs[[i]] %*% w)^(1/k) vb <- vb + (ns[[i]]/n/wv)*vs[[i]] } } else { # fast for k == 1 wv <- t(w %x% w) %*% vec.vs vb <- vec.vs %*% vec(ns/n/wv) dim(vb) <- c(d,d) } if(F) { # check that objective always increases sw <- if(is.null(given)) w else cbind(given,w) print(score.mv(sw,va,vs,ns)) } w <- eigen2(va,vb,given,k=k)$vector # force vectors to be orthogonal if(k > 1) w <- qr.Q(qr(w)) if(max(abs(w - oldw)) < 1e-5) break } #cat("projection.mv: ",iter," EM iterations\n") if(is.null(given)) w else cbind(given,w) } color.plot.project.glm <- function(fit,data,type="mv",col=3,coef=F,lwd=2,...) { x <- model.frame(fit) fw <- coef(fit)[-1] a <- coef(fit)[1]/norm(fw) fw <- fw/norm(fw) w <- projection(x,k=1,given=fw,type=type) w[,1] = fw px = project(if(missing(data)) x else data, w) color.plot(px,...) abline(v=-a,col=col,lwd=lwd) invisible(w) } plot.axes <- function(w,col=2,origin,keep=NULL,cex=par("cex")) { if(is.null(keep)) keep <- 0.2 usr <- par("usr") # is the origin in the plot? if(missing(origin)) { origin <- (usr[1] < 0) && (usr[2] > 0) && (usr[3] < 0) && (usr[4] > 0) } if(origin) m <- c(0,0) else m <- c((usr[2]+usr[1])/2, (usr[4]+usr[3])/2) width <- strwidth(rownames(w),cex=cex)/2 height <- strheight(rownames(w),cex=cex)/2 # "a" is scale factor to place text at arrow tips a <- pmin(width/abs(w[,1]), height/abs(w[,2])) # txt.w is the offset of the text from the arrow tip txt.w <- w * rep.col(a,2) xlim <- usr[1:2]-m[1] # make room on left xlim[1] <- xlim[1] + diff(xlim)/par("pin")[1]*0.02 xscale <- c() # find biggest xscale so that # xscale*w[,1] + txt.w[,1] - width > xlim[1] (w < 0) # xscale*w[,1] + txt.w[,1] + width < xlim[2] (w > 0) # ignoring constraints which cannot be satisfied i <- (w[,1] < 0) xscale[1] <- min((xlim[1] + width[i] - txt.w[i,1])/w[i,1]) i <- (w[,1] > 0) xscale[2] <- min((xlim[2] - width[i] - txt.w[i,1])/w[i,1]) # if one of the sets is empty, the scale will be NA xscale <- min(xscale,na.rm=T) ylim <- usr[3:4]-m[2] # make room for color key ylim[2] <- ylim[2] - diff(ylim)/par("pin")[2]*0.06 yscale <- c() # find yscale so that # yscale*w[,2] + txt.w[,2] - height > ylim[1] (w < 0) i <- (w[,2] < 0) yscale[1] <- min((ylim[1] + height[i] - txt.w[i,2])/w[i,2]) i <- (w[,2] > 0) yscale[2] <- min((ylim[2] - height[i] - txt.w[i,2])/w[i,2]) yscale <- min(yscale,na.rm=T) # scale equally xscale <- min(c(xscale,yscale)) if(xscale < 0) stop("cannot fit axis labels on the plot. try origin=F") yscale <- xscale w <- scale(w,center=F,scale=1/c(xscale,yscale)) if(T) { # only plot arrows of significant length i <- (abs(w[,1])/(xlim[2]-xlim[1])*par("fin")[1] > keep) | (abs(w[,2])/(ylim[2]-ylim[1])*par("fin")[2] > keep) if(any(!i)) cat("omitting arrow for",rownames(w)[!i],"\n") } if(!any(i)) { warning("all arrows omitted"); return() } len <- min(par("pin"))/20 arrows(m[1], m[2], m[1]+w[i,1], m[2]+w[i,2], col=col, len=len) w <- scale(w,center=-m,scale=F) w <- w + txt.w # note: rownames(w[i,]) does not work if i picks only one text(w[i,1],w[i,2],labels=rownames(w)[i],col=col,cex=cex) } identify.data.frame <- function(x,...) { resp = response.var(x) pred = predictor.vars(x)[1] identify(x[,pred],x[,resp],labels=rownames(x),...) } identify.formula <- function(fmla,data=parent.frame(),...) { x = model.frame(fmla,data) identify.data.frame(x,...) } scale.data.frame <- function(x,...) { i <- sapply(x,is.numeric) x[i] <- data.frame(scale(data.matrix(x[i]),...)) x } ############################################################################# factor.dichotomy <- function(f,lev) { labels <- c(lev,paste("NOT",lev)) fd <- rep(labels[2],length(f)) fd[f == lev] <- labels[1] fd <- factor(fd,levels=labels) names(fd) <- names(f) fd } factor.isolate <- function(f,i) { nam <- names(f) f <- as.character(f) names(f) <- nam f[i] <- i factor(f) } separate.level <- function(x,lev,type="mv",layout,keep=NULL,identify.flag=F,axes=F,...) { # one vs rest # only makes sense if length(lev) > 2 resp <- response.var(x) y <- x[[resp]] if(missing(lev)) lev <- levels(y) if(missing(layout)) layout <- auto.layout(length(lev)) # must do this or mfrow will clobber cex (bug in par?) wpar <- par(cex=par("cex")) opar <- par(mfrow=layout) par(wpar) on.exit(par(opar,wpar)) for(k in 1:length(lev)) { x[[resp]] <- factor.dichotomy(y,lev[k]) w <- projection(x,k=2,type=type) px <- project(x,w) color.plot(px,axes=axes,...) plot.axes(w,origin=F,keep=keep,col=3) if(identify.flag) { h <- identify(px,plot=F) if(length(h) > 0) text(px[h,1],px[h,2],rownames(px)[h],cex=0.9) } } } separate.pairwise <- function(x,level,type="mv",layout,identify.flag=F,axes=F,...) { # level vs each resp <- response.var(x) y <- x[[resp]] lev <- setdiff(levels(y),level) if(missing(layout)) layout <- auto.layout(length(lev)) opar <- par(mfrow=layout) on.exit(par(opar)) for(k in 1:length(lev)) { i <- (y == lev[k]) | (y == level) xk <- x[i,] xk[[resp]] <- factor(xk[[resp]],levels=c(level,lev[k])) w <- projection(xk,k=2,type=type) px <- project(xk,w) color.plot(px,axes=axes,...) plot.axes(w,origin=F,col=3) if(identify.flag) { h <- identify(px,plot=F) if(length(h) > 0) text(px[h,1],px[h,2],rownames(px)[h],cex=0.9) } } } logdet <- function(x) { 2*sum(log(diag(chol(x)))) } separation <- function(x,type="mv") { x <- as.data.frame(x) y <- x[[response.var(x)]] xp <- x[predictor.vars(x)] xs <- split(xp,y) vs <- lapply(xs,ml.cov) ns <- lapply(xs,nrow) n <- nrow(x) if(T) { va <- ml.cov(xp) J <- logdet(va) for(i in 1:length(ns)) { J <- J - ns[[i]]/n*logdet(vs[[i]]) } } else { # loop pairs r <- nchoosek(length(xs),2) J <- 0 for(i in 1:length(r)) { ri <- r[[i]] r1 <- ri[1] r2 <- ri[2] n1 <- ns[[r1]] n2 <- ns[[r2]] v1 <- vs[[r1]] v2 <- vs[[r2]] if(type == "m") { v1 <- (n1*v1 + n2*v2)/(n1+n2) v2 <- v1 } v12 <- ml.cov(rbind(xs[[r1]],xs[[r2]])) Ji <- n1*logdet(v1) + n2*logdet(v2) - (n1+n2)*logdet(v12) J <- J - exp(Ji/2/(n1+n2)) } } J } nchoosek <- function(n,k) { if(k == 1) return(as.list(1:n)) r <- list() for(i in k:n) { r <- c(r,lapply(nchoosek(i-1,k-1), function(x) c(i,x))) } r } simple.projection <- function(d,i) { if(length(d) == 1) { d.names <- as.character(1:d) } else { d.names <- d d <- length(d) } w <- array(0,c(length(d.names),length(i)),list(d.names,i)) for(j in 1:ncol(w)) w[i[j],j] <- 1 w } score.simple.stats <- function(va,vs,ns,k=1,...) { pred <- colnames(va) f <- nchoosek(length(pred),k) r <- empty.data.frame(c(paste("Var",1:k,sep=""),"Sep")) for(i in 1:length(f)) { p <- pred[f[[i]]] w <- simple.projection(pred,p) ri <- c(as.list(p), list(score.mv(w,va,vs,ns))) names(ri) <- names(r) r <- append.data.frame(r,data.frame(ri)) } sort.data.frame(r,"Sep") } # returns a frame of scores for each feature combination score.features <- function(x,y,type=c("mv","m","v","r","lm"),...) { x <- as.data.frame(x) if(missing(y)) { resp <- response.var(x) y <- x[[resp]] pred <- predictor.vars(x) x <- x[pred] } if(missing(type)) { if(is.factor(y)) type <- "mv" else type <- "r" } else type <- match.arg(type) if(type == "r") { s <- projection.stats.reg(x,y,...) va <- s[[1]] vs <- s[[2]] ns <- s[[3]] } else { xs <- split(x,y) vs <- lapply(xs,ml.cov) ns <- lapply(xs,nrow) va <- ml.cov(x) } score.simple.stats(va,vs,ns,...) } top.features <- function(x,r,m=4,layout,type="mv",...) { #r <- score.features(x,k) k <- ncol(r)-1 pred <- predictor.vars(x) if(missing(layout)) layout <- auto.layout(m) opar <- par(mfrow=layout) on.exit(par(opar)) r <- r[nrow(r)+1-(1:m),] for(i in 1:m) { names <- as.vector(as.matrix(r[i,1:k])) w <- simple.projection(pred,names) xw <- project(x,w) if(k > 2) { w <- projection(xw,k=2,type=type) xw <- project(xw,w) } color.plot(xw,...) if(k > 2) plot.axes(w) } } ############################################################################# outliers <- function(x,p=0.05,names=T) { x <- x[sapply(x,is.numeric)] if(ncol(x) >= nrow(x)) { warning("more variables than cases") v <- diag(sd(x)^2) } else v <- cov(x) D2 <- mahalanobis(x,mean(x),v) pv <- 1-pchisq(D2,df=ncol(x)) r <- data.frame(D2=D2,"p.value"=pv,row.names=rownames(x)) r <- sort.data.frame(r) i <- (r$p.value < p) if(T) { #qqplot(qchisq(ppoints(length(D2)), df=ncol(x)), D2) if(any(i)) print(r[i,]) } nam <- rownames(r)[i] if(names) nam else match(nam,rownames(x)) } separate <- function(x,i,axes=F,...) { f <- factor.dichotomy(rownames(x),i) sx <- scale(as.data.frame(x)) w <- projection(sx,f,k=2,type="m") color.plot(project(sx,w),f,axes=axes,...) plot.axes(w,col=3) sort(w[,1]) } residuals.pca <- function(x,w) { pred <- rownames(w) y <- x[pred] m <- apply(y,2,mean) y <- sweep(y,2,m) y <- data.frame((data.matrix(y) %*% w) %*% t(w)) y <- sweep(y,2,-m) x[pred] <- x[pred] - y x } ############################################################################# prototypes <- function(x,y=NULL,fun=mean) { # basically a cleaned-up version of "aggregate" if(!is.null(y) && mode(y) == "function") { fun <- y y <- NULL } if(is.null(y)) { resp <- response.var(x) y <- x[[resp]] if(!is.factor(y)) stop("no class variable") x <- not(x,resp) } else { resp <- "Group.1" } if(identical(fun,sum)) { # fast implementation of special case as.data.frame(indicators.factor(y) %*% as.matrix(x)) } else if(identical(fun,mean)) { # fast implementation of special case f <- indicators.factor(y) f <- scale.rows(f, 1/rowSums(f)) as.data.frame(f %*% as.matrix(x)) } else if(T) { # faster than below xs <- split(as.data.frame(x),y) xp <- sapply(xs,function(xl) apply(xl,2,fun)) as.data.frame(t(xp)) } else { yl <- list(y) names(yl) <- resp x <- aggregate(x,yl,fun) rownames(x) <- levels(y) x[setdiff(names(x),resp)] } } hclust.prototypes <- function(x,...) { resp <- response.var(x) px <- not(prototypes(x),resp) hclust(dist(px)^2,members=table(x[[resp]]),...) } reorder.data.frame <- function(y,dims=c(1,2)) { i <- sapply(y,is.numeric) x <- y[i] s <- svd(scale(x)) if(dims == c(1,2)) x <- x[order(s$u[,1]),order(s$v[,1])] else if(dims == 1) x <- x[order(s$u[,1]),] else if(dims == 2) x <- x[,order(s$v[,1])] if(any(!i)) x <- cbind(x, y[!i]) x } image.data.frame <- function(x,...) { x <- apply.df(x,scale.range) image.table(as.matrix(x),...) } star.plot <- function(x,draw.segments=T,proto=T,...) { resp <- response.var(x) if(is.factor(x[[resp]]) && proto) { cat("showing median of each",resp,"\n") f <- x[resp] x <- x[sapply(x,is.numeric)] x <- aggregate(x,f,median) rownames(x) <- levels(f[[1]]) } x <- x[sapply(x,is.numeric)] x <- reorder.data.frame(x) opar <- par(mar=c(0.05,0,0,0.1)) on.exit(par(opar)) stars(x, draw.segments=draw.segments, ...) } parallel.plot <- function(x,yscale=c("linear","range","none"), xscale=c("equal","linear","none"),proto=T,...) { x <- as.data.frame(x) resp <- response.var(x) if(is.factor(x[[resp]])) { if(proto) { cat("showing median of each",resp,"\n") f <- x[resp] x <- x[sapply(x,is.numeric)] x <- aggregate(x,f,median) rownames(x) <- levels(factor(f[[1]])) q <- 1:nrow(x) } else { # plot cases, but color by class q <- as.numeric(factor(x[[resp]])) } x <- not(x,resp) } else { q <- 1:nrow(x) } y <- x # assume no factors y <- data.matrix(y) x <- 1:ncol(y) yscale <- match.arg(yscale) xscale <- match.arg(xscale) if(yscale=="linear") { # do this to subtract means y <- scale(y) if(xscale != "none") { # Hartigan's linear profile algorithm # want yij*sj = ai + bi*xj # yij = ai*(1/sj) + bi*(xj/sj) # so 1/sj = v[j,1], xj/sj = v[j,2] # scaling by singular values is irrelevant s <- svd(y) x <- s$v[,2]/s$v[,1] s <- 1/s$v[,1] if(xscale == "equal") { x <- rank.stable(x) # now act as if x was fixed and solve for scaling again xscale <- "none" } } if(xscale=="none") { # solve for sj when xj is known # this comes from solving for (ai,bi) analytically and substituting # to get eigenvector problem for sj d <- ncol(y) xz <- x - mean(x) a <- diag(d) - array(1/d,c(d,d)) - outer(xz,xz)/sum(xz*xz) # Hadamard product a <- (a %*% a) * (t(y) %*% y) s <- svd(a) # want the smallest eigenvector of a s <- s$u[,d] } # minimize number of reversals if(sum(s<0) > sum(s>0)) s <- -s y <- y * rep.row(s,nrow(y)) i <- s<0 if(any(i)) { cat("axis reversed for",colnames(y)[i],"\n") colnames(y)[i] <- sapply(colnames(y)[i], function(s) paste("-",s,sep="")) } } else { if(yscale == "range") { # map to [0,1] y <- y - rep.row(apply(y,2,min),nrow(y)) y <- y / rep.row(apply(y,2,max),nrow(y)) } if(xscale != "none") { # want yij = ai + bi*xj # subtract mean in each row to remove ai, # then xj is top eigenvector s <- svd(y - rep.col(apply(y,1,mean),ncol(y))) x <- s$v[,1] if(xscale == "equal") x <- rank.stable(x) } } # remove x flip ambiguity by alphabetic sort if(colnames(y)[which.min(x)] > colnames(y)[which.max(x)]) { x <- -x } i <- order(x) y <- reorder.cols(y,i) x <- x[i] labeled.curves(x,y,xlab="",ylab="",yaxt=if(yscale=="none") "s" else "n",...) } ############################################################################# # abbreviate strings so that max width is w, and strings remain unique abbreviate.width <- function(s,w,cex=par("cex")) { old.max <- w repeat { wid <- strwidth(s,cex=cex) if(max(wid)<=w) break i <- wid > w j <- which.max(wid) # this may cause a collision s[i] <- abbreviate(s[i],nchar(s[j])-1) } s } # modified by Tom Minka # abbreviation, better spacing, better handling of key stars <- function(x, full = TRUE, scale = TRUE, radius = TRUE, labels = dimnames(x)[[1]], locations = NULL, xlim = NULL, ylim = NULL, len = 1, colors = NULL, abbrev=T, key.loc = NA, key.labels = NULL, key.xpd = TRUE, draw.segments = FALSE, axes = FALSE, cex = 0.8, lwd = 0.25, ...) { if (is.data.frame(x)) x <- as.matrix(x) else if (!is.matrix(x)) stop("x must be a matrix or a data frame") if (!is.numeric(x)) stop("data in x must be numeric") n.loc <- nrow(x) n.seg <- ncol(x) deg <- pi / 180 # segments only seg.colors <- if(!is.null(colors)) colors else 1:n.seg if (is.null(locations)) { # make loc matrix md <- ceiling(sqrt(n.loc)) # => md^2 >= n.loc loc.x <- 2.1*((1:md)-1) loc.y <- rev(loc.x) # minka: adjust loc for labels if(!is.null(labels)) { # technically, strheight doesn't work until we've made axes spc <- max(strheight(labels,cex=cex)) loc.y <- rev((2.1+spc)*((1:md)-1)) } loc <- expand.grid(loc.x, loc.y)[1:n.loc, ] } else { if (is.numeric(locations) && length(locations) == 2) { ## all stars around the same origin loc <- cbind(rep(locations[1],n.loc), rep(locations[2],n.loc)) if(!missing(labels) && n.loc > 1) warning("labels don't make sense for a single location") else labels <- NULL } else { if (is.data.frame(locations)) locations <- data.matrix(locations) if (!is.matrix(locations) || ncol(locations) != 2) stop("locations must be a 2-column matrix.") loc <- .Alias(locations) if (n.loc != nrow(loc)) stop("number of rows of locations and x must be equal.") } } ## Angles start at zero and pace around the circle counter ## clock-wise in equal increments. angles <- if(full) seq(0, 2*pi, length=n.seg+1)[-(n.seg+1)] else if (draw.segments) seq(0, pi, length=n.seg+1)[-(n.seg+1)] else seq(0, pi, length=n.seg) if (length(angles) != n.seg) stop("length(angles) must be the same as ncol(x)") ## Missing values are treated as 0 x[is.na(x)] <- 0 if (scale) { # minka: use full range of [0.1,1] x <- sweep(x,2,apply(x,2,min),FUN="-") x <- sweep(x,2,apply(x,2,max)/0.9,FUN="/") x <- sweep(x,2,0.1,FUN="+") ## Columns of 0s will put NAs in x, next line gets rid of them x[is.na(x)] <- 0 # sqrt because we are depicting area x = sqrt(x) } x <- x * len # minka: place key automatically at top if(!is.null(key.loc) && is.na(key.loc)) { key.loc <- c(mean(loc[[1]]), max(loc[[2]]) + 2.5) } if(is.null(key.labels)) key.labels <- dimnames(x)[[2]] if(is.null(xlim)) { xlim <- range(loc[,1] + len, loc[,1] - len) if(!is.null(key.loc)) { w <- max(strwidth(key.labels,cex=cex)) xlim[1] <- min(xlim[1],key.loc[1]-1-w) xlim[2] <- max(xlim[2],key.loc[1]+1+w) } } if(is.null(ylim)) { ylim <- range(loc[,2] + len, loc[,2] - len) if(!is.null(labels)) { spc <- max(strheight(labels,cex=cex)) ylim[1] <- min(loc[,2] - 1 - spc) } if(!is.null(key.loc)) { h <- max(strheight(key.labels,cex=cex)) ylim[1] <- min(ylim[1],key.loc[2]-1-h) ylim[2] <- max(ylim[2],key.loc[2]+1+h) } } ## The asp argument keeps everything square plot(0, type="n", ..., xlim=xlim, ylim=ylim, xlab="", ylab="", asp = 1, axes = axes) # minka: abbreviate labels if necessary if(!is.null(labels) && abbrev) { labels <- abbreviate.width(labels,2,cex=cex) } if ( draw.segments ) { ## for each location, draw a segment diagram for ( i in 1:n.loc ) { poly.x <- NA poly.y <- NA start.x <- x[i,] * cos( angles ) + loc[i,1] start.y <- x[i,] * sin( angles ) + loc[i,2] ### FIXME : we can do without the following inner loop ! for (j in 1:n.seg) { poly.x <- c(poly.x,loc[i,1],start.x[j]) poly.y <- c(poly.y,loc[i,2],start.y[j]) next.angle <- if(j < n.seg) angles[j+1] else (if(full) 360 else 180) * deg k <- seq(from = angles[j], to = next.angle, by = deg) poly.x <- c(poly.x, x[i,j] * cos( k ) + loc[i,1], NA) poly.y <- c(poly.y, x[i,j] * sin( k ) + loc[i,2], NA) } polygon(poly.x, poly.y, col = seg.colors, lwd=lwd) if (!is.null(labels)) text(loc[i,1], loc[i,2] - if(full)max(x) else 0.1 * max(x), labels[i], cex=cex, adj=c(0.5,1)) } } # Segment diagrams else { # Draw stars instead for ( i in 1:n.loc ) { temp.x <- x[i,] * cos( angles ) + loc[i,1] temp.y <- x[i,] * sin( angles ) + loc[i,2] if (radius) segments(rep(loc[i,1],n.seg), rep(loc[i,2],n.seg), temp.x, temp.y, lwd=lwd) lines(c(temp.x, temp.x[1]), c(temp.y, temp.y[1]), lwd=lwd) if (!is.null(labels)) text(loc[i,1], loc[i,2] - if(full)max(x) else 0.1 * max(x), labels[i], cex=cex, adj=c(0.5,1)) } } if ( !is.null(key.loc) ) { ## Draw unit key ## allow drawing outside plot region (inside figure region): op <- par(xpd = key.xpd); on.exit(par(op)) key.r <- len * 0.8 key.x.coord <- cos( angles ) * key.r + key.loc[1] key.y.coord <- sin( angles ) * key.r + key.loc[2] if ( draw.segments ) { key.x <- NA key.y <- NA for (j in 1:n.seg){ key.x <- c(key.x,key.loc[1],key.x.coord[j]) key.y <- c(key.y,key.loc[2],key.y.coord[j]) k <- angles[j] + deg next.angle <- if (j < n.seg) angles[j+1] else (if(full) 360 else 180) * deg while (k < next.angle) { key.x <- c(key.x, key.r * cos( k ) + key.loc[1]) key.y <- c(key.y, key.r * sin( k ) + key.loc[2]) k <- k + deg } key.x <- c(key.x, key.r * cos( next.angle ) + key.loc[1], NA) key.y <- c(key.y, key.r * sin( next.angle ) + key.loc[2], NA) } polygon(key.x, key.y, col = seg.colors, lwd=lwd) } else { # draw unit star if ( radius ) segments(rep(key.loc[1],n.seg), rep(key.loc[2],n.seg), key.x.coord, key.y.coord, lwd=lwd) lines(c(key.x.coord, key.x.coord[1]), c(key.y.coord, key.y.coord[1]), lwd=lwd) } lab.angl <- angles + if(draw.segments) (angles[2] - angles[1]) / 2 else 0 label.r <- len*0.9 label.x <- cos( lab.angl ) * label.r + key.loc[1] label.y <- sin( lab.angl ) * label.r + key.loc[2] ##-- FIXME : Do the following witout loop ! for (k in 1:n.seg) { text.adj <- c(## horizontal if (lab.angl[k] < 90*deg || lab.angl[k] > 270*deg) 0 else if (lab.angl[k] > 90*deg && lab.angl[k] < 270*deg) 1 else 0.5, ## vertical if (lab.angl[k] <= 90*deg) (1 - lab.angl[k] / (90*deg)) /2 else if (lab.angl[k] <= 270*deg) (lab.angl[k] - 90*deg) / (180*deg) else ## lab.angl[k] > 270*deg 1 - (lab.angl[k] - 270*deg) / (180*deg) ) text(label.x[k], label.y[k], labels= key.labels[k], cex = cex, adj = text.adj) } } # Unit key is drawn and labelled invisible() } qp = function(A,b,tol=1e-10) { # returns the vector x which minimizes x'x subject to Ax=b. #library(MASS) # standardize the problem s = sqrt(rowSums(A * A)) A = A/rep.col(s,ncol(A)) b = b/s d = ncol(A) x = array(0,c(d,1)) ws = c() for(iter in 1:100) { # find most violated constraint s = (A %*% x) - b # constraints in the working set must be satisfied # this is to avoid problems with numerical inaccuracy s[ws] = 0 i = which.min(s) if(s[i] >= -tol) break if(i %in% ws) { print(s[i]) stop("double violated constraint") } ws = c(ws,i) while(T) { # solve equality-constrained problem Aw = A[ws,,drop=F] bw = b[ws,drop=F] if(T) { y = solve(Aw %*% t(Aw), bw) } else { y = ginv(Aw %*% t(Aw)) %*% bw } x = t(Aw) %*% y if(any(y<=eps)) { # remove y<0 from working set ws = ws[y>eps] } else break } #cat("working set:",ws,"\n",fill=60) } x } # not used collision = function(x,y,w,h) { # returns T iff the first box collides with any of other boxes n = length(x) for(i in 2:n) { if((abs(x[1]-x[i]) < (w[1]+w[i])/2) && (abs(y[1]-y[i]) < (h[1]+h[i])/2)) return(T) } F } find.gap <- function(base=0,h0,y,h,symmetric=T) { # returns the smallest position of a box of size h0 which will not collide # with boxes of size h centered on y. # sort y ord = order(y) y = y[ord] h = h[ord] n = length(y) # compute all gaps dy = diff(y)-(h[1:(n-1)]+h[2:n])/2 # dy must be positive since there are no collisions among y's i = which(dy > h0) # upper end y.new = max(y[n]+(h[n]+h0)/2,base) if(length(i) > 0) { # this isn't quite optimal # lower bound y.new = c(y.new, y[i]+(h[i]+h0)/2) i.next = pmin(i+1,n) # upper bound y.new = c(y.new, y[i.next]-(h[i.next]+h0)/2) } if(symmetric) { # lower end y.new = c(y.new, y[1]-(h[1]+h0)/2) } if(!symmetric) { y.new = y.new[y.new >= base] if(length(y.new) == 0) stop("empty") } y.new[which.min(abs(y.new - base))] } move.collisions.vertical <- function(x,w,h,base=0,symmetric=T) { # find the minimal vertical shifts so that boxes of width w and height h # centered on x do not overlap. n = length(x) if(length(w) == 1) w = rep(w,n) if(length(h) == 1) h = rep(h,n) if(length(base) == 1) base = rep(base,n) y = numeric(n) step = median(h)/4 # loop in reverse order y[n] = base[n] for(i in (n-1):1) { # find nearby boxes (those for which overlap is possible) is = (i+1):n j = is[abs(x[i]-x[is]) < (w[i]+w[is])/2] # now only need to avoid vertical overlap if(length(j) == 0) y[i] = base[i] else y[i] = find.gap(base[i],h[i],y[j],h[j],symmetric) if(y[i] < base[i] && !symmetric) { print(y[i]) print(base[i]) print(h[i]) print(y[j]) print(h[j]) stop("") } } y } move.collisions <- function(x,w) { # move x's the minimal amount so that boxes of width w centered on x # do not overlap. # units of x and w must match (preferably inches). if(length(x) == 1) return(x) if(F) { # jitter x x = x + 2*eps*runif(length(x)) x = x + runif(length(x)) } ord = order(x) x = x[ord] w = w[ord] # construct constraint matrix # constraints are (x[j]+dx[j])-(x[i]+dx[i]) > (w[j]+w[i])/2 # or dx[j]-dx[i] > (w[j]+w[i])/2 -(x[j]-x[i]) # for all j>i n = length(x) nc = n*(n-1)/2 A = array(0,c(nc,n)) b = array(0,c(nc,1)) index = 1 for(i in 1:(n-1)) { for(j in (i+1):n) { A[index,j] = 1 A[index,i] = -1 b[index] = (w[j]+w[i])/2 - (x[j]-x[i]) index = index + 1 } } colnames(A) = names(x) x = x + drop(qp(A,b)) # restore original order x[ord] = x x } move.collisions2 <- function(x,y,w,h,cost) { # move (x,y) the minimal amount so that boxes of width w and height h # centered on (x,y) do not overlap. # units of (x,y) and (w,h) must match (preferably inches). # cost is a vector specifying how immovable each box is. # cost[i] = Inf means box i will not be moved. n = length(x) # construct constraint matrix A # columns are ordered (dx[1:n],dy[1:n]) # constraints are (x[j]+dx[j])-(x[i]+dx[i]) > (w[j]+w[i])/2 # or dx[j]-dx[i] > (w[j]+w[i])/2 -(x[j]-x[i]) # dy[j]-dy[i] > (h[j]+h[i])/2 -(y[j]-y[i]) # each pair j>i has either the dx or dy constraint. nc = n*(n-1)/2 A = array(0,c(nc,2*n)) b = array(0,c(nc,1)) index = 1 for(i in 1:(n-1)) { for(j in (i+1):n) { # find out which constraint is least violated w.spc = (w[j]+w[i])/2 h.spc = (h[j]+h[i])/2 if(abs(x[j]-x[i])-w.spc > abs(y[j]-y[i])-h.spc) { if(x[j] > x[i]) { A[index,j] = 1 A[index,i] = -1 b[index] = w.spc - (x[j]-x[i]) } else { A[index,i] = 1 A[index,j] = -1 b[index] = w.spc - (x[i]-x[j]) } } else { if(y[j] > y[i]) { A[index,j+n] = 1 A[index,i+n] = -1 b[index] = h.spc - (y[j]-y[i]) } else { A[index,i+n] = 1 A[index,j+n] = -1 b[index] = h.spc - (y[i]-y[j]) } } index = index + 1 } } if(!missing(cost)) A = scale.cols(A,rep(1/cost,2)) #colnames(A) = c(names(x),names(y)) r = drop(qp(A,b)) if(!missing(cost)) r = r / rep(cost,2) x = x + r[1:n] y = y + r[(1:n)+n] cbind(x,y) } # modified from rw1030/library/base/R/base/read.ftable read.array <- function(file, sep = "", quote = "\"", skip = 0) { z <- count.fields(file, sep, quote, skip) if(z[2] != max(z)) stop("unknown array representation") if(z[1] == 1) { ## Variable names and levels. File looks like ## cvar.nam ## rvar.1.nam ... rvar.k.nam cvar.lev.1 ... cvar.lev.l ## rvar.1.lev.1 ... rvar.k.lev.1 ... ... ... ## # read the header n.col.vars <- 1 col.vars <- vector("list", length = n.col.vars) s <- scan(file, what = "", sep = sep, quote = quote, nlines = 2, skip = skip, quiet = TRUE) names(col.vars) <- s[1] s <- s[-1] n.row.vars <- z[max(which(z == max(z)))] - z[length(z)] + 1 row.vars <- vector("list", length = n.row.vars) i <- 1:n.row.vars names(row.vars) <- s[i] col.vars[[1]] <- s[-i] z <- z[3:length(z)] skip <- skip + 2 } else if(z[1] == min(z[2:length(z)])-1) { ## Variable levels, no names. File looks like ## cvar.lev.1 ... cvar.lev.l ## rvar.1.lev.1 ... rvar.k.lev.1 ... ... ... n.col.vars <- 1 col.vars <- vector("list", length = n.col.vars) s <- scan(file, what = "", sep = sep, quote = quote, nlines = 1, skip = skip, quiet = TRUE) col.vars[[1]] <- s n.row.vars <- z[max(which(z == max(z)))] - z[length(z)] + 1 row.vars <- vector("list", length = n.row.vars) z <- z[2:length(z)] skip <- skip + 1 } else if(all(z == max(z))) { ## Just a block of numbers. values <- scan(file, sep = sep, quote = quote, quiet = TRUE, skip = skip) dim(values) <- c(z[1],length(z)) return(t(values)) } else { stop("unknown array representation") } # read the rest of the file s <- scan(file, what = "", sep = sep, quote = quote, quiet = TRUE, skip = skip) # select the data portion is.row.lab <- rep(rep(c(TRUE, FALSE), length(z)), c(rbind(z - min(z) + 1, min(z) - 1))) values <- as.numeric(s[!is.row.lab]) # label portion tmp <- s[is.row.lab] len <- length(tmp) p <- 1 n <- integer(n.row.vars) for(k in seq(from = 1, to = n.row.vars)) { # n[k] is the cardinality of variable k n[k] <- sum(z >= max(z) - k + 1) / p # minka: added this line p <- n[k] # i indexes the lines i <- seq(from = 1, to = len, by = len / n[k]) row.vars[[k]] <- unique(tmp[i]) tmp <- tmp[seq(from = 2, to = len / n[k])] len <- length(tmp) } # minka: transposed from R version dim(values) <- c(sapply(col.vars, length),sapply(rev(row.vars), length)) dimnames(values) <- c(col.vars,rev(row.vars)) aperm(values) } # from rw1030/library/base/R/base/write.ftable # modified to accept arrays write.array <- function(x, file = "", quote = TRUE, digits = getOption("digits")) { ox <- x charQuote <- function(s) { if(length(s) > 1) sapply(s,charQuote) else { if(!is.null(s) && length(grep(" ",s))>0) paste("\"", s, "\"", sep = "") else s } } makeLabels <- function(lst) { lens <- sapply(lst, length) cplensU <- c(1, cumprod(lens)) cplensD <- rev(c(1, cumprod(rev(lens)))) y <- NULL for (i in rev(seq(along = lst))) { ind <- 1 + seq(from = 0, to = lens[i] - 1) * cplensD[i + 1] tmp <- character(length = cplensD[i]) tmp[ind] <- charQuote(lst[[i]]) y <- cbind(rep(tmp, times = cplensU[i]), y) } y } dn <- dimnames(x) if(is.null(dn)) { x <- format(unclass(x),digits=digits) cat(t(x), file = file, sep = c(rep(" ", ncol(x) - 1), "\n")) return(invisible(ox)) } xrv <- dn[1:(length(dn)-1)] xcv <- dn[length(dn)] LABS <- rbind(matrix("", nr = length(xcv), nc = length(xrv)), charQuote(names(xrv)), makeLabels(xrv)) x <- aperm(x) dim(x) <- c(prod(sapply(xcv,length)),prod(sapply(xrv,length))) x <- t(x) DATA <- rbind(t(makeLabels(xcv)), format(unclass(x), digits = digits)) if(!is.null(names(xcv))) { DATA <- rbind(c(charQuote(names(xcv)),rep("",ncol(x)-1)),DATA) } x <- cbind(apply(LABS, 2, format, justify = "left"), apply(DATA, 2, format, justify = "right")) cat(t(x), file = file, sep = c(rep(" ", ncol(x) - 1), "\n")) invisible(ox) } # extra routines for linear regression # Tom Minka panel.hist <- function(x, ...) { # helper function for pairs(), from examples # usage: pairs(x,diag.panel=panel.hist) # put histograms on the diagonal usr <- par("usr"); on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) h <- hist(x, plot = FALSE) breaks <- h$breaks; nB <- length(breaks) y <- h$counts; y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, ...) } # default color for partial residuals formals(termplot)$col.res <- 1 # default lowess span #formals(panel.smooth)$span <- 4/5 #formals(lowess)$f <- 3/4 # returns a model frame where the responses have been replaced by their # residuals under the fit residual.frame <- function(object,data) { if(missing(data)) { x <- model.frame(object) resp <- response.var(x) if(inherits(object,"glm")) { p <- object$fitted.value x[[resp]] <- (object$y - p)/p/(1-p) } else x[[resp]] <- residuals(object) } else { x <- data resp <- response.var(object) # only for lm, not glm if(inherits(object,"glm")) stop("can't compute residuals for glm") # expand p to include NAs p <- predict(object,x)[rownames(x)] names(p) <- NULL x[[resp]] <- x[[resp]] - p #attr(x,"terms") <- terms(object) } x } partial.residuals <- function(object,omit,exact=T) { resp <- response.var(object) pred <- predictor.vars(object) if(exact) { x <- list() for(i in 1:length(omit)) { fit <- update(object,paste(".~. -",omit[i])) #cat(pred[i],coef(fit),"\n") x[[i]] <- residuals(fit) } names(x) <- omit } else { x <- data.frame(residuals(fit,type="partial")) r = residuals(fit) for(i in 1:length(x)) { names(x[[i]]) = names(r) } } x } partial.residual.frame <- function(object,omit) { x = model.frame(object) resp <- response.var(x) x[[resp]] = partial.residuals(object,omit)[[1]] pred = predictor.vars(formula(paste(resp,"~",omit))) x[c(pred,resp)] } ############################################################################## expand.frame <- function(data,v) { if(length(v) == 0) return(data) vs = strsplit(v,":") for(i in 1:length(v)) { data[[v[[i]]]] <- apply(data[vs[[i]]],1,prod) } data } predict.plot.default <- function(x,y,names.,xlab,ylab,...) { frame = data.frame(x,y) if(!missing(names.)) rownames(frame) = names. if(missing(xlab)) xlab = deparse(substitute(x)) if(missing(ylab)) ylab = deparse(substitute(y)) predict.plot.data.frame(frame,xlab=xlab,ylab=ylab,...) } predict.plot.data.frame <- function(x,given,given.lab,nlevels=2, layout,partial,type=c("prob","logit","probit"), highlight,key,se=T, identify.pred=F, scol="red",slwd=2, span=2/3,degree=2,family="symmetric", mar=NA,bg=par("bg"), xaxt=par("xaxt"),yaxt=par("yaxt"), color.palette=default.colors, asp=NULL, pch.palette=default.pch,main=NULL,xlab,ylab,...) { std.error = function(x) sd(x)/sqrt(length(x)) type <- match.arg(type) resp <- response.var(x) pred <- predictor.terms(x) x <- expand.frame(x,setdiff(pred,predictor.vars(x))) if(!missing(given)) { if(missing(key)) key = T if(!is.factor(given)) { # make given a factor b <- break.quantile(given,nlevels) given <- rcut(given,b) } else nlevels = length(levels(given)) if(is.function(color.palette)) color.palette = color.palette(nlevels) lev = 1:nlevels level.color = color.palette[((lev-1) %% length(color.palette))+1] level.pch <- pch.palette[(((lev-1) %/% length(color.palette)) %% length(pch.palette))+1] } else key = F if(missing(layout)) layout <- auto.layout(length(pred) + key) opar <- par(bg=bg) if(any(layout != c(1,1))) { opar = c(opar,par(mfrow=layout)) } if(!is.null(mar)) { if(is.na(mar)) { mar = auto.mar(main=main,xlab=if(missing(xlab)) "y" else xlab, ylab=if(missing(ylab)) "y" else ylab,xaxt=xaxt,yaxt=yaxt) } opar <- c(opar,par(mar=mar)) } on.exit(par(opar)) if(!missing(partial)) { if(missing(highlight)) highlight <- predictor.terms(partial) if(!missing(given.lab)) { if(given.lab %in% predictor.vars(partial)) { partial = update(partial,paste(".~.-",given.lab)) } } pres <- partial.residuals(partial,pred,exact=T) } else if(missing(highlight)) highlight <- NULL if(missing(ylab)) ylab = resp for(i in pred) { if(!missing(partial)) { # sometimes pres is longer than x, because of NAs x[[resp]] <- pres[[i]][rownames(x)] if(is.null(x[[resp]])) stop(paste("partial of",i,"not found")) } if(is.factor(x[[i]]) && !is.ordered(x[[i]])) { x[[i]] <- sort.levels(x[[i]],as.numeric(x[[resp]])) } # plot points if(missing(given)) { # no given if(!is.na(scol)) { # smooth curve #k <- is.finite(x[,i]) & is.finite(x[[resp]]) # keep warnings about infinite points k <- !is.na(x[,i]) & !is.na(x[[resp]]) fit = NULL if(is.factor(x[[resp]])) { # factor response if(length(levels(x[[resp]]))==2 && !is.factor(x[[i]])) { if(type=="prob") { fit = loprob(x[k,i], x[k,resp]) if(F) { # piecewise constant approx library(tree) frame = data.frame(x=x[k,i],y=x[k,resp]) tr = tree(y~x,frame) partition.tree(tr,add=T,force=F) } } else { fit <- loprob(x[k,i], x[k,resp]) p <- fit$y-1 p <- switch(type,logit=log(p/(1-p)),probit=qnorm(p)) fit$y <- p+1.5 plot(x[[i]],2*as.numeric(x[[resp]])-3,xlab=i,ylab=type) } } } else { if(!is.factor(x[[i]])) { if(!all(is.finite(x[k,i]))) stop(paste(i,"has infinite values.\nCheck your transformation.")) if(!all(is.finite(x[k,resp]))) stop(paste(resp,"has infinite values.\nCheck your transformation.")) if(length(k) > 1500) { # save time on dense plots fit = lowess(x[k,i], x[k,resp], f=span) } else { fit = lowess2(x[k,i], x[k,resp], f=span, degree=degree, family=family) } } } } real.asp = if(identical(asp,"auto")) auto.aspect(fit) else asp if(is.factor(x[[resp]])) { # factor response if(type=="prob") { if(is.factor(x[[i]])) { # should sort if the factor is not ordered mosaicplot(table(x[[i]], x[[resp]]), xlab=i, ylab=ylab, ...) } else { plot(x[[i]], x[[resp]], xlab=i, ylab=ylab, ...) } } } else if(is.factor(x[[i]])) { rt = rtable(formula(paste(resp,"~",i)),x) se = rtable(formula(paste(resp,"~",i)),x,fun=std.error) * 1.64 linechart(rt,se) } else { # numeric response plot(x[[i]], x[[resp]], xlab="", ylab=ylab, main=main, asp=real.asp, ...) if(missing(xlab)) { if(i %in% highlight) title(xlab=i,font.lab=2) else title(xlab=i) } else title(xlab=xlab) if(!missing(partial)) { a <- coef(partial) a0 = a["(Intercept)"] if(is.null(a0)) a0 = 0 #if(!is.na(a[i])) abline(a0,a[i],col=3,lty=2) } } if(!is.null(fit)) lines(fit,col=scol,lwd=slwd) # identify if((identify.pred == T) || (i %in% identify.pred)) { identify(x[k,i],x[k,resp],labels=rownames(x)[k]) } } else if(is.factor(x[[i]])) { fmla = formula(paste(resp,"~",i,"+",given.lab)) x[[given.lab]] = given rt = rtable(fmla,x) if(se) { rt.se = rtable(fmla,x, fun=std.error) * 1.64 rt.se[is.na(rt.se)] = sd(x[[resp]]) linechart(t(rt),t(rt.se),xlab=" ", xaxt=xaxt,yaxt=yaxt,row.labels=NULL,stand=F) } else { linechart(t(rt),xlab=" ", xaxt=xaxt,yaxt=yaxt,row.labels=NULL,stand=F) } font.lab = if(i %in% highlight) 2 else 1 title(xlab=i,font.lab=font.lab) } else { # condition on the given variable plot.new() box() title(ylab=ylab) if(i %in% highlight) title(xlab=i,font.lab=2) else title(xlab=i) if(is.factor(x[[i]])) { # setup window for factor xlim <- length(levels(x[[i]])) plot.window(xlim=c(0.5,xlim+0.5),ylim=range(x[[resp]])) if(xaxt != "n") axis(1,1:xlim,labels=levels(x[[i]])) if(yaxt != "n") axis(2) cat(paste("jittering",i,"\n")) } else { # setup window for numeric plot.window(xlim=range(x[[i]],na.rm=T),ylim=range(x[[resp]],na.rm=T)) if(xaxt != "n") axis(1) if(yaxt != "n") axis(2) } lev <- levels(given) for(g in 1:length(lev)) { color <- level.color[g] pch <- level.pch[g] val <- lev[g] k <- !is.na(x[,i]) & !is.na(x[,resp]) k = k & (given == val) if(is.factor(x[[i]])) { jitter <- (runif(length(x[k,i]))-0.5)/5 points(as.numeric(x[k,i])+jitter, x[k,resp], col=color, pch=pch, ...) } else { points(x[k,i], x[k,resp], col=color, pch=pch, ...) } if(is.factor(x[[resp]])) { fit = loprob(x[k,i], x[k,resp]) } else { fit = lowess2(x[k,i], x[k,resp], f=span, degree=degree) } lines(fit,col=color,lwd=slwd) } } } # show legend for the given variable, as a separate panel # this is better than putting a color key on each plot if(key) { plot.new() if(!missing(given.lab)) { font.lab = if(given.lab %in% highlight) 2 else 1 title(xlab=given.lab,font.lab=font.lab) } lev = levels(given) y <- cumsum(strheight(lev)+0.02) for(i in 1:length(lev)) { color <- color.palette[i] val <- lev[i] text(0.5,0.5+y[i],val,col=color,adj=0.5) if(length(unique(level.pch))>1) points(0.6+strwidth(val)/2,0.5+y[i],pch=level.pch[i]) } } } predict.plot.lm <- function(object,data,partial=F,...) { if(!partial) { if(missing(data)) { res <- residual.frame(object) } else { res <- residual.frame(object,data) } cat("plotting residuals\n") predict.plot.data.frame(res,highlight=predictor.terms(object),...) } else { if(missing(data)) data <- model.frame(object) cat("plotting partial residuals\n") predict.plot.data.frame(data,partial=object,...) } } given.vars <- function(formula) { rhs <- formula[[3]] if(is.call(rhs) && (deparse(rhs[[1]]) == "|")) { # remove givens from formula deparse(rhs[[3]]) } else NULL } remove.given <- function(formula) { rhs <- formula[[3]] formula[[3]] <- rhs[[2]] formula } predict.plot.formula <- function(formula,data=parent.frame(),...) { # formula has givens? given = given.vars(formula) if(!is.null(given)) { formula = remove.given(formula) if(is.environment(data)) g <- get(given,env=data) else g <- data[[given]] if(is.null(g)) stop(paste("variable \"",given,"\" not found",sep="")) return(predict.plot.formula(formula,data, given=g,given.lab=given,...)) } if(F) { expr <- match.call(expand = F) expr$... <- NULL expr$na.action <- na.pass expr[[1]] <- as.name("model.frame.default") x <- eval(expr, parent.frame()) } else { # formula has its own environment x <- model.frame.default(formula,data,na.action=na.pass) } predict.plot.data.frame(x,...) } predict.plot <- function(object, ...) UseMethod("predict.plot") ############################################################################## expand.cross <- function(object) { resp <- response.var(object) pred <- predictor.vars(object) # quadratic terms only pred2 <- c() for(i in pred) { for(j in pred) { if(match(i,pred) < match(j,pred)) pred2 = c(pred2,paste(i,j,sep=":")) } } pred = c(pred,pred2) formula(paste(resp,"~",paste(pred,collapse="+"))) } interact.plot.data.frame <- function(x,ypred,partial,highlight,span=0.75, scol="red",slwd=2,type=c("*",":"), bg=par("bg"),...) { library(modreg) type = match.arg(type) resp <- response.var(x) pred = predictor.vars(x) if(missing(ypred)) ypred = pred n = length(pred) mar=c(0.05,0.05,0,0.05) if(all(ypred == pred)) layout = c(n,n) else layout = auto.layout(n*length(ypred)) opar = par(mfrow=layout,mar=mar,bg=bg) on.exit(par(opar)) if(!missing(partial)) { if(missing(highlight)) highlight <- predictor.terms(partial) } else if(missing(highlight)) highlight <- NULL for(py in ypred) { for(px in pred) { if(px == py) { plot.new() box() y = 0.5 if(is.factor(x[[py]])) { auto.legend(levels(x[[py]]),default.colors(nlevels(x[[py]]))) #y = 0.5 } text(0.5,y,py,font=if(py %in% highlight) 2 else 1) } else if(!is.factor(x[[py]]) && (match(py,pred) < match(px,pred))) { plot.new() } else { y = x[c(px,py,resp)] if(!missing(partial)) { predxy = paste(px,type,py,sep="") fit = update(partial,paste(".~.-",predxy)) y[[resp]] = residuals(fit) } mar = c(0.05,0.05,0,0.05) if(is.factor(y[[px]]) && is.factor(y[[py]])) { y = rtable(formula(y),y) xlab = if(px %in% ypred) "" else px linechart(t(y),xlab=xlab,ylab="",yaxt="n",row.labels=NULL) } else if(is.factor(y[[px]])) { plot.new() } else if(is.factor(y[[py]])) { fmla = formula(paste(resp,"~",px,"|",py)) predict.plot(fmla,y,xlab="",ylab="",xaxt="n",yaxt="n",key=F) } else if(is.factor(y[[resp]])) { color.plot(y, xlab="",ylab="",mar=mar,key=F,axes=F,...) } else { fmla = formula(paste(resp,"~",px,"+",py)) color.plot(loess(fmla,y,span=span), xlab="",ylab="",mar=mar,key=F,axes=F,...) } predxy = paste(px,":",py,sep="") predyx = paste(py,":",px,sep="") if(any(c(predxy,predyx) %in% highlight)) { box(col="red") } else { box() } } } } } interact.plot.lm <- function(object,data,partial=F,...) { if(!partial) { if(missing(data)) { res <- residual.frame(object) } else { res <- residual.frame(object,data) } cat("plotting residuals\n") interact.plot.data.frame(res,highlight=predictor.vars(object),...) } else { if(missing(data)) data <- model.frame(object) cat("plotting partial residuals\n") interact.plot.data.frame(data,partial=object,...) } } interact.plot <- function(object, ...) UseMethod("interact.plot") ############################################################################## # glm stuff predict.frame <- function(object) { # returns a frame relating predicted values to actual values # for glms, the predicted value is the value before transformation resp = response.var(object) frame = data.frame(z=predict(object),response=model.frame(object)[[resp]]) names(frame)[2] = resp frame } model.plot <- function(object,data,partial=F,smooth=F,se=F, col="green",lwd=2,add=F,lty=1, bg=par("bg"),...) { if(missing(data)) data <- model.frame(object) resp = response.var(object) pred = predictor.terms(object) data <- expand.frame(data,setdiff(pred,predictor.vars(x))) p <- predict(object,data,type="response",se=se) if(inherits(object,"glm")) { if(!se) p = p+1 else p$fit = p$fit+1 } if(add) { # must have one predictor term if(length(pred) > 1) stop("model has more than one predictor") x <- data[[pred]] i <- order(x) x <- sort(x) if(!se) { lines(x,p[i],col=col,lwd=lwd,lty=lty,...) } else { lines(x,p$fit[i],col=col,lwd=lwd,lty=lty,...) lines(x,p$fit[i]+p$se[i],col=col,lty=2,lwd=lwd,...) lines(x,p$fit[i]-p$se[i],col=col,lty=2,lwd=lwd,...) } return(invisible(p)) } layout <- auto.layout(length(pred)) opar <- par(bg=bg) if(any(layout != c(1,1))) { opar = c(opar,par(mfrow=layout)) } on.exit(par(opar)) for(v in pred) { plot(data[[v]],data[[resp]],xlab=v,ylab=resp,...) x = data[[v]] if(length(pred) == 1) { i <- order(x) lines(x[i],p[i],col=col,lwd=lwd,lty=lty,...) } else { if(partial) { fit <- update(object,paste(".~. -",v)) p = predict(fit,data,type="response",se=se) if(inherits(object,"glm")) { if(!se) p = p+1 else p$fit = p$fit+1 } } lines(smooth(x,p),col=col,lwd=lwd,lty=lty,...) } if(smooth) { lines(loprob(x,data[[resp]]),col=2,lwd=lwd,lty=lty,...) } } } misclass.glm <- function(fit,data,rate=F) { # model.frame is needed to evaluate I() terms if(missing(data)) data <- model.frame(fit) else data = model.frame(formula(fit),data) resp <- response.var(fit) lev = levels(data[[resp]]) p <- factor.logical(predict(fit,data,type="response")>0.5,labels=lev) r = sum(as.numeric(p) != as.numeric(data[[resp]])) if(rate) r/nrow(data) else r } misclass.nnet = function(fit,data) { # model.frame is needed to evaluate I() terms if(missing(data)) data <- model.frame(fit) else data = model.frame(formula(fit),data) resp <- response.var(fit) p <- predict(fit,data,type="class") return(sum(p != as.character(data[[resp]]))) } misclass <- function(object, ...) UseMethod("misclass") formula.nnet = formula.lm deviance.glm <- function(object, newdata) { if(missing(newdata)) return(object$deviance) resp <- response.var(object) if(object$family$family == "binomial") { truth <- (as.numeric(newdata[[resp]]) == 2) p <- predict(object,newdata,type="response") p[p == 0] <- 1e-3 p[!truth] <- 1-p[!truth] -2*sum(log(p)) } else { stop("family not handled") } } confusion.tree <- function(tr,data,p=0.5) { resp <- response.var(tr) if(p == 0.5) { r <- predict(tr,data,type="class") } else { r = predict(tr,data,type="vector")[,2] r = factor.logical(r>p,labels=levels(data[[resp]])) } table(truth=data[[resp]],predicted=r) } confusion.knn = confusion.tree factor.logical <- function(x,labels=c("No","Yes")) { f <- factor(x,levels=c(F,T)) levels(f) <- labels f } confusion.glm <- function(fit,data,p=0.5) { # model.frame is needed to evaluate I() terms if(missing(data)) data <- model.frame(fit) else data = model.frame(formula(fit),data) resp <- response.var(fit) r = predict(fit,data,type="response") r <- factor.logical(r>p,labels=levels(data[[resp]])) table(truth=data[[resp]],predicted=r) } confusion <- function(object, ...) UseMethod("confusion") # expands a model formula to include all squares and cross-products of the # predictors expand.quadratic <- function(fmla,cross=T) { resp <- response.var(fmla) pred <- predictor.vars(fmla) pred2 <- sapply(pred,function(s) paste("I(",s,"^2)",sep="")) s <- paste(resp,"~",paste(pred,collapse="+"), "+",paste(pred2,collapse="+")) len <- length(pred) if(cross && len > 1) { pred3 <- c() for(i in 1:(len-1)) { for(j in (i+1):len) { pred3 <- c(pred3, paste(pred[i],":",pred[j],sep="")) } } s <- paste(s,"+",paste(pred3,collapse="+")) } formula(s) } logistic <- function(formula,data) { #data <- model.frame(formula,data) resp = response.var(formula) pred <- predictor.vars(formula) if(!all(sapply(data[pred],is.numeric))) stop("all predictors must be numeric") object <- list(terms=terms(data),data=data) class(object) <- "logistic" object$shift = mean(data[pred]) object$scale <- sd(data[pred]) object$scale[object$scale == 0] = 1 data[pred] = scale(data[pred]) w = rep(0,length(predictor.terms(formula))+3) object$nnet = nnet(formula,data,Wts=w,size=1,decay=10) object$levels = levels(data[[resp]]) object } if(T) { logistic <- function(formula,data,...) { environment(formula) = sys.frame(sys.nframe()) glm(formula,data,family=binomial,maxit=50,...) } } predict.logistic <- function(object,test,type=c("class","response","vector"), ...) { resp <- response.var(object) pred <- predictor.vars(object) xt <- scale(test[pred],center=object$shift,scale=object$scale) type <- match.arg(type) if(type == "class") { factor(predict(object$nnet,xt,type="class"),levels=object$levels) } else { # type vector p = predict(object$nnet,xt,type="raw") if(type == "response") { p } else { lev <- object$levels v <- array(0,c(length(p),length(lev)),list(rownames(test),lev)) v[,1] = 1-p v[,2] = p v } } } misclass.logistic = misclass.nnet model.frame.logistic = function(object) object$data # functions for response tables # Tom Minka 11/8/01 # given an aov fit, plot the effects for each predictor # as well as a boxplot of the residuals # you can do an F-test visually by comparing the spread of the effects # with the spread of the residuals effects.plot <- function(object,se=F) { mt <- model.tables(object,se=se) vars <- names(mt$tables) nvar <- length(vars) opar <- par(mar=c(2.5,4,0,0.1)) on.exit(par(opar)) plot.new() ylim <- c(min(sapply(mt$tables,min)), max(sapply(mt$tables,max))) plot.window(xlim=c(0.5,nvar+1+0.5),ylim=ylim) axis(1,1:(nvar+1), labels=c(vars,"residuals")) axis(2) title(ylab=response.var(object)) if(se) { if(!is.numeric(se)) se <- 1.96 p <- (1-2*pnorm(-se))*100 cat("Displaying ",format(p,digits=2),"% confidence intervals\n",sep="") } for(k in 1:nvar) { eff <- mt$tables[[k]] text(k, eff, names(eff)) if(se) { jitter <- runif(length(eff))*0.1 arrows(k+jitter, eff-se*mt$se[[k]], k+jitter, eff+se*mt$se[[k]], code = 3, col = "green", angle = 75, length = .1) } } res <- residuals(object) x <- model.frame(object) res <- tapply(res, x[vars], mean) boxplot(res, at=nvar+1, add=T) } ############################################################################# as.rtable <- function(a,resp="Response",check.names=T) { if(check.names) resp = make.names(resp) nam = names(dimnames(a)) if(is.null(nam)) stop("Dimensions must have names") fmla <- paste(resp,"~", paste(make.names(nam),collapse="+")) attr(a,"terms") <- terms(formula(fmla)) attr(a,"ordered") <- dim.ordered(a) class(a) <- c("rtable","table") a } print.rtable <- function(rt,...) { cat(response.var(rt),"\n") # strip attributes and print as a matrix attributes(rt) <- attributes(rt)[c("dim","dimnames")] print(rt) } terms.rtable <- function(rt) { attr(rt,"terms") } as.data.frame.rtable <- function(rt,...) { df <- as.data.frame.table(rt,...) # preserve ordering of factors i = names(which(dim.ordered(rt))) if(length(i) > 0) df[i] = apply.df(df[i],as.ordered) len <- length(names(df)) names(df)[len] <- response.var(rt) df } as.matrix.rtable <- function(rt,...) { class(rt) = NULL rt } row.probs <- function(x,smooth=0.5,se=F) { # converts ctable to rtable x = x + smooth col.sum = rep.row(colSums(x),nrow(x)) nam = names(dimnames(x)) if(is.null(nam) || nam[1] == "") nam = "row" p = as.rtable(x/col.sum,paste("Probability of",nam[1])) if(se != F) { if(se == T) se = 1.64 p.se = sqrt(p*(1-p)/col.sum) * se list(p=p,se=p.se) } else p } log.counts <- function(x,smooth=0.5,se=F) { x = x+smooth r = as.rtable(log(x),"log(count)") if(se != F) { if(se == T) se = 1.64 se = 1/sqrt(x) * se list(r=r,se=se) } else r } logit.rtable = function(x,smooth=0.5,se=F) { resp = response.var(x) pred = predictor.vars(x) lev = levels(x[[resp]]) i = (x[[resp]]==lev[1]) y1 = table(x[i,pred])+smooth y2 = table(x[!i,pred])+smooth r = log(y2/y1) r = as.rtable(r,paste("Logit of",resp)) if(se != F) { if(se == T) se = 1.64 se = (1/sqrt(y1)+1/sqrt(y2)) * se list(r=r,se=se) } else r } pclass.rtable = function(x,pclass=2,smooth=0.5,se=F) { resp = response.var(x) pred = predictor.vars(x) lev = levels(x[[resp]]) i = (x[[resp]]==lev[pclass]) y1 = table(x[i,pred])+smooth y2 = table(x[!i,pred])+smooth n = y1+y2 p = y1/(y1+y2) p = as.rtable(p,paste("Probability of",resp,"=",lev[pclass])) if(se != F) { if(se == T) se = 1.64 se = sqrt(p*(1-p)/n) * se list(p=p,se=se) } else p } aov.rtable <- function(rt,med=F) { frame <- as.data.frame.rtable(rt) if(med) { p <- medpolish(rt,trace.iter=F) return(structure(terms=terms(rt), model=frame, class="aov")) } aov(terms(rt), frame) } aov.residual <- function(rt,...) { rt - rtable(aov.rtable(rt,...)) } # returns a table of responses, stratified according to the terms in fmla # and aggregated according to fun. rtable.terms <- function(fmla, x, fun = mean) { resp <- response.var(fmla) pred <- predictor.vars(fmla) rt <- tapply(x[[resp]], x[pred], fun) if(length(dim(rt)) == 1) { # force it to be a 1-row table dim(rt) = c(1,length(rt)) dimnames(rt) = named.list(NULL,levels(x[[pred]]),names.=c("",pred)) } class(rt) <- "rtable" attr(rt,"terms") <- fmla attr(rt,"ordered") <- sapply(x[pred],is.ordered) rt } # model.frame is a data.frame with a "terms" attribute describing a model rtable.data.frame <- function(object, ...) { rtable(terms(object), object, ...) } rtable.aov <- function(object, ...) { x <- model.frame(object) resp <- response.var(x) pred <- predictor.vars(x) y <- expand.grid(lapply(x[pred],levels)) y[[resp]] <- predict(object,y) y <- model.frame(terms(x),y) rtable(y, ...) } # legal input: # rtable(y~f1+f2,x) # rtable(y~.,x) # rtable(x$y ~ x$f1 + x$f2) rtable.formula <- function(formula, data, ...) { # convert "|" into "+" rhs = formula[[3]] if(is.call(rhs) && (deparse(rhs[[1]]) == "|")) { rhs[[1]] = as.name("+") formula[[3]] = rhs model = model.frame.default(formula,data) } else { expr <- match.call(expand = F) expr$... <- NULL expr[[1]] <- as.name("model.frame.default") model <- eval(expr, parent.frame()) } return(rtable(model, ...)) } rtable <- function(object, ...) UseMethod("rtable") # plots rows of x as traces # similar to parallel.plot # replacement for dotchart # y is a named vector or matrix linechart <- function(y,se=NULL,xlab,ylab,standard=F,med=F, xscale=c("equal","linear","none"),...) { if(is.list(y) && missing(se)) { # y is list(y,se) se = y[[2]] y = y[[1]] } if(is.vector(y)) y = t(as.matrix(y)) if(!is.matrix(y)) y = as.matrix(y) #if(missing(ylab)) ylab <- deparse(substitute(y)) row.var <- names(dimnames(y))[1] col.var <- names(dimnames(y))[2] if(missing(xlab)) xlab = col.var if(F) { # remove all-NA rows and columns y = y[!apply(y,1,function(z) all(is.na(z))),] y = y[,!apply(y,2,function(z) all(is.na(z)))] } xscale <- match.arg(xscale) if(standard) { if(missing(ylab)) ylab <- paste(row.var, "effect on", response.var(y)) # choose x to make the profiles linear # fails if there are missing values if(med) { library(eda) fit <- medpolish(y,trace.iter=F) col.centers <- fit$col + fit$overall resid <- fit$residuals } else { col.centers <- colMeans(y,na.rm=T) row.effects <- rowMeans(y,na.rm=T) - mean(as.vector(y)) resid <- y - outer(row.effects, col.centers, "+") } # subtract column centers y <- y - rep.row(col.centers,nrow(y)) } else { if(missing(ylab)) ylab <- response.var(y) # sort columns by center #v = col.centers resid <- y - rep.col(rowMeans(y),ncol(y)) } if(any(is.na(resid))) { if(xscale != "none" && !dim.ordered(y)[2]) cat("Warning: NAs prevent automatic ordering\n") xscale = "none" } if(xscale == "none") { v = 1:ncol(y) } else { s <- svd(resid) v <- s$v[,1] } # v is the desired x positions # remove flip ambiguity # choose the ordering which has maximum inner product with 1:ncol if(sum((1:length(v))*v) < sum((length(v):1)*v)) v = -v i = rank.stable(v) in.order = (!dim.ordered(y)[2] || all(i == 1:length(i))) if(in.order) { x <- switch(xscale,linear=v,equal=i,none=i) } else { x <- 1:ncol(y) } i <- order(x) y <- reorder.cols(y,i) x <- x[i] if(!is.null(se)) { se = reorder.cols(se,i) # kludge for call below se = t(se) } names(x) = colnames(y) labeled.curves(x,t(y),se,xlab=xlab,ylab=ylab,...) } labeled.curves <- function(x,y,se=NULL,labels,xlab,ylab,type="o", color.palette=default.colors(6),col, lty.palette=1:6,lty,lwd=2,jitter=0, legend.=F,las=par("las"), xaxt=par("xaxt"),yaxt=par("yaxt"), cex=par("cex"),horizontal=F, mar,ylim,...) { # x is vector # y is a list or matrix of columns # use jitter = 0.02 to avoid overplotting of error bars if(horizontal) legend. = T nam = NULL if(is.matrix(y)) { nam = names(dimnames(y)) y = as.data.frame.matrix(y) } n = length(y) #if(is.data.frame(y)) y = as.list(y) if(missing(xlab)) { if(!is.null(nam)) xlab = nam[1] else xlab = "" } if(missing(ylab)) { if(!is.null(nam)) ylab = nam[2] else ylab = "" } if(missing(mar)) { mar = if(horizontal) auto.mar(xlab=ylab,ylab=xlab,xaxt=xaxt,yaxt=yaxt,las=las) else auto.mar(xlab=xlab,ylab=ylab,xaxt=xaxt,yaxt=yaxt,las=las) } opar <- par(mar=mar) on.exit(par(opar)) plot.new() xlim <- range(x) if(missing(labels)) labels = names(y) if(!is.null(labels) && !legend.) { # make space for labels xspc = 0.1 w <- (max(strwidth(labels,units="inches",cex=cex))+xspc)/par("pin")[1] xlim[2] <- xlim[2] + diff(xlim)*w/(1-w) } if(!is.null(se)) { if(is.matrix(se)) se = as.data.frame.matrix(se) if(any(dim(se) != dim(y))) stop("se not same size as y") if(missing(ylim)) { vec.y = unlist(y) vec.se = unlist(se) # need y in case se is NA ylim = range(c(vec.y,vec.y-vec.se,vec.y+vec.se),finite=T) } # make room for arrow heads w = 0.2/par("pin")[1] dw = diff(xlim)*w/(1-w) xlim[1] = xlim[1] - dw/2 xlim[2] = xlim[2] + dw/2 } else { if(missing(ylim)) ylim = range(unlist(y),finite=T) } if(horizontal) { plot.window(xlim=ylim,ylim=xlim) if(is.null(names(x))) { if(xaxt != "n") axis(2,las=las,...) grid(nx=0) } else { cat("columns are",names(x),"\n") if(xaxt != "n") axis(2,x, labels=names(x),las=las,...) # grid abline(h=seq(1,length(x)),lty="dotted",col="lightgray") } if(yaxt != "n") axis(1,las=las,...) } else { plot.window(xlim=xlim,ylim=ylim) if(is.null(names(x))) { if(xaxt != "n") axis(1,las=las,...) grid(ny=0) } else { cat("columns are",names(x),"\n") if(xaxt != "n") axis(1,x, labels=names(x),las=las,...) # grid abline(v=seq(1,length(x)),lty="dotted",col="lightgray") } if(yaxt != "n") axis(2,las=las,...) } box() q = 1:n if(missing(col)) { if(mode(color.palette) == "function") color.palette <- color.palette(n) col <- color.palette[((q-1) %% length(color.palette))+1] } if(missing(lty)) { lty <- lty.palette[(((q-1) %/% length(color.palette)) %% length(lty.palette))+1] } jitter = jitter*((1:n)-0.5-n/2) if(length(x) > 1) jitter = jitter*min(diff(x)) if(!is.null(labels) && !legend.) { xspc = xinch(xspc) txt.x = txt.y = c() } for(i in 1:n) { if(horizontal) { lines(y[[i]],x,col=col[i],lty=lty[i],type=type,lwd=lwd, ...) if(!is.null(se)) { # arrows do not have lty arrows(y[[i]]-se[[i]],x+jitter[i],y[[i]]+se[[i]],x+jitter[i], len=0.1,angle=75,code=3,col=col[i],lty=lty[i],lwd=lwd,xpd=T,...) } } else { lines(x, y[[i]],col=col[i],lty=lty[i],type=type,lwd=lwd, ...) if(!is.null(se)) { # arrows do not have lty arrows(x+jitter[i],y[[i]]-se[[i]],x+jitter[i],y[[i]]+se[[i]], len=0.1,angle=75,code=3,col=col[i],lwd=lwd,xpd=T,...) } } if(!is.null(labels) && !legend.) { # find the rightmost non-NA column in this row j <- rev(which(!is.na(y[[i]])))[1] txt.x[i] = x[j]+xspc txt.y[i] = y[[i]][j] } } if(!is.null(labels) && !legend.) { # move text vertically to avoid collisions h = strheight(labels,units="inch",cex=cex) names(txt.y) = labels txt.y = yinch(move.collisions(inchy(txt.y),h)) text(txt.x, txt.y, labels, col=col, adj=0, cex=cex, ...) } if(!is.null(labels) && legend.) { #auto.legend(labels,col=col,lty=lty,text.col=col) #legend(par("usr")[2],par("usr")[4],labels,lty=lty,col=col,xjust=1) legend(par("usr")[2],par("usr")[3],labels,lty=lty,col=col,xjust=1,yjust=0) } if(horizontal) title.las(ylab=xlab,xlab=ylab,las=las,...) else title.las(xlab=xlab,ylab=ylab,las=las,...) } title.las <- function(xlab=NULL,ylab=NULL,las=par("las"),...) { if(las == 0) { title(xlab=xlab,ylab=ylab,...) } else if(las == 1) { title(xlab=xlab,...) title(ylab=ylab,line=5,...) } else if(las == 2) { title(xlab=xlab,line=5.5,...) title(ylab=ylab,line=5,...) } else if(las == 3) { title(ylab=ylab,...) title(xlab=xlab,line=5.5,...) } } # Extensions and uses of loess ############################################################################# # bugfix library(modreg) loess <- function(formula, data=NULL, weights, subset, na.action, model = FALSE, span = 0.75, enp.target, degree = 2, parametric = FALSE, drop.square = FALSE, normalize = TRUE, family = c("gaussian", "symmetric"), method = c("loess", "model.frame"), control = loess.control(...), ...) { mt <- terms(formula, data = data) mf <- match.call(expand.dots=FALSE) mf$model <- mf$span <- mf$enp.target <- mf$degree <- mf$parametric <- mf$drop.square <- mf$normalize <- mf$family <- mf$control <- mf$... <- NULL # minka: bugfix mf$method <- NULL mf[[1]] <- as.name("model.frame") mf <- eval(mf, parent.frame()) if (match.arg(method) == "model.frame") return(mf) na.act <- attr(mf, "na.action") y <- model.response(mf, "numeric") w <- model.weights(mf) if(is.null(w)) w <- rep(1, length(y)) nmx <- as.character(attr(mt, "variables"))[-(1:2)] x <- mf[, nmx, drop=FALSE] if(any(sapply(x, is.factor))) stop("predictors must all be numeric") x <- as.matrix(x) D <- ncol(x) nmx <- colnames(x) names(nmx) <- nmx drop.square <- match(nmx, nmx[drop.square], 0) > 0 parametric <- match(nmx, nmx[parametric], 0) > 0 if(!match(degree, 0:2, 0)) stop("degree must be 0, 1 or 2") iterations <- if(family=="gaussian") 1 else control$iterations if(!missing(enp.target)) if(!missing(span)) warning("both span and enp.target specified: span will be used") else { # White book p.321 tau <- switch(degree+1, 1, D+1, (D+1)*(D+2)/2) - sum(drop.square) span <- 1.2 * tau/enp.target } fit <- simpleLoess(y, x, w, span, degree, parametric, drop.square, normalize, control$statistics, control$surface, control$cell, iterations, control$trace.hat) fit$call <- match.call() fit$terms <- mt fit$xnames <- nmx fit$x <- x fit$y <- y fit$weights <- w if(model) fit$model <- mf if(!is.null(na.act)) fit$na.action <- na.act fit } # minka: robust by default formals(loess)$family = "symmetric" #formals(loess)$family = "gaussian" predict.loess <- function(object, newdata = NULL, se = FALSE, ...) { if(!inherits(object, "loess")) stop("First argument must be a loess object") if(is.null(newdata) & (se == FALSE)) return(fitted(object)) if(is.null(newdata)) newx <- object$x else { vars <- as.character(attr(delete.response(terms(object)), "variables"))[-1] if(length(vars) > 1 || NCOL(newdata) > 1) { if(any(!match(vars, colnames(newdata), FALSE))) # minka newdata = model.frame.default(delete.response(terms(object)),newdata) else newdata = newdata[,vars,drop=F] } newx = as.matrix(newdata) } res <- predLoess(object$y, object$x, newx, object$s, object$weights, object$pars$robust, object$pars$span, object$pars$degree, object$pars$normalize, object$pars$parametric, object$pars$drop.square, object$pars$surface, object$pars$cell, object$pars$family, object$kd, object$divisor, se=se) if(se) res$df <- object$one.delta^2/object$two.delta res } ############################################################################## smooth <- function(x,y) { library(modreg) if(length(unique(x)) >= 4) { fit = smooth.spline(x,y) predict(fit) } else { lowess(x,y) } } plot.loess <- function(object,xlim,col=2,lwd=2,res=200,asp=NULL,...) { x <- model.frame(object) print(x) pred <- predictor.vars(x) resp <- response.var(x) if(missing(xlim)) xlim <- range(x[[pred]]) xt <- seq(xlim[1],xlim[2],len=res) y <- predict(object,xt) real.asp = if(identical(asp,"auto")) auto.aspect(xt,y) else asp plot(x[[pred]],x[[resp]],xlab=pred,ylab=resp,asp=real.asp,...) lines(xt,y,col=col,lwd=lwd,...) } if(T) { smooth <- function(...) { library(modreg) expr <- match.call() expr[[1]] <- as.name("loess") object <- eval(expr, parent.frame()) #object = loess(...) class(object) = c("smooth","loess") object } print.smooth = print.loess plot.smooth = plot.loess predict.smooth = predict.loess } else { smooth <- function(fmla,data=parent.frame(),span=2/3,...) { given = given.vars(fmla) if(!is.null(given)) { fmla2 = remove.given(fmla) g = data[[given]] if(!is.factor(g)) stop("given must be a factor") children = list() for(lev in levels(g)) { children[[lev]] = smooth(fmla2,data[g == lev,],span,...) } object = list(model=data,terms=terms(fmla),given=given,children=children) class(object) = "multi.smooth" return(object) } x = model.frame(fmla,data) pred = predictor.vars(x)[1] resp = response.var(x) xy = lowess(x[[pred]],x[[resp]],f=span,...) names(xy$x) = rownames(x) names(xy$y) = rownames(x) res = x[[resp]] - xy$y orig.x = as.named.vector(data[pred]) object = c(xy,list(model=x,terms=terms(x),residuals=res,orig.x=orig.x)) class(object) = "smooth" object } print.multi.smooth <- function(object) { cat("Multi-smooth object: ") print(formula(object)) cat(object$given,"values:", names(object$children), "\n") #print(object$children) } print.smooth <- function(object) { cat("Smooth object: ") print(formula(object)) } predict.multi.smooth <- function(object,frame) { if(missing(frame)) frame = object$model g = frame[,object$given] r = named.numeric(nrow(frame),rownames(frame)) for(lev in levels(g)) { i = (g == lev) r[i] = predict.smooth(object$children[[lev]], frame[i,]) } r } predict.smooth <- function(object,x=object$orig.x) { if(is.data.frame(x)) x = x[[predictor.vars(object)[1]]] y = approx(object$x,object$y,x)$y if(is.null(names(y))) names(y) = names(x) y } } model.frame.smooth = model.frame.loess lowess2 <- function(x,y,f=2/3,res=200,...) { object = smooth(y~x,data.frame(x,y),span=f,...) xlim <- range(x) xo <- seq(xlim[1],xlim[2],len=res) yo <- predict(object,xo) list(x=xo,y=yo) } loprob <- function(x,y) { # simulates lowess for binary response # returns list(x,y) lev <- levels(y) if(length(lev) != 2) stop("y must have two levels") if(!is.factor(x)) { from <- min(x) to <- max(x) } if(T) { # smoothing spline library(modreg) y <- as.numeric(y)-1 r <- seq(from,to,length=50) if(length(unique(x)) >= 4) { fit = smooth.spline(x,y) p <- predict(fit,r)$y } else { fit <- loess(y~x,degree=0) p <- predict(fit,r) } list(x=r,y=p+1) } else if(F) { # density estimate - too bumpy densf <- function(x) { if(is.factor(x)) { list(x=levels(x),y=tabulate(x)/length(x)) } else { density(x,from=from,to=to,adjust=1.5) } } dens <- lapply(split(x,y),densf) p <- sum(y==levels(y)[2])/length(y) p <- p*dens[[2]]$y/((1-p)*dens[[1]]$y + p*dens[[2]]$y) list(x=dens[[1]]$x,y=p+1) } else if(F) { # hill density estimate - too noisy xr <- seq(from,to,length=50) densf <- function(x) hill.density(x,xr) dens <- lapply(split(x,y),densf) p <- dens[[2]]/(dens[[1]] + dens[[2]]) list(x=xr,y=p+1) } else { # gam smooth - too slow library(mgcv) y <- as.numeric(y)-1 fit <- gam(y~s(x),family=binomial) r <- seq(from,to,length=50) p <- predict(fit,data.frame(x=r),type="response") list(x=r,y=p+1) } } ############################################################################## # 3D plots # x,y,r are in user coordinates draw.circle <- function(x,y,r,res=64,...) { theta <- seq(0,2*pi,len=res) if(length(r) == 1) r <- rep(r,length(x)) xt <- cos(theta) yt <- sin(theta) for(i in 1:length(x)) { polygon(x[i]+xt*r[i],y[i]+yt*r[i],...) } } draw.vane <- function(x,y,a,r) { xlim <- par("usr")[1:2] ylim <- par("usr")[3:4] xd <- r*cos(a)*diff(xlim)/par("pin")[1] yd <- r*sin(a)*diff(ylim)/par("pin")[2] segments(x-xd/2,y-yd/2,x+xd/2,y+yd/2) } draw.brick <- function(x,y,w,h) { xlim <- par("usr")[1:2] ylim <- par("usr")[3:4] w <- w*diff(xlim)/par("pin")[1] h <- h*diff(ylim)/par("pin")[2] # these drawing directions are important to get completed corners segments(x-w/2,y+h/2,x+w/2,y+h/2) segments(x+w/2,y-h/2,x-w/2,y-h/2) segments(x-w/2,y-h/2,x-w/2,y+h/2) segments(x+w/2,y+h/2,x+w/2,y-h/2) } bubbles <- function(object, ...) UseMethod("bubbles") bubbles.formula <- function(formula,data=parent.frame(),...,xlab,ylab,zlab) { x <- model.frame.default(formula,data,na.action=na.pass) resp = response.var(x) pred = predictor.vars(x) if(missing(xlab)) xlab = pred[1] if(missing(ylab)) ylab = pred[2] if(missing(zlab)) zlab = resp bubbles.default(x[,pred[1]],x[,pred[2]],x[,resp], xlab=xlab,ylab=ylab,zlab=zlab,...) } bubbles.default <- function(x,y,z,data=parent.frame(), zlim,zex=1,xlab,ylab,zlab,...) { if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) if(missing(zlab)) zlab <- deparse(substitute(z)) if(is.null(data)) data <- parent.frame() x <- eval(substitute(x),data) y <- eval(substitute(y),data) z <- eval(substitute(z),data) opar <- par(mar=auto.mar(main=zlab,xlab=xlab,ylab=ylab)) on.exit(par(opar)) plot(x,y,type="n",xlab=xlab,ylab=ylab,main=zlab,...) if(!missing(zlim)) z <- z[(z >= zlim[1]) & (z <= zlim[2])] z = rank(z) z <- scale.range(z) # avoid zero-size bubbles z <- (z + 0.025)/1.025 if(T){ # encode z as area z <- sqrt(z) } z <- z*zex for(i in 1:length(x)) { points(x[i],y[i],pch=1,cex=z[i]*3,...) } } vanes <- function(x,y,z,data=parent.frame(),w=NULL, zlim,wlim,wex=0.1,xlab,ylab,zlab,...) { if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) if(missing(zlab)) zlab <- deparse(substitute(z)) x <- eval(substitute(x),data) y <- eval(substitute(y),data) z <- eval(substitute(z),data) opar <- par(mar=auto.mar(main=zlab,xlab=xlab,ylab=ylab)) on.exit(par(opar)) plot(x,y,type="n",xlab=xlab,ylab=ylab,main=zlab,...) if(!missing(zlim)) z <- z[(z >= zlim[1]) & (z <= zlim[2])] if(!missing(wlim)) w <- w[(w >= wlim[1]) & (w <= wlim[2])] # map z to (0,pi/2) z = rank(z) z <- scale.range(z)*pi/2 if(is.null(w)) w <- 1 if(length(w) == 1) w <- rep(w,length(z))*wex else w <- (w - min(w,na.rm=T))/diff(range(w,na.rm=T))*wex draw.vane(x,y,z,w) } bricks <- function(x,y,w,data=parent.frame(),h=NULL, wex=0.1,hex=0.1,xlab,ylab,wlab,...) { if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) if(missing(wlab)) wlab <- deparse(substitute(w)) x <- eval(substitute(x),data) y <- eval(substitute(y),data) w <- eval(substitute(w),data) opar <- par(mar=auto.mar(main=zlab,xlab=xlab,ylab=ylab)) on.exit(par(opar)) plot(x,y,type="n",xlab=xlab,ylab=ylab,main=wlab,...) w <- scale.range(w)*wex if(is.null(h)) h <- 0.5 if(length(h) == 1) h <- rep(h,length(x))*hex else h <- scale.range(h)*hex draw.brick(x,y,w,h) } plot3d <- function(object, ...) UseMethod("plot3d") plot3d.formula <- function(formula,data=parent.frame(),...,xlab,ylab,zlab) { x <- model.frame.default(formula,data,na.action=na.pass) resp = response.var(x) pred = predictor.vars(x) if(missing(xlab)) xlab = pred[1] if(missing(ylab)) ylab = pred[2] if(missing(zlab)) zlab = resp plot3d.default(x[,pred[1]],x[,pred[2]],x[,resp], xlab=xlab,ylab=ylab,zlab=zlab,...) } plot3d.default <- function(x,y,z,data=parent.frame(),xlab,ylab,zlab,angle=60,...) { if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) if(missing(zlab)) zlab <- deparse(substitute(z)) x <- eval(substitute(x),data) y <- eval(substitute(y),data) z <- eval(substitute(z),data) library(scatterplot3d) scatterplot3d(x,y,z,xlab=xlab,ylab=ylab,zlab=zlab,angle=angle,...) } # clip specifies a polygon in (x,y) over which the surface is to be defined. # possible values are F (no clipping), T (clip to the convex hull of the # data), a list(x,y), or a matrix with two columns. persp.loess <- function(object,res=20,theta=30,shade=0.65, xlim,ylim,zlim,clip=T,...) { if(length(res) == 1) res <- rep(res,2) x <- model.frame(object) resp <- response.var(object) pred <- predictor.vars(object) if(missing(xlim)) xlim <- range(x[[pred[1]]]) x1 <- seq(xlim[1],xlim[2],length=res[1]) if(missing(ylim)) ylim <- range(x[[pred[2]]]) x2 <- seq(ylim[1],ylim[2],length=res[2]) xt <- expand.grid(x1,x2) names(xt) <- pred[1:2] z <- predict(object,xt) if(!missing(zlim)) { z[z < zlim[1]] <- zlim[1] z[z > zlim[2]] <- zlim[2] } if(clip != F) { if(clip == T) { i <- chull(x[pred]) clip <- x[pred][i,] } z[!in.polygon(clip,xt)] <- NA } dim(z) <- c(length(x1),length(x2)) xlab <- pred[1] ylab <- pred[2] zlab <- resp opar <- par(mar=c(1,0,0,0)) on.exit(par(opar)) persp(x1,x2,z,xlab=xlab,ylab=ylab,zlab=zlab,theta=theta,shade=shade,...) } surface <- persp surface.loess <- persp.loess image.interp <- function(x,y,z,data=parent.frame(),res=100,clip,xlim,ylim, xlab,ylab,zlab,nlevels=128,breaks,col,bg, color.palette=YR.colors,...) { library(akima) if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) if(missing(zlab)) zlab <- deparse(substitute(z)) x <- eval(substitute(x),data) y <- eval(substitute(y),data) z <- eval(substitute(z),data) if(missing(xlim)) xlim <- range(x) xo <- seq(xlim[1],xlim[2],len=res) if(missing(ylim)) ylim <- range(y) yo <- seq(ylim[1],ylim[2],len=res) p <- interp(x,y,z,xo,yo) #p <- interp.new(x,y,z,xo,yo) if(!missing(clip) && any(clip != F)) { i <- !in.polygon(clip,expand.grid(xo,yo)) dim(i) <- c(length(xo),length(yo)) p$z[i] <- NA } if(missing(breaks)) breaks <- break.quantile(z,nlevels) if(missing(col)) col <- color.palette(length(breaks)-1) #p$z <- as.numeric(cut.quantile(p$z,5)) if(!missing(bg)) { opar <- par(bg=bg) on.exit(par(opar)) } image(p,xlab=xlab,ylab=ylab,main=zlab,breaks=breaks,col=col,...) } slices <- function(fmla,data=parent.frame(),nlevels=4, layout,xlab,ylab,zlab,xlim,ylim,...) { given = given.vars(fmla) if(is.null(given)) stop("formula must have conditioning (|)") fmla = remove.given(fmla) pred = predictor.vars(fmla)[1] resp = response.var(fmla) frame = model.frame.default(formula(paste(resp,"~",pred,"+",given)),data) x <- frame[,pred] y <- frame[,resp] z <- frame[,given] if(missing(xlab)) xlab <- pred if(missing(ylab)) ylab <- resp if(missing(zlab)) zlab <- given # because of unique, this is different from equal.count b <- break.quantile(z,nlevels) zf <- cut(z,b,include=T) if(missing(layout)) layout = auto.layout(nlevels) opar <- par(mfrow=layout) on.exit(par(opar)) if(missing(xlim)) xlim = range(x) if(missing(ylim)) ylim = range(y) for(lev in levels(zf)) { i <- (zf == lev) main = paste(zlab,"=",lev) # limits must be consistent predict.plot.default(x[i],y[i],rownames(frame)[i],xlab=xlab,ylab=ylab,main=main,xlim=xlim,ylim=ylim,...) } } slices2 <- function(x,y,z,data=parent.frame(),nlevels=4, digits=2,xlab,ylab,zlab,...) { if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) if(missing(zlab)) zlab <- deparse(substitute(z)) x <- eval(substitute(x),data) y <- eval(substitute(y),data) z <- eval(substitute(z),data) opar <- par(mar=c(0,0,0,0)) # lattice changes the mar library(lattice) par(opar) trellis.par.set("add.line",list(col=2,lwd=2)) #trellis.settings$background$col <- 0 #trellis.settings$plot.symbol$col <- 1 df <- data.frame(x,y,z) df$z <- equal.count(z,nlevels,overlap=0) nam <- sapply(c(xlab,ylab,zlab),make.names) names(df) <- nam fmla <- formula(paste(nam[2],"~",nam[1],"|",nam[3])) xyplot(fmla, data=df, panel = function(x, y) { panel.grid(v=2); panel.xyplot(x, y); panel.loess(x,y) }) } # extra routines for using regression trees # requires: tree package, cluster.r # Tom Minka 10/26/01 # plot tree partitions along with data colored by response color.plot.tree <- function(tr,data,col=3,lwd=2,add=F,...) { if(missing(data)) data <- model.frame(tr) resp <- response.var(tr) # reduce to the variables used in tr pred <- levels(factor(tr$frame$var[tr$frame$var != ""])) fmla <- formula(paste(resp,"~",paste(pred,collapse="+"))) data <- model.frame(fmla,data) if(!add) color.plot.data.frame(data,...) # use the par options set by color.plot.data.frame partition.tree(tr,add=T,ordvars=pred,col=col,lwd=lwd,...) } # returns the parent index of every node parents.tree <- function(tr) { i <- as.numeric(rownames(tr$frame)) match(i %/% 2, i) } is.left.child <- function(tr,i) { if(missing(i)) i <- 1:nrow(tr$frame) if(is.numeric(i)) i <- rownames(tr$frame)[i] (as.numeric(i) %% 2 == 0) } # converts tr$where into a factor of node names factor.tree <- function(tr) { f <- factor(rownames(tr$frame)[tr$where]) names(f) <- names(tr$where) f } # returns a vector of node indices under node # the input is a node name, but output is node number descendants.tree <- function(tr,node) { p <- parents.tree(tr) node <- match(node,rownames(tr$frame)) i <- 1:length(p) r <- c() repeat { r <- c(r,which(i==node)) i <- p[i] if(all(is.na(i) | (i 1) { indent <- substring(indent, 1, spaces * seq(depth)) indent <- paste(c("", indent[depth]), format(node), ")", sep = "") } else indent <- paste(format(node), ")", sep = "") if(is.prob) { yval <- paste(as.character(frame$yval), " (", sep = "") yprob <- format(frame$yprob, digits = digits) for(i in seq(ylevels)) yval <- paste(yval, yprob[, i]) yval <- paste(yval, ")") } else yval <- format(signif(frame$yval, digits = digits)) term <- rep(" ", length(depth)) term[frame$var == ""] <- "*" z <- labels.tree(x, pretty = pretty) # minka: omit deviance z <- paste(indent, z, format(round(frame$n,2)), yval, term) cat(z, sep = "\n") invisible(x) } # partition.tree must require cont predictors because split sets on a # factor can vary # minka: added digits as argument partition.tree <- function(tree, label = "yval", add = FALSE, ordvars, digits=2, force = T, main=NULL, ...) { ptXlines <- function(x, v, xrange, xcoord = NULL, ycoord = NULL, tvar, i = 1) { if(v[i] == "") { y1 <- (xrange[1] + xrange[3])/2 y2 <- (xrange[2] + xrange[4])/2 return(xcoord, ycoord = c(ycoord, y1, y2), i = i) } if(v[i] == tvar[1]) { xcoord <- c(xcoord, x[i], xrange[2], x[i], xrange[4]) xr <- xrange xr[3] <- x[i] ll2 <- Recall(x, v, xr, xcoord, ycoord, tvar, i + 1) xr <- xrange xr[1] <- x[i] return(Recall(x, v, xr, ll2$xcoord, ll2$ycoord, tvar, ll2$i + 1)) } else if(v[i] == tvar[2]) { xcoord <- c(xcoord, xrange[1], x[i], xrange[3], x[i]) xr <- xrange xr[4] <- x[i] ll2 <- Recall(x, v, xr, xcoord, ycoord, tvar, i + 1) xr <- xrange xr[2] <- x[i] return(Recall(x, v, xr, ll2$xcoord, ll2$ycoord, tvar, ll2$i + 1)) } else stop("Wrong variable numbers in tree.") } if(inherits(tree, "singlenode")) { warning("Singlenode tree has no partitions") return(invisible(NULL)) } if(!inherits(tree, "tree")) stop("Not legitimate tree") frame <- tree$frame leaves <- frame$var == "" var <- unique(as.character(frame$var[!leaves])) if(length(var) > 2 || length(var) < 1) stop("Tree can only have one or two predictors") nlevels <- sapply(xlevels <- attr(tree, "xlevels"), length) if(any(nlevels[var] > 0)) stop("Tree can only have continuous predictors") x <- rep(NA, length(leaves)) x[!leaves] <- as.double(substring(frame$splits[!leaves, "cutleft"], 2, 100)) m <- model.frame(tree) # minka: treat 1 var as if var 2 was yvar if(force && (length(var) == 1)) { var[2] <- response.var(tree) } if(length(var) == 1) { ## one x variable x <- sort(c(range(m[[var]]), x[!leaves])) if(is.null(attr(tree, "ylevels"))) y <- frame$yval[leaves] else y <- frame$yprob[leaves, 2] + 1 # minka: must extend y to match x y <- c(y,y[length(y)]) if(add) lines(x, y, type = "s", ...) else { # minka: use response.var resp <- response.var(m) xo <- m[[resp]] if(is.factor(xo)) ylim <- c(0,1) else ylim <- range(xo) plot(x, y, ylab = resp, xlab = var, type = "s", ylim = ylim, xaxs = "i", ...) } invisible(list(x = x, y = y)) } else { ## two x variables if(!missing(ordvars)) { ind <- match(var, ordvars) if(any(is.na(ind))) stop("unmatched names in vars") var <- ordvars[sort(ind)] } lab <- frame$yval[leaves] resp = response.var(tree) if(is.null(frame$yprob)) { lab <- format(signif(lab, digits)) } else if(match(label, attr(tree, "ylevels"), nomatch = 0)) { lab <- format(signif(frame$yprob[leaves, label], digits)) # minka: useful title if(is.null(main)) main = paste("Pr(",resp," = ",label,")",sep="") } # minka: useful title if(is.null(main)) main = resp rx <- range(m[[var[1]]]) rx <- rx + c(-0.025, 0.025) * diff(rx) rz <- range(as.numeric(m[[var[2]]])) rz <- rz + c(-0.025, 0.025) * diff(rz) xrange <- c(rx, rz)[c(1, 3, 2, 4)] xcoord <- NULL # x1lo, x2lo, x1hi, x2hi ycoord <- NULL # y1, y2 xy <- ptXlines(x, frame$var, xrange, xcoord, ycoord, var) xx <- matrix(xy$xcoord, nrow = 4) yy <- matrix(xy$ycoord, nrow = 2) if(!add) plot(rx, rz, xlab = var[1], ylab = var[2], type = "n", xaxs = "i", yaxs = "i", main = main, ...) # minka: use ... segments(xx[1, ], xx[2, ], xx[3, ], xx[4, ], ...) text(yy[1, ], yy[2, ], as.character(lab), ...) } } # minka: added newdata,rate options misclass.tree <- function(tree, newdata, rate = F, detail = FALSE) { if(!inherits(tree, "tree")) stop("Not legitimate tree") if(is.null(attr(tree, "ylevels"))) stop("Misclassification error rate is appropriate for factor responses only") if(!missing(newdata)) { resp <- response.var(tree) r = sum(predict.tree(tree, newdata, type="class") != newdata[[resp]]) return(if(rate) r/nrow(newdata) else r) } if(is.null(y <- tree$y)) y <- model.extract(model.frame(tree), "response") if(is.null(wts <- tree$weights)) wts <- model.weights(model.frame(tree)) if(is.null(wts)) wts <- rep(1, length(y)) frame <- tree$frame if(detail) { which <- descendants(as.numeric(row.names(frame))) tmp <- as.vector((which[, tree$where] * outer(frame$yval, y, "!=")) %*% wts) names(tmp) <- row.names(tree$frame) r = tmp } else r = sum(wts*(frame$yval[tree$where] != y)) if(rate) r/length(y) else r } # minka: added newdata option deviance.tree <- function(object, newdata, detail = FALSE, ...) { if(!inherits(object, "tree")) stop("Not legitimate tree") if(!missing(newdata)) { yvar <- response.var(object) truth <- as.numeric(newdata[[yvar]]) p <- predict.tree(object, newdata) # correction used in prune.tree p[p == 0] <- 1e-3 i <- (1:nrow(p)) + (truth-1)*nrow(p) return(-2*sum(log(p[i]))) } frame <- object$frame if(detail) frame$dev else sum(frame$dev[frame$var == ""]) } cv.tree <- function(object, rand, FUN = prune.tree, K = 10, ...) { if(!inherits(object, "tree")) stop("Not legitimate tree") m <- model.frame(object) extras <- match.call(expand.dots = FALSE)$... FUN <- deparse(substitute(FUN)) init <- do.call(FUN, c(list(object), extras)) if(missing(rand)) rand <- sample(K, length(m[[1]]), replace = TRUE) cvdev <- 0 for(i in unique(rand)) { # minka: must build a tree as big as original tlearn <- tree(model = m[rand != i, , drop = FALSE], mindev=0) plearn <- do.call(FUN, c(list(tlearn, newdata = m[rand ==i, , drop = FALSE], k = init$k), extras)) cvdev <- cvdev + plearn$dev } init$dev <- cvdev init } model.frame.tree <- function(formula, ...) { m <- formula$model if(!is.null(m)) return(m) oc <- formula$call if(substring(deparse(oc[[1]]), 1, 7) == "predict") { m <- eval(oc$newdata) if(is.null(attr(m, "terms"))) { object <- eval(oc$object) m <- model.frame(object$terms, m, na.pass) } return(m) } while(deparse(oc[[1]]) != "tree") oc <- eval(oc[[2]])$call oc$subset <- names(formula$where) oc$method <- "model.frame" # minka: use the environment of terms, not the current env env = environment(formula$terms) if (is.null(env)) env<-parent.frame() eval(oc,env) } # bugfix for ts library library(ts,warn=F) as.data.frame.ts <- function(x,optional) { nam <- deparse(substitute(x)) df <- as.data.frame.vector(as.vector(x)) names(df) <- nam cbind(data.frame(time=as.vector(time(x))),df) } ccf <- function(x, y, lag.max = NULL, type = c("correlation", "covariance"), plot = TRUE, na.action = na.fail, ylab, ...) { type <- match.arg(type) if(is.matrix(x) || is.matrix(y)) stop("univariate time series only") # minka: added as.ts X <- na.action(ts.union(as.ts(x), as.ts(y))) colnames(X) <- c(deparse(substitute(x)), deparse(substitute(y))) acf.out <- acf(X, lag.max = lag.max, plot = FALSE, type = type) lag <- c(rev(acf.out$lag[-1,2,1]), acf.out$lag[,1,2]) y <- c(rev(acf.out$acf[-1,2,1]), acf.out$acf[,1,2]) acf.out$acf <- array(y, dim=c(length(y),1,1)) acf.out$lag <- array(lag, dim=c(length(y),1,1)) acf.out$snames <- paste(acf.out$snames, collapse = " & ") if (plot) { # minka if(missing(ylab)) ylab = paste("Cross",type,sep="-") plot.acf(acf.out, ylab=ylab, ...) return(invisible(acf.out)) } else return(acf.out) } formals(acf)$na.action = na.pass formals(ccf)$na.action = na.pass ############################################################################## growth.chart <- function(y,ref=1,ylab,...) { pct.change = function(x,r) (x-r)/r*100 abs.change = function(x,r) (x-r) y = t(y) if(is.character(ref)) ref = pmatch(ref,rownames(y)) nam = names(dimnames(y)) if(missing(ylab)) ylab = paste("% change since",nam[1],rownames(y)[ref]) if(is.data.frame(y)) y = as.matrix(y) a = attributes(y) r = apply(y,2,function(x) pct.change(x,x[ref])) attributes(r) = a linechart(t(r),ylab=ylab,...) } period.grid <- function(start,p,lty="dotted",labels=T,cex=0.6,col="blue",...) { v = seq(start,par("usr")[2],by=p) abline(v=v,lty=lty,...) if(labels) { i = (v-start)/p + 1 h = par("usr")[4] - yinch(0.03) n = length(v) # place text in center of each interval, except for last one x = c((v[1:(n-1)]+v[2:n])/2, v[n]+xinch(0.03)) text(x,h,i,adj=c(0,1),cex=cex,...) } } format.pretty <- function(x,digits=2) { prettyNum(formatC(x,format="fg",digits=digits),big.mark=",") } cycle.matrix <- function(x,p,extended=F) { # x is a vector # p is cycle length n = length(x) inc = p/ceiling(p) if(ceiling(p) != p) { i = seq(1,n,by=inc) # linear interpolation x = approx(1:n,x,i)$y n = length(x) p = ceiling(p) } d = c(p,ceiling(n/p)) m = array(x[1:prod(d)],d) rownames(m) = format.pretty(seq(1,nrow(m)*inc,len=nrow(m))) colnames(m) = 1:ncol(m) if(extended) { m2 = cbind(m[,2:ncol(m)],NA) m = rbind(m,m2) } names(dimnames(m)) = c("Time","Cycle") m = as.table(m) t(m) } plot.stack <- function(x,y,n=2,...) { xyplot(y ~ x | equal.count(x, n=n, overlap=0), panel = function(x, y) { panel.xyplot(x, y, type="l") }, strip = F, layout = c(1,n), scale = list(x = "free")) } plot.stack <- function(x,y,n=2,ylab="",asp=NULL,...) { real.asp = if(identical(asp,"auto")) auto.aspect(x,y) else asp opar <- par(mfrow=c(n,1),mar=auto.mar(xlab="",ylab=ylab)) on.exit(par(opar)) #b <- equal.count(x,num=n,overlap=0)$intervals b = break.equal(x,n) b = cbind(lag(b),b)[2:(n+1),] for(j in seq(nrow(b))) { i <- (x >= b[j,1]) & (x <= b[j,2]) plot(x[i],y[i],ylab=ylab,asp=real.asp,...) grid(nx=0) } } spiral <- function(object, ...) UseMethod("spiral") spiral.formula <- function(fmla, data=parent.frame(), ..., main) { x = model.frame.default(fmla,data,na.action=na.pass) resp = response.var(x) pred = predictor.vars(x)[1] if(missing(main)) main = resp spiral.default(x[,pred],x[,resp],...,main=main) } spiral.default <- function(x,y,p,offset=0,color.palette=YR.colors(64), clockwise=T, pch.palette=20,bg=gray(0.5),cex=par("cex"), cex.label=0.8,label=T,interp=T,main=NULL,...) { # time flows clockwise by default, unlike other spiral plots dir = if(clockwise) -1 else 1 theta = (x - min(x,na.rm=T) + offset)/p indent = floor(max(theta,na.rm=T)*0.25) theta = theta + indent opar = par(bg=bg,mar=auto.mar(main=main,xlab="",ylab="",xaxt="n",yaxt="n")) on.exit(par(opar)) r = range(theta) # sample uniformly in circumference res = min(par("pin"))/inch.symbol(pch=pch.palette[1],cex=cex)*2.5 theta.g = sqrt(seq(r[1]^2,r[2]^2,len=res*diff(r))) r = theta.g xr = r*cos(2*pi*theta.g*dir) yr = r*sin(2*pi*theta.g*dir) plot(xr,yr,asp=1,type="l",lwd=0.5,col="gray",xaxt="n",yaxt="n",main=main,...) if(label) { spiral.text(theta.g[1],format(x[1]),"inside",clockwise,cex=cex.label) n = length(xr) spiral.text(theta.g[n],format(x[length(x)]),"outside",clockwise,cex=cex.label) } if(interp) { y.g = approx(theta,y,theta.g)$y theta = theta.g y = y.g } nlev = length(color.palette) if(nlev > 1) { col = color.palette[as.numeric(cut.quantile(y,nlev))] pch = pch.palette[1] } else { col = color.palette[1] pch = pch.palette[as.numeric(cut.quantile(y,nlev))] } r = theta xr = r*cos(2*pi*theta*dir) yr = r*sin(2*pi*theta*dir) points(xr,yr,col=col,pch=pch,cex=cex,...) } spiral.text <- function(theta,label,type=c("inside","outside"),clockwise=T,...) { # place text on the spiral f <- function(theta) { theta = theta %% 1 if(theta <= 1/4) theta*4 else if(theta <= 1/2) 1 else if(theta <= 3/4) (3/4 - theta)*4 else 0 } adj.inside <- function(theta) { c(1-f(theta),1-f(theta+1/4)) } adj.outside <- function(theta) { c(f(theta), f(theta+1/4)) } type = match.arg(type) adj = if(type == "inside") adj.inside(theta) else adj.outside(theta) if(!clockwise) adj[2] = 1-adj[2] dir = if(clockwise) -1 else 1 spc = xinch(inch.symbol()) spc = spc*(0.5-adj) r = theta xr = r*cos(2*pi*theta*dir) yr = r*sin(2*pi*theta*dir) text(xr+spc[1], yr+spc[2], label, adj=adj, ...) } # Functions for generating VRML scenes. # The *.wrl functions are internal, not meant to be called from outside. truename <- function(f) { if((substring(f,2,2) == ":") || (substring(f,1,1) == "\\")) return(f) paste(getwd(),f,sep="\\") } vrml.new.name <- function() { assign("vrml.counter",vrml.counter + 1,globalenv()) paste("obj",vrml.counter,sep="") } vrml.color <- function(col) { col2rgb(col)[,1]/255 } vrml.material <- list(diffuseColor=c(1,1,1), emissiveColor=c(0,0,0), specularColor=c(0,0,0), transparency=0,shininess=0.2, ambientIntensity=0.2) vrml.material.wrl <- function(color=vrml.material$diffuseColor, diffuseColor=color, emissiveColor=vrml.material$emissiveColor, transparency=vrml.material$transparency, specularColor=vrml.material$specularColor, shininess=vrml.material$shininess, ambientIntensity=vrml.material$ambientIntensity, ...,file=vrml.file) { cat("Appearance { material Material {",file=file) if(any(emissiveColor != 0)) cat(" emissiveColor",emissiveColor,file=file) if(any(specularColor != 0)) cat(" specularColor",specularColor,file=file) if(any(diffuseColor != 0.8)) cat(" diffuseColor",diffuseColor,file=file) if(transparency != 0) cat(" transparency",transparency,file=file) if(ambientIntensity != 0.2) cat(" ambientIntensity",ambientIntensity,file=file) # shininess does nothing without specularColor if(shininess != 0.2) cat(" shininess",shininess,file=file) cat(" } } ",file=file) } vrml.set.scale <- function(xlim,ylim,zlim,scale=c(1,1,1)) { # sets the globals vrml.offset and vrml.scale, # each vectors of 3 numbers s = scale/c(diff(range(xlim)),diff(range(ylim)),diff(range(zlim))) offset = -c(xlim[1],ylim[1],zlim[1]) assign("vrml.scale",s,globalenv()) assign("vrml.offset",offset,globalenv()) assign("vrml.symbol.radius",0.01,globalenv()) } vrml.to.scale <- function(p) { # p is matrix of three columns if(ncol(p) != 3) stop("p must be a matrix of 3 columns") p = p + rep.row(vrml.offset,nrow(p)) scale.cols(p, vrml.scale) } # http://www.cs.cmu.edu/afs/cs/academic/class/16741-s02/www/lecture6.pdf # http://www.cs.cmu.edu/afs/cs/academic/class/16741-s02/www/lecture7.pdf vrml.quaternion.rotation <- function(rot) { c(cos(rot[4]/2),sin(rot[4]/2)*rot[1:3]) } vrml.rotation.quaternion <- function(q) { qv = q[2:4] if(all(qv == 0)) c(0,1,0,0) else c(qv/norm(qv), 2*atan2(norm(qv),q[1])) } vrml.compose.quaternion <- function(q1,q2) { q1v = q1[2:4] q2v = q2[2:4] c(q1[1]*q2[1] - sum(q1v*q2v),q1[1]*q2v+q2[1]*q1v+vec.cross(q1v,q2v)) } vrml.compose.rotation <- function(rot1,rot2) { q1 = vrml.quaternion.rotation(rot1) q2 = vrml.quaternion.rotation(rot2) q = vrml.compose.quaternion(q1,q2) vrml.rotation.quaternion(q) } vec.cross <- function(p1,p2) { # cross-product of vectors # http://mathworld.wolfram.com/CrossProduct.html cbind(p1[2]*p2[3] - p1[3]*p2[2], p1[3]*p2[1] - p1[1]*p2[3], p1[1]*p2[2] - p1[2]*p2[1]) } vec.angle <- function(p1,p2) { # angle between two vectors acos(sum(p1*p2)/sqrt(sum(p1*p1)*sum(p2*p2))) } vrml.rotate <- function(p,rot) { angle = rot[4] v = rot[1:3] vp = vec.cross(v,p) vvp = vec.cross(v,vp) p + sin(angle)*vp + (1-cos(angle))*vvp } vrml.rotation.between <- function(p1,p2) { # returns a four-element vector specifying an axis-angle rotation # that takes p1 to p2 if(!is.null(dim(p1)) && nrow(p1) > 1) { rot = vrml.rotation.between(p1[1,,drop=F],p2[1,,drop=F]) p1[2,] = vrml.rotate(p1[2,],rot) rot2 = vrml.rotation.between(p1[2,,drop=F],p2[2,,drop=F]) vrml.compose.rotation(rot2,rot) } else { if(all(p1 == 0) || all(p2 == 0)) stop("impossible rotation (to/from the origin)") axis = vec.cross(p1,p2) if(sum(axis*axis) == 0) axis = cbind(0,1,0) c(axis,vec.angle(p1,p2)) } } ############################################################################# vrml.viewpoint.wrl <- function(pos,lookat,file=vrml.file) { default.orient = cbind(0,0,-1) rot = vrml.rotation.between(default.orient,lookat-pos) cat("Viewpoint { position",pos,"orientation",rot,"}\n",file=file) } vrml.file = "" vrml.open <- function(xlim,ylim,zlim,scale,light=F,...,file.name=NULL) { if(is.null(file.name)) file.name = paste(Sys.getenv("TMP"),"Rplot",sep="\\") if(substring(file.name,nchar(file.name)-3,nchar(file.name)) != ".wrl") { file.name = paste(file.name,"wrl",sep=".") } if(nchar(file.name) > 4) { con = file(file.name,"w") } else { con = "" } assign("vrml.file",con,globalenv()) assign("vrml.file.name",file.name,globalenv()) assign("vrml.counter",0,globalenv()) cat("#VRML V2.0 utf8\n",file=vrml.file) #cat("WorldInfo { title \"",match.call()[[1]],"\" }\n",sep="",file=vrml.file) cat("NavigationInfo { type \"EXAMINE\" speed 10 ",file=vrml.file) if(light) cat("headlight FALSE ",file=vrml.file) cat("}\n",file=vrml.file) # white background is easier on the eyes cat("Background { skyColor [1 1 1] }\n",file=vrml.file) cat("DEF black Appearance { material Material {} }\n",file=vrml.file) # camera looks at the center of the back wall #p = cbind(0.5,-1.25,1.1)*scale #lookat = cbind(0.5,1,0.1) p = cbind(0.5,-1.5,0.5)*scale lookat = cbind(0.5,1,0.5) vrml.viewpoint.wrl(p,lookat) vrml.set.scale(xlim,ylim,zlim,scale) } vrml.close <- function() { if(inherits(vrml.file,"connection")) { close(vrml.file) cat("wrote to",vrml.file.name,"\n") shell.exec(truename(vrml.file.name)) } } # baseline specifies the plane in which text is drawn vrml.baseline = rbind(c(1,0,0),c(0,1,0)) vrml.text.wrl <- function(p,txt,adj=c(0,0), baseline=vrml.baseline,cex=1,billboard=F, file=vrml.file) { # if only one vector given, use the default height vector if(nrow(baseline) == 1) baseline = rbind(baseline,vrml.baseline[2,]) rot = vrml.rotation.between(vrml.baseline,baseline) s = cex*vrml.symbol.radius if(adj[1] > 0) { w = strwidth(txt,units="i")*5 p = p - baseline[1,,drop=F]*w*adj[1]*s } if(adj[2] > 0) { h = strheight(txt,units="i")*6 p = p - baseline[2,,drop=F]*h*adj[2]*s } cat("Transform { translation",p[1,],"\n", "rotation",rot,"\n", "scale",rep(s,3),"\n", "children ",file=file) if(billboard) cat("Billboard { children ",file=file) txt = paste("\"",txt,"\"",sep="") cat("Shape { appearance USE black geometry Text { string",txt,"} }",file=file) if(billboard) cat(" } ",file=file) cat("}\n",file=file) } vrml.sphere <- function(x,y,z,cex=1,col=1,...,file=vrml.file) { p = vrml.to.scale(cbind(x,y,z)) r = cex*vrml.symbol.radius vrml.sphere.wrl(p,r,color=vrml.color(col),...,file=file) } vrml.sphere.wrl <- function(p,r,..., file=vrml.file) { if(length(r) == 1) r = rep(r,nrow(p)) for(i in 1:nrow(p)) { cat("Transform { translation",p[i,],"\n", "scale",rep(r[i],3),"\n", "children Shape { ",file=file) if(i == 1) { the.appearance = vrml.new.name() cat("appearance DEF",the.appearance,"",file=file) vrml.material.wrl(...,file=file) the.shape = vrml.new.name() cat("geometry DEF",the.shape,"Sphere {} ",file=file) } else { cat("appearance USE",the.appearance,"",file=file) cat("geometry USE",the.shape,"",file=file) } cat("}\n}\n",file=file) } } vrml.cube <- function(x,y,z,s=c(1,1,1),cex=1,col=1,...,file=vrml.file) { p = vrml.to.scale(cbind(x,y,z)) s = s*vrml.symbol.radius*cex vrml.cube.wrl(p,s,color=vrml.color(col),...,file=file) } vrml.cube.wrl <- function(p,s=c(1,1,1),..., file=vrml.file) { if(length(s) == 1) s = rep(s,3) p = na.omit(p) omitted = attr(p,"na.action") if(is.null(dim(s)) || nrow(s)==1) s = rep.row(s,nrow(p)) else if(!is.null(omitted)) s = s[omitted,] for(i in 1:nrow(p)) { cat("Transform { translation",p[i,],"\n", "scale",s[i,],"\n", "children Shape { ",file=file) if(i == 1) { the.appearance = vrml.new.name() cat("appearance DEF",the.appearance,"",file=file) vrml.material.wrl(...,file=file) the.shape = vrml.new.name() cat("geometry DEF",the.shape,"Box {} ",file=file) } else { cat("appearance USE",the.appearance,"",file=file) cat("geometry USE",the.shape,"",file=file) } cat("}\n}\n",file=file) } } vrml.cylinder.wrl <- function(p1,p2,r=vrml.symbol.radius,file=vrml.file) { midp = (p1+p2)/2 dp = p2-p1 h = sqrt(sum(dp*dp)) rot = vrml.rotation.between(cbind(0,1,0),dp) cat("Transform { translation",midp[1,],"\n", "rotation",rot,"\n", "children Shape { geometry Cylinder { radius",r,"height",h,"} }\n", "}\n", file=file) } vrml.axes <- function(xlim,ylim,zlim,xlab=NULL,ylab=NULL,zlab=NULL,cex=12,..., file=vrml.file) { # origin po = vrml.to.scale(cbind(xlim[1],ylim[1],zlim[1])) vrml.cube.wrl(po,vrml.symbol.radius,...,file=file) px = vrml.to.scale(cbind(xlim[2],ylim[1],zlim[1])) py = vrml.to.scale(cbind(xlim[1],ylim[2],zlim[1])) pz = vrml.to.scale(cbind(xlim[1],ylim[1],zlim[2])) if(F) { # cylinders for axes vrml.cylinder.wrl(po,px,file=file) vrml.cylinder.wrl(po,py,file=file) vrml.cylinder.wrl(po,pz,file=file) } else { vrml.line.wrl(po,px,file=file) vrml.line.wrl(po,py,file=file) vrml.line.wrl(po,pz,file=file) } # axis names mar = vrml.symbol.radius if(!is.null(xlab)) { vrml.text.wrl(px/2-pz*mar,xlab,cex=cex,adj=c(0.5,1),base=rbind(px,pz),file=file) #vrml.text.wrl(px/2-py*mar+pz,xlab,cex=cex,adj=c(0.5,1),base=px,file=file) #vrml.text.wrl(px/2-py*mar,xlab,cex=cex,adj=c(0.5,1),base=-px,file=file) } if(!is.null(ylab)) { vrml.text.wrl(py/2-pz*mar,ylab,cex=cex,adj=c(0.5,1),base=rbind(py,pz),file=file) #vrml.text.wrl(py/2-px*mar+pz,ylab,cex=cex,adj=c(0.5,1),base=py,file=file) #vrml.text.wrl(py/2-px*mar,ylab,cex=cex,adj=c(0.5,1),base=-py,file=file) } if(!is.null(zlab)) { vrml.text.wrl(pz/2-px*mar,zlab,cex=cex,adj=c(0.5,0),base=rbind(pz,-px),file=file) #vrml.text.wrl(pz/2-py*mar+px,zlab,cex=cex,adj=c(0.5,1),base=-pz,file=file) #vrml.text.wrl(pz/2-py*mar,zlab,cex=cex,adj=c(0.5,1),base=pz,file=file) } } vrml.line.wrl <- function(p1,p2,file=vrml.file) { cat("Shape { appearance USE black geometry IndexedLineSet {\n", "coord Coordinate {\n", "point [",t(rbind(p1,p2)),"]\n }\n", "coordIndex [",c(0,1,-1),"]\n}}\n", file=file) } vrml.box.wrl <- function(p,file=vrml.file) { points = array(0,c(8,3)) g = data.matrix(expand.grid(1:2,1:2,1:2)) for(i in 1:nrow(g)) { for(j in 1:3) points[i,j] = p[g[i,j],j] } index = c(0,1,5,5,4,0,-1,2,3,7,6,2,-1, 0,2,-1,1,3,-1,5,7,-1,4,6,-1) cat("Shape { appearance USE black geometry IndexedLineSet {\n", "coord Coordinate {\n", "point [",t(points),"]\n }\n", "coordIndex [",index,"]\n}}\n", file=file) } vrml.box <- function(xlim,ylim,zlim,file=vrml.file) { vrml.box.wrl(vrml.to.scale(cbind(xlim,ylim,zlim)),file=file) } vrml.plot3d <- function(object, ...) UseMethod("vrml.plot3d") vrml.plot3d.formula <- function(formula,data=parent.frame(),...,xlab,ylab,zlab) { x <- model.frame.default(formula,data,na.action=na.pass) resp = response.var(x) pred = predictor.vars(x) if(missing(xlab)) xlab = pred[1] if(missing(ylab)) ylab = pred[2] if(missing(zlab)) zlab = resp vrml.plot3d.default(x[,pred[1]],x[,pred[2]],x[,resp], xlab=xlab,ylab=ylab,zlab=zlab,...) } vrml.plot3d.default <- function(x,y,z,data=parent.frame(),xlab,ylab,zlab, pch=2,col=3,scale=c(1,1,1),file.name=NULL, cex=1,cex.axis=12,light=T,...) { if(missing(xlab)) xlab <- deparse(substitute(x)) if(missing(ylab)) ylab <- deparse(substitute(y)) if(missing(zlab)) zlab <- deparse(substitute(z)) x <- eval(substitute(x),data) y <- eval(substitute(y),data) z <- eval(substitute(z),data) xlim = range(x,na.rm=T); ylim = range(y,na.rm=T); zlim = range(z,na.rm=T) xlim = extend.pct(xlim); ylim = extend.pct(ylim); zlim = extend.pct(zlim) vrml.open(xlim,ylim,zlim,scale,light,file.name=file.name,...) if(light) { cat("PointLight { ambientIntensity 0.5 intensity 0.25 location 0 0 1 }\n", file=vrml.file) cat("PointLight { intensity 0.25 location 1 1 1 }\n", file=vrml.file) cat("PointLight { intensity 0.25 location 0 1 1 }\n", file=vrml.file) cat("PointLight { intensity 0.25 location 1 0 1 }\n", file=vrml.file) } cat("Group { children [\n",file=vrml.file) vrml.box(xlim,ylim,zlim) vrml.axes(xlim,ylim,zlim,xlab,ylab,zlab,cex=cex.axis,...) cat("]}\n",file=vrml.file) cat("Group { children [\n",file=vrml.file) if(pch==1) { vrml.sphere(x,y,z,col=col,cex=cex,...) } else { vrml.cube(x,y,z,col=col,cex=cex,...) } cat("]}\n",file=vrml.file) vrml.close() } vrml.surface <- function(object,...) UseMethod("vrml.surface") vrml.surface.loess <- function(object,res=20, xlim,ylim,zlim,clip=T,xlab,ylab,zlab,...) { if(length(res) == 1) res <- rep(res,2) x <- model.frame(object) resp <- response.var(object) pred <- predictor.vars(object) if(missing(xlim)) xlim <- range(x[[pred[1]]],na.rm=T) x1 <- seq(xlim[1],xlim[2],length=res[1]) if(missing(ylim)) ylim <- range(x[[pred[2]]],na.rm=T) x2 <- seq(ylim[1],ylim[2],length=res[2]) xt <- expand.grid(x1,x2) names(xt) <- pred[1:2] z <- predict(object,xt) if(!missing(zlim)) { z[z < zlim[1]] <- zlim[1] z[z > zlim[2]] <- zlim[2] } if(clip != F) { if(clip == T) { i <- chull(x[pred]) clip <- x[pred][i,] } z[!in.polygon(clip,xt)] <- NA } dim(z) <- c(length(x1),length(x2)) if(missing(xlab)) xlab <- pred[1] if(missing(ylab)) ylab <- pred[2] if(missing(zlab)) zlab <- resp vrml.surface.default(x1,x2,z,xlab=xlab,ylab=ylab,zlab=zlab,...) } vrml.surface.default <- function(x,y,z,xlab=NULL,ylab=NULL,zlab=NULL, col="gray",scale=c(1,1,1),file.name=NULL, cex=1,cex.axis=12,light=F,...) { if(is.null(xlab)) xlab <- deparse(substitute(x)) if(is.null(ylab)) ylab <- deparse(substitute(y)) if(is.null(zlab)) zlab <- deparse(substitute(z)) #x <- eval(substitute(x),data) #y <- eval(substitute(y),data) #z <- eval(substitute(z),data) xlim = range(x,na.rm=T); ylim = range(y,na.rm=T); zlim = range(z,na.rm=T) xlim = extend.pct(xlim); ylim = extend.pct(ylim); zlim = extend.pct(zlim) vrml.open(xlim,ylim,zlim,scale,light,file.name=file.name,...) if(light) { # dir 1 1 -1 cat("DirectionalLight { ambientIntensity 0 intensity 1", "direction 1 1 -0.1 }\n",file=vrml.file) } cat("Group { children [\n",file=vrml.file) vrml.box(xlim,ylim,zlim) vrml.axes(xlim,ylim,zlim,xlab,ylab,zlab,cex=cex.axis,...) cat("]}\n",file=vrml.file) cat("Group { children [\n",file=vrml.file) vrml.surface.wrl(x,y,z,color=vrml.color(col),...) cat("]}\n",file=vrml.file) vrml.close() } na.dummy <- function(x) { # replace NAs with a dummy value if(is.data.frame(x)) return(apply.df(x,na.dummy)) x[which(is.na(x))] = 666 x } vrml.surface.wrl <- function(x,y,z,border=F, creaseAngle=10,...,file=vrml.file) { # turn off LOD in the viewer or you will get weird effects points = expand.grid(x,y) points = cbind(points,as.vector(z)) points = vrml.to.scale(points) face = 0 nx = length(x) ny = length(y) index = array(0,c((nx-1)*(ny-1)*2,4)) for(i in 0:(ny-2)) { for(j in 0:(nx-2)) { face = face + 1 index[face,] = c(i*nx + j+1, i*nx + j, (i+1)*nx + j, -1) if(any(is.na(points[index[face,1:3]+1,3]))) face = face - 1 face = face + 1 index[face,] = c(i*nx + j+1, (i+1)*nx + j, (i+1)*nx + j+1, -1) if(any(is.na(points[index[face,1:3]+1,3]))) face = face - 1 } } index = index[1:face,] cat("Shape { appearance ",file=file) vrml.material.wrl(...,file=file) cat("geometry IndexedFaceSet { solid FALSE creaseAngle",creaseAngle,"\n", "coord Coordinate {\n", "point [",t(na.dummy(points)),"]\n }\n", "coordIndex [",t(index),"]\n}}\n",file=file) if(border) { points[,3] = points[,3] + 4e-3 index = array(0,c((nx-1)*(ny-1),5)) face = 0 for(i in 0:(ny-2)) { for(j in 0:(nx-2)) { face = face + 1 index[face,] = c(i*nx + j+1, i*nx + j, (i+1)*nx + j, (i+1)*nx + j+1, -1) if(any(is.na(points[index[face,1:4]+1,3]))) face = face - 1 } } index = index[1:face,] cat("Shape { appearance ",file=file) # only emissiveColor is used vrml.material.wrl(emissiveColor=vrml.color(1),file=file) cat("geometry IndexedLineSet {", "coord Coordinate {\n", "point [",t(na.dummy(points)),"]\n }\n", "coordIndex [",t(index),"]\n}}\n",file=file) } } vrml.array.wrl <- function(m,file=vrml.file) { if(is.data.frame(m)) m = data.matrix(m) for(i in 1:nrow(m)) { cat(m[i,],", ",file=file) } }