Log Related to Density Estimation & Nonparametric Regression ================================= 4/28/04 updated 11/30/2009 ## Examples use "galaxies" data (in web-page ## data-directory) along with "geyser" ## data supplied in standard Splus6.0 ## and in library "MASS" within R Selected statements/functions for class use. -------------------------------------------- ### > galaxies = scan("galaxies.dat", skip=3)/1000 > galaxies = galaxies/1000 > summary(galaxies) Min. 1st Qu. Median Mean 3rd Qu. Max. 9.172 19.53 20.83 20.83 23.13 34.28 > hist(galaxies, nclass=16, prob=T, xlim=c(4,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= "Gaussian Window, Default") > dens1 = density(galaxies) lines(dens1) > hist(galaxies, nclass=16, prob=T, xlim=c(4,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= "Rectang Window, Default") lines(density(galaxies,window="r")) > pts <- dens1\$x kerMat <- outer(pts, galaxies, function(x,y) dlogis(x,y,.2)) > dim(kerMat) [1] 50 82 > dens3 = cbind(pts, c(kerMat %*% rep(1/82,82))) === Next a little bit on cross-validated bandwidth selection === WHAT WE DISCUSS HERE IS "LEAST-SQUARES CROSS-VALIDATION". More details can be found in: W Haerdle, SMOOTHING TECHNIQUES WITH IMPLEMENTATION IN S (Springer-Verlag book, 1991), Sec 4.3.2 Scott, D. and Terrell, G. (1987) JASA v.82, p.1131 article. > LgisCV <- function(b, Pts) { ### little function to evaluate least-sq error criterion ### for galaxies data with bandwidth b kertmp <- outer(Pts, galaxies, function(x,y,B) dlogis(x,y,B), B=b) denstmp <- cbind(Pts, c(kertmp %*% rep(1/82,82))) delta <- c(range(Pts) %*% c(-1,1))/(length(Pts)-1) intsq <- delta*(sum(denstmp[,2]^2) - 0.5*(denstmp[1,2]^2+ denstmp[length(Pts),2]^2)) ### The numerical integration here uses trapezoid rule. penalty <- numeric(82) for(i in 1:82) penalty[i] <- mean( dlogis(galaxies[i],galaxies[-i],b)) intsq - 2*mean(penalty) } ### NOTE: minimizing this function gives a ### "Cross-Validated Bandwidth" b .
> LgisCV(.2,pts) -0.0998805 > LgisCV(.5,pts) -0.1042199 > LgisCV(.8,pts) -0.09922277 > optimize(LgisCV, c(.1,1.5), Pts=pts)[1:2] \$minimum: [1] 0.3783803 \$objective: [1] -0.1052062 ### Thus the optimized bandwidth is 0.3784: > lines(pts, c(outer(pts, galaxies, function(x,y) dlogis(x,y, 0.3784)) %*% rep(1/82,82)), lty=4) ### Adding this curve to either of the Gaussian or ### Rectangular default plots above shows a really ### good job of fitting !! ------------------------------------------------------------ ### Next: spline-based estimate, using all points as "knots". ### First get spline estimate of cdf, then density using ### deriv=1 in "predict" applied to smooth.spline object. > pts = density(galaxies)\$x[(1:50)*10] > rnkx = numeric(50) for (i in 1:50) rnkx[i] = sum(galaxies<= pts[i])/82 > splgalx = smooth.spline(pts, rnkx, spar=.4) > plot(pts,rnkx) lines(pts, predict(splgalx,pts)\$y) > par(mfrow=c(2,2)) spars = c(.1, .3, .6, .9) for(i in 1:4) { hist(galaxies, nclass=16, prob=T, xlim=c(7,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= paste("Spline Fit, spar=",spars[i],sep="")) splgalx = smooth.spline(pts, rnkx, spar=spars[i]) lines(pts, predict(splgalx,pts,1)\$y) } ### can also use "postscript" or "pdf" command to save ### graphs produced in this way, but on separate pages ### quit and produce graphs with final command dev.off(). 
----------------------------------------------------------- ### Now let's compare with mixture of logistics: > lLk = function(x, pwt, mn, scal, dens = dlogis) { invsc = 1/scal dnsmat = invsc*matrix(dens(invsc* outer(mn,x,"-")), ncol=length(x)) sum(log(pwt %*% dnsmat)) } > lLk(galaxies, c(.5,.5),c(20,30),c(2,2)) ## -268.8732 > tmpft = nlm(function(w) -lLk(galaxies,c(plogis(w[1]), 1-plogis(w[1])), w[2:3], exp(w[4:5])), c(0,20,30,2,2), print.level=1, iterlim=100) iteration = 0 Step: [1] 0 0 0 0 0 Parameter: [1] 0 20 30 2 2 Function Value [1] 296.6142 Gradient: [1] -6.6289576 -0.1329425 2.2508516 39.7330157 11.2657407 iteration = 23 Parameter: [1] 1.1663503 21.2954749 18.7628547 0.1574718 1.6160380 Function Value [1] 222.5247 Gradient: [1] 8.795656e-07 -2.276215e-06 -1.631197e-07 -5.599645e-06 6.281384e-06 Relative gradient close to zero. Current iterate is probably solution. > tmpft\$est [1] 1.1663496 21.2954746 18.7628558 0.1574711 1.6160374 ### So assign mean 21.295 with weight plogis(1.1663) = 0.762 ============================================================ Remainder of this log has selected Nonparametric Regression and Smoothing steps for class 4/30/04, updated 12/2/09 ---------------------------------------------------------- > gfram = cbind.data.frame(Wait=geyser\$waiting, Lgth=geyser\$duration) plot(gfram[,1], gfram[,2], xlab="Wait", ylab="Lgth", main= "Scatterplot of Geyser Data with Lowess, n=299") > for(i in 1:4) abline(v=42+i*12, lty=3) for(i in 1:5) { inds = (1:299)[abs(gfram\$Wait-42-12*i+6) < 6] itmp = order(gfram\$Wait[inds]) lines(gfram\$Wait[inds[itmp]], lm(Lgth ~ Wait, data=gfram[ inds,])\$fit[itmp], lty=6) } > tmplow <- lowess(gfram\$Wait,gfram\$Lgth) lines(tmplow\$x,tmplow\$y, lty=4) ### Now kernel-based nonparametric-regression fit: ### Take expectations near each of 50 evenly spaced points > xp = seq(40,110, length=50) kermat = outer(xp, gfram\$Wait, function(x,y) dnorm(x,y,1)) ### Our bandwidth is 1, using Gaussian kernel.
### Now create conditional expectation using kernel as weights > cexp <- c((kermat %*% gfram\$Lgth)/kermat %*% rep(1,299)) points(xp, cexp, pch=5) > tmpsp = smooth.spline(gfram\$Wait,gfram\$Lgth, all.knots=F, spar=.4) lines(tmpsp\$x, tmpsp\$y, lty=1) legend(locator(), legend=c("Piecewise linear","Lowess","Spline"), lty=c(6,4,1)) text(locator(),paste("Hollow diamonds are \n", "kernel-regression points")) > printgraph(file="geyserNPR.ps") ### changed in R !!! ----- Remains to use Cross-Validation to check -------------- Predictive Success of all of these modelling Strategies ! Method: leave 30 points out (at random), fit using the rest by all of the NPR methods (lowess, spline, kernel-regression). Repeat 1000 times (the run shown below uses 100 repetitions) to get averaged mean-square prediction error per obs ! ## Let's try to do cross-validation using only spline and kernel regression, since with lowess or the related function loess.smooth, we do not automatically have smoothed-function evaluations at newly specified points. (The documentation just suggests to use "approx" for lowess fits, or linear interpolation, to get smoothed-function approximations at new points.) > splerr = kererr = numeric(100) for (i in 1:100) { leftout = sample(299, 30) leftin = setdiff(1:299,leftout) tmpsp = smooth.spline(gfram\$Wait[leftin],gfram\$Lgth[leftin], all.knots=F, spar=0.4) splpred = predict(tmpsp, gfram\$Wait[leftout])\$y kermat = outer(gfram\$Wait[leftout], gfram\$Wait[leftin], function(x,y) dnorm(x,y,1)) yexp = c((kermat %*% gfram\$Lgth[leftin])/ kermat %*% rep(1,269)) splerr[i] = mean((splpred-gfram\$Lgth[leftout])^2) kererr[i] = mean((yexp-gfram\$Lgth[leftout])^2) } > summary(splerr) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.3695 0.6809 0.7945 0.8081 0.9059 1.3180 > length(splerr) [1] 100 > summary(kererr) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.3688 0.6761 0.7842 0.7856 0.8897 1.1940 ### This comparison seems to show that overall, the kernel-regression estimator is doing a little better than the spline.
Maybe we can broaden the comparison a little by trying the spline again, with smoothing parameter a little larger, and the kernel regression with a bandwidth optimized by least-squares cross-validation for the x-coordinate data alone ... > LgisCV = function(b, Pts, datavec) { ### little function to evaluate least-sq error criterion ### for data vector datavec with bandwidth b nd = length(datavec) kertmp = outer(Pts, datavec, function(x,y,B) dlogis(x,y,B), B=b) denstmp = cbind(Pts, c(kertmp %*% rep(1/nd,nd))) delta = c(range(Pts) %*% c(-1,1))/(length(Pts)-1) intsq = delta*(sum(denstmp[,2]^2) - 0.5*(denstmp[1,2]^2+ denstmp[length(Pts),2]^2)) ### The numerical integration here uses trapezoid rule. penalty = numeric(nd) for(i in 1:nd) penalty[i] = mean(dlogis(datavec[i],datavec[-i],b)) intsq - 2*mean(penalty) } > optimize(LgisCV, c(.1,2), Pts=45:105, datavec=gfram\$Wait)[1:2] \$minimum: [1] 1.25539 \$objective: [1] -0.02346868 > splerr2 = kererr2 = numeric(100) for (i in 1:100) { leftout = sample(299, 30) leftin = setdiff(1:299,leftout) tmpsp = smooth.spline(gfram\$Wait[leftin],gfram\$Lgth[leftin], all.knots=F, spar=0.6) splpred = predict(tmpsp, gfram\$Wait[leftout])\$y kermat = outer(gfram\$Wait[leftout], gfram\$Wait[leftin], function(x,y) dnorm(x,y,1.2554)) yexp = c((kermat %*% gfram\$Lgth[leftin])/ kermat %*% rep(1,269)) splerr2[i] = mean((splpred-gfram\$Lgth[leftout])^2) kererr2[i] = mean((yexp-gfram\$Lgth[leftout])^2) } > rbind(summary(splerr),summary(kererr), summary(splerr2), summary(kererr2)) Min. 1st Qu. Median Mean 3rd Qu. Max. 
[1,] 0.3695 0.6809 0.7945 0.8081 0.9059 1.318 [2,] 0.3688 0.6761 0.7842 0.7856 0.8897 1.194 [3,] 0.4380 0.6883 0.7976 0.7893 0.8768 1.187 [4,] 0.4487 0.7124 0.7942 0.8059 0.9052 1.208 ### Of these methods, the kernel NPR method with fixed bandwidth 1 (row 2) ### seems to be best (smallest mean error). The cross-validated bandwidth ### choice (row 4) was ### not precisely designed to do best with respect to the task at hand ### (predicting y's from smoothed conditional expectations given x) ### and it didn't !! ### "Cross-validated" bandwidth selection could be done with criterion ### equal to the mean squared prediction error, and then by ### definition it would be best, but one needs sophisticated large ### sample theory to say whether it is enough better to be ### worthwhile ...