Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Stopping extrapolation when predicting using two variables

Tags:

r

ggplot2

After model selection, my best model has two continuous variables with an interaction term.

My data looks like this:

df<-structure(list(Height_RR = c(1.09861228866811, -0.143100843640673, 
                             -0.990398704027877, -0.990398704027877, 0.0800427076735365, 0.916290731874155, 
                             1.04982212449868, 0.430782916092454, 0.575364144903562, 0.0953101798043248, 
                             -0.405465108108164, 0.132489183046079, 0.117783035656383, 1.04982212449868, 
                             -0.916290731874155, 1.20397280432594, -0.356674943938732, 0.287682072451781, 
                             -0.356674943938732, 0.287682072451781, -0.356674943938732, 0.287682072451781, 
                             -0.318387168583869, -0.318387168583869, -0.318387168583869, -0.356674943938732, 
                             -0.393042588109607, -0.485507815781701, -0.405465108108164, -0.441832752279039, 
                             0.42744401482694, 0.510825623765991, 1.04982212449868, 0.847297860387204, 
                             -0.356674943938732), 
               CWD = c(-173.850331373113, -1063.65743501896, 
               -1152.77456298827, -1200.14062275391, -1191.1454796875, -154.366681649412, 
              -160.237513231871, -148.917478721144, -655.155005728001, -1081.17124999999, 
              -192.437500000001, -664.538976252586, -540.755930464004, -230.945841787738, 
               -632.705104201392, -159.376784299775, -686.792144988283, -686.792144988283, 
               -686.792144988283, -686.792144988283, -686.792144988283, -686.792144988283, 
               -681.113600988285, -681.113600988285, -681.113600988285, -203.026923561875, 
               -32.6625046554887, -499.426709860029, -190.121159505859, -871.942483370128, 
              -988.569593505865, -248.140000031999, -165.422048285183, -165.422048285183, 
                -165.422048285183), 
              yi = c(0.201673327454905, 0.374187870755073, 
              -1.26459742614237, -3.4162352590267, -1.20397280432594, 0.943371690476826, 
              1.26291533995939, 0.551099522903931, 0.506212624114382, 1.62113395219729, 
              -0.272257602126149, 0.345151698858396, 0.104290175606367, 4.56284362374715, 
              0.532233017860013, 0.697137826765664, -1.01160091167848, -1.07044141170141, 
              0.646627164925052, -0.519875459285909, -0.646627164925052, -0.519875459285909, 
              -0.424883193965266, 0.074107972153722, 0.074107972153722, 0.693147180559945, 
              0.0707340508073709, 0.358554516423803, -0.682820066404096, -0.638455438611528, 
              0.410070448690588, 1.11317243999944, 1.26976054486394, 1.45111186119636, 
              1.44238382777093)), .Names = c("Height_RR", "CWD", "yi"), row.names = c(1L, 
              2L, 3L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
              17L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 
              31L, 33L, 34L, 35L, 36L, 37L, 38L), class = "data.frame")

And my model parameter estimates look like this:

                 estimate  se     zval    pval    ci.lb     ci.ub    
intrcpt          1.1422  0.3862   2.9575  0.0031   0.3852   1.8991  **
Height_RR        0.3412  0.1488   2.2925  0.0219   0.0495   0.6328   *
CWD              0.0014  0.0006   2.2698  0.0232   0.0002   0.0026   *
Height_RR:CWD   -0.0006  0.0002  -3.0161  0.0026  -0.0009  -0.0002  **

I want to display predictions from this result using a figure where both x and y axes are the continuous predictor variables and the predicted value is represented by a colour ramp. Using this code, I get the following figure

library(ggplot2)
new.data<-expand.grid(Height_RR=seq(min(df$Height_RR),max(df$Height_RR),length.out = 20),
                  CWD=seq(min(df$CWD,na.rm = T),max(df$CWD,na.rm = T),length.out = 20))

new.data$yi<-(new.data$Height_RR*0.3412)+1.1422+(0.0014*new.data$CWD)+((new.data$CWD*new.data$Height_RR)*0.0006)


ggplot(new.data,aes(x=Height_RR,y=CWD,fill=yi))+geom_raster()+scale_fill_continuous(low="light grey",high="black")

enter image description here

However, I don't have data for all regions of this plot and thus I am extrapolating for this model. The minimum convex hull and points on the figure below shows where my data is

enter image description here

My questions are:

  1. Is there a way to create a dataframe to limit my predictions to the region I have data for (i.e. within the polygon)?
  2. If not, is there a better way to display interactions between two continuous variables?
like image 801
Phil_Martin Avatar asked Mar 17 '16 10:03

Phil_Martin


1 Answers

Here's an idea, I hope I got it right..

# Create a convex hull of the points in df and turn that into a polygon of class SpatialPolygons:

    ch <- chull(df$Height_RR, df$CWD)
    poly.df <- df[c(ch, ch[1]),]
    poly <- SpatialPolygons(list(Polygons(list(Polygon(as.matrix(poly.df[,1:2]))),1)))


# Create a SpatialPointsDataFrame with new.data:

    library(sp)
    new.data.poly <- new.data
    coordinates(new.data.poly) <- ~Height_RR+CWD

# Extract the points in new.data which are covered by the polygon:

    new.data$inp <- over(new.data.poly, poly)
    new.data <- new.data[complete.cases(new.data),]

# Calculate yi as you did:

    new.data$yi<-(new.data$Height_RR*0.3412) + 1.1422 + (0.0014*new.data$CWD) + ((new.data$CWD*new.data$Height_RR)*0.0006)

# Plot:    
    ggplot(new.data, aes(x=Height_RR,y=CWD,fill=yi)) +
    geom_raster() + 
    scale_fill_continuous(low="light grey",high="black")

enter image description here

like image 195
erc Avatar answered Oct 14 '22 11:10

erc