I am having a problem using the predict() function of a mgcv::gam (training) model on a new (testing) dataset. The problem arises because of an mrf smooth I have integrated to account for the spatial nature of my data.
I use the following call to create my GAM model
## Run GAM with MRF
m <- gam(crime ~ s(district, k=nrow(traindata),
bs='mrf', xt=list(nb=nbtrain)), #define MRF smooth
data = traindata,
method = 'REML',
family = scat(), #fit scaled t distribution
gamma = 1.4
)
where I predict the dependent variable crime using the neighbourhood structure, passed to the model via the smooth-term argument xt. The neighbourhood structure comes as an nb object that I created with the poly2nb() function.
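For the mrf basis, mgcv matches the entries of the nb list to the levels of the district factor by name, so the two have to line up. A minimal sanity check, assuming the traindata and nbtrain objects used in the fit above, might look like this:
## Check that every district level has a matching neighbourhood entry
all(levels(traindata$district) %in% names(nbtrain)) #Should return TRUE
setdiff(levels(traindata$district), names(nbtrain)) #Levels without a neighbourhood entry (should be empty)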
Now, if I want to use predict() on a new testing dataset, I don't know how to pass the corresponding neighbourhood structure into the call. Providing just the new data
pred <- predict.gam(m,newdata=testdata)
throws the following error:
Error in predict.gam(m, newdata = testdata) :
7, 16, 20, 28, 35, 36, 37, 43 not in original fit
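The numbers listed in the error are district levels that occur in testdata but were not part of the training fit, so the MRF smooth has no coefficients for them. A quick way to see this, assuming the traindata/testdata split from the reproduction below:
## List the district levels that only occur in the test sample
setdiff(unique(as.character(testdata$district)), unique(as.character(traindata$district)))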
Here's a full reproduction of the error using the Columbus dataset that ships with mgcv:
#ERROR REPRODUCTION
## Load packages
require(mgcv)
require(spdep)
require(dplyr)
## Load Columbus Ohio crime data (see ?columbus for details and credits)
data(columb.polys) #Columbus district shapes list
columb.polys <- lapply(columb.polys,na.omit) #omit NAs (unfortunate problem with the Columbus sample data)
data(columb) #Columbus data frame
df <- data.frame(district=numeric(0),x=numeric(0),y= numeric(0)) #Create empty df to store x, y and IDs for each polygon
## Extract x and y coordinates from each polygon and assign district ID
for (i in 1:length(columb.polys)) {
district <- i-1
x <- columb.polys[[i]][,1]
y <- columb.polys[[i]][,2]
df <- rbind(df,cbind(district,x,y)) #Save in df data.frame
}
## Convert df into SpatialPolygons
sp <- df %>%
group_by(district) %>%
do(poly=select(., x, y) %>%Polygon()) %>%
rowwise() %>%
do(polys=Polygons(list(.$poly),.$district)) %>%
{SpatialPolygons(.$polys)}
## Merge SpatialPolygons with data
spdf <- SpatialPolygonsDataFrame(sp,columb)
## Split into training and test sample (80/20 ratio)
splt <- sample(1:2,size=nrow(spdf),replace=TRUE,prob=c(0.8,0.2))
train <- spdf[splt==1,]
test <- spdf[splt==2,]
## Prepare both samples and create NB objects
traindata <- train@data #Extract data from SpatialPolygonsDataFrame
testdata <- test@data
traindata <- droplevels(as(train, 'data.frame')) #Drop levels
testdata <- droplevels(as(test, 'data.frame'))
traindata$district <- as.factor(traindata$district) #Factorize
testdata$district <- as.factor(testdata$district)
nbtrain <- poly2nb(train, row.names=as.character(train$district), queen=FALSE) #Create NB objects for training and test sample, using district IDs as region names
nbtest <- poly2nb(test, row.names=as.character(test$district), queen=FALSE)
names(nbtrain) <- attr(nbtrain, "region.id") #Set names to the region.id (the district IDs)
names(nbtest) <- attr(nbtest, "region.id")
## Run GAM with MRF
m <- gam(crime ~ s(district, k=nrow(traindata), bs = 'mrf',xt = list(nb = nbtrain)), # define MRF smooth
data = traindata,
method = 'REML', # REML smoothness selection; alternatively 'GCV.Cp'
family = scat(), #fit scaled t distribution
gamma = 1.4
)
## Run prediction using new testing data
pred <- predict.gam(m,newdata=testdata)
SOLUTION:
I finally found the time to update this post with the solution. Thanks to everyone for helping me out. The trick is to fit the model on the full dataset (data, with its complete neighbourhood structure nb) and give the held-out observations a weight of zero, so every district is part of the original fit and predict() no longer encounters unseen levels. Here is the code for implementing k-fold CV with a random training-testing split:
#Apply k-fold cross validation
mses <- data.frame() #Create empty df to store CV squared error values
scores <- data.frame() #Create empty df to store CV R2 values
set.seed(42) #Set seed for reproducibility
k <- 10 #Define number of folds
for (i in 1:k) {
# Create weighting column
data$weight <- sample(c(0,1),size=nrow(data),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample
#Run GAM with MRF
ctrl <- gam.control(nthreads = 6) #Set controls
m <- gam(crime ~ s(district, k=nrow(data), bs = 'mrf', xt = list(nb = nb)), #define MRF smooth over the full neighbourhood structure
data = data,
weights = data$weight, #Use only weight==1 observations (training)
method = 'REML',
control = ctrl,
family = scat(),
gamma = 1.4
)
#Generate test dataset
testdata <- data[data$weight==0,] #Select test data by weight
#Predict test data
pred <- predict(m,newdata=testdata)
#Extract MSES
mses[i,1] <- mean((data$crime[data$weight==0] - pred)^2) #Compare predictions with observed crime in the test sample
scores[i,1] <- summary(m)$r.sq
}
av.mse.GMRF <- mean(mses$V1)
av.r2.GMRF <- mean(scores$V1)
I have one criticism of the current solution: the full dataset was used to "train" the model, which means the predictions are biased because the test data were part of the fit.
This only requires a couple of minor tweaks to fix:
#Apply k-fold cross validation
mses <- data.frame() #Create empty df to store CV squared error values
scores <- data.frame() #Create empty df to store CV R2 values
set.seed(42) #Set seed for reproducibility
k <- 10 #Define number of folds
#For loop for each fold
for (i in 1:k) {
# Create weighting column
data$weight <- sample(c(0,1),size=nrow(data),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample
#Generate training dataset
trainingdata <- data[data$weight == 1, ] #Select training data by weight
#Generate test dataset
testdata <- data[data$weight == 0, ] #Select test data by weight
#Run GAM with MRF
ctrl <- gam.control(nthreads = 6) #Set controls
m <- gam(crime ~ s(district, k=nrow(data), bs = 'mrf', xt = list(nb = nb)), #define MRF smooth over the full neighbourhood structure
data = trainingdata,
weights = trainingdata$weight, #All rows in the training subset have weight 1
method = 'REML',
control = ctrl,
family = scat(),
gamma = 1.4
)
#Predict test data
pred <- predict(m,newdata = testdata)
#Extract MSES
mses[i,1] <- mean((testdata$crime - pred)^2) #Compare predictions with observed crime in the test sample
scores[i,1] <- summary(m)$r.sq
}
#Get average scores from each k-fold test
av.mse.GMRF <- mean(mses$V1)
av.r2.GMRF <- mean(scores$V1)
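As a side note, the loop above expects a data frame data covering all districts and a matching neighbourhood list nb. Here is a minimal sketch, under the assumption that the spdf object from the Columbus reproduction above is available, of how those objects could be built:
## Sketch: build the full-data objects the CV loop expects (assumes spdf from the reproduction above)
data <- droplevels(as(spdf, 'data.frame')) #Data frame covering all districts
data$district <- as.factor(data$district) #Factorize district IDs
nb <- poly2nb(spdf, row.names=as.character(data$district), queen=FALSE) #Neighbourhood structure for all districts
names(nb) <- attr(nb, "region.id") #Names must match the district factor levels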