5.5 Using cross-fitting to predict propensity score
Here, we will be using 10-fold cross-folding to predict propensity score.
fun_probit_predict <- function(predictfold){
# @Arg predictfold: number of the fold to avoid for model traning
# but used for prediction
cv_model1 <- train(
W ~ X1 + X2 + X3 + X4,
data = dat[-predictfold, ],
method = "glm",
family = "binomial",
trControl = trainControl(method = "cv", number = 10)
)
predict_logit <- predict(cv_model1, dat[predictfold, ], type = "prob")
return(predict_logit[, 2])
}
##############################
#
# cross-fitting
#
##############################
k <- 10 # number of folds
len <- nrow(dat)
ind <- sample(1:len, replace = FALSE, size = len)
fold <- cut(1:len, breaks = k, labels = FALSE) # create 10 folds
fold <- fold[ind] # randomly allocate the folds by ind
# container to store the predicted values
store <- c()
true_index <- c()
# do the cross-fitting and store
for(i in 1:k){
# which(fold == i) is used as an index, if 8th observation receives the 1st fold for the first time,
# then the 1st prediction value corresponds to the 8th obs
store_new <- fun_probit_predict(predictfold = which(fold == i))
store_new <- as.numeric(as.character(store_new))
true_index_new <- which(fold == i)
store <- c(store, store_new)
true_index <- c(true_index, true_index_new)
}
# create a dataframe with index that maps the predictions with the actual data
store <- data.frame(pscore = store, index = true_index)
# sort by index
store <- store[order(store[, 2]), ]
# propensity score
dat <- dat %>%
mutate(pscore = store$pscore)
# histogram of propensity score
hist(dat$pscore, main = "propensity score \n from cross-fitting")