5.5 Using cross-fitting to predict propensity score
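Here, we will use 10-fold cross-fitting to predict the propensity score: for each fold, the model is trained on the other nine folds and then used to predict the propensity score for the held-out fold.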
# Fit a binomial GLM propensity-score model on every row of `dat` except the
# held-out fold, then predict class probabilities for the held-out rows.
#
# @Arg predictfold: integer vector of row indices of `dat` forming the
#   held-out fold; these rows are excluded from model training and are used
#   only for prediction.
# @Return: numeric vector with one predicted probability per held-out row,
#   taken from the second column of the class-probability output
#   (presumably the treated class -- confirm against the factor levels of W).
#
# Note: reads the data frame `dat` from the enclosing environment. Despite
# "probit" in the name, family = "binomial" uses its default (logit) link.
fun_probit_predict <- function(predictfold) {
  # dat[-integer(0), ] selects zero rows, so an empty fold would silently
  # train on no data; fail loudly instead.
  stopifnot(length(predictfold) > 0)

  cv_model1 <- train(
    W ~ X1 + X2 + X3 + X4,
    data = dat[-predictfold, ],
    method = "glm",
    family = "binomial",
    trControl = trainControl(method = "cv", number = 10)
  )

  predict_logit <- predict(cv_model1, dat[predictfold, ], type = "prob")
  predict_logit[, 2]
}
##############################
#
# cross-fitting
#
##############################
k <- 10 # number of folds
len <- nrow(dat)

# random permutation of the row positions
ind <- sample(seq_len(len), replace = FALSE, size = len)

# split positions 1..len into k contiguous, (nearly) equal-sized fold labels
fold <- cut(seq_len(len), breaks = k, labels = FALSE) # create 10 folds

# shuffle the labels so that fold membership is random across rows;
# fold[j] is the fold that row j of dat belongs to
fold <- fold[ind] # randomly allocate the folds by ind
# containers to store the predicted values and the row indices they belong to
# (preallocated per fold and flattened after the loop, instead of growing
# vectors with c() on every iteration)
store_by_fold <- vector("list", k)
index_by_fold <- vector("list", k)

# do the cross-fitting and store
for (i in seq_len(k)) {
  # which(fold == i) is used as an index: if the 8th observation is the first
  # one assigned to fold 1, then the 1st prediction value of fold 1
  # corresponds to the 8th obs
  true_index_new <- which(fold == i)
  store_new <- fun_probit_predict(predictfold = true_index_new)

  # as.numeric() alone is enough here; a round-trip through as.character()
  # keeps only 15 significant digits and can truncate the predictions
  store_by_fold[[i]] <- as.numeric(store_new)
  index_by_fold[[i]] <- true_index_new
}

# predictions and their row indices, in matching (fold-by-fold) order
store <- unlist(store_by_fold)
true_index <- unlist(index_by_fold)
# create a dataframe with index that maps the predictions with the actual data
store <- data.frame(pscore = store, index = true_index)

# sort by index so that row j of store lines up with row j of dat
store <- store[order(store$index), ]

# every row of dat must have received exactly one prediction, otherwise the
# positional assignment below would attach scores to the wrong observations
stopifnot(
  nrow(store) == nrow(dat),
  all(store$index == seq_len(nrow(dat)))
)

# propensity score
dat <- dat %>%
  mutate(pscore = store$pscore)

# histogram of propensity score
hist(dat$pscore, main = "propensity score \n from cross-fitting")