7.5 An example of causal forest

rm(list = ls())
library(devtools)
#devtools::install_github("grf-labs/grf", subdir = "r-package/grf")
library(grf)
library(ggplot2)


# generate data
n <- 2000
p <- 10
X <- matrix(rnorm(n * p), n, p)
X.test <- matrix(0, 101, p)
X.test[, 1] <- seq(-2, 2, length.out = 101)

# Train a causal forest.
W <- rbinom(n, 1, 0.4 + 0.2 * (X[, 1] > 0))
Y <- pmax(X[, 1], 0) * W + X[, 2] + pmin(X[, 3], 0) + rnorm(n)

# Train a causal forest 

c.forest  <-  causal_forest(X, Y, W)

# predict using the training data using out-of-bag prediction
tau.hat.oob  <- predict(c.forest)
hist(tau.hat.oob$predictions)

# Estimate treatment effects for the test sample 
tau.hat  <- predict(c.forest, X.test)
plot(X.test[, 1], tau.hat$predictions, ylim = range(tau.hat$predictions, 0, 2), xlab = "x", ylab = "tau", type = "l")
lines(X.test[, 1], pmax(0, X.test[, 1]), col = 2, lty = 2)

# estimate conditional average treatment effect (CATE) on the full sample 
cate  <- average_treatment_effect(c.forest, target.sample = "all")
print(paste("Conditinal Average Treatment Effect (CATE) is: ", cate[[1]]))

## [1] "Conditinal Average Treatment Effect (CATE) is:  0.405843073161648"

# estimate conditional average treatment effect on treated 
catt  <- average_treatment_effect(c.forest, target.sample = "treated")
paste("Conditional Average Treatment Effect on the Treated (CATT)", catt[[1]])

## [1] "Conditional Average Treatment Effect on the Treated (CATT) 0.492339168239352"

# Add confidence intervals for heterogeneous treatment effects; growing more trees recommended 

tau.forest  <- causal_forest(X, Y, W, num.trees = 4000)
tau.hat  <- predict(tau.forest, X.test, estimate.variance = TRUE) # for the test sample 

ul  <- tau.hat$predictions + 1.96 * sqrt(tau.hat$variance.estimates)
ll  <- tau.hat$predictions - 1.96 * sqrt(tau.hat$variance.estimates)

tau.hat$ul  <-  ul 
tau.hat$ll  <- ll 
tau.hat$X.test  <- X.test[,1]

ggplot(data = tau.hat, aes(x = X.test, y = predictions)) + 
geom_ribbon(aes(ymin = ll, ymax = ul), fill = "grey70") + geom_line(aes(y = predictions)) + 
theme_bw()

######################################################
#
#
# In some cases prefitting Y and W separately may 
# be helpful. Say they use different covariates.
#
######################################################

# Generate a new data
n <- 4000
p <- 20
X <- matrix(rnorm(n * p), n, p)
TAU <- 1 / (1 + exp(-X[, 3]))
W <- rbinom(n, 1, 1 / (1 + exp(-X[, 1] - X[, 2]))) # X[, 1] and X[, 2] influence W
Y <- pmax(X[, 2] + X[, 3], 0) + rowMeans(X[, 4:6]) / 2 + W * TAU + rnorm(n) # X[, 2], X[, 3], X[, 4:6] influence Y. So different set of Xs influence Y



# Build a separate forest for Y and W 
forest.W  <-  regression_forest(X, W, tune.parameters = "all")
W.hat  <- predict(forest.W)$predictions # this gives us the estimated propensity score (probability of treated)
#plot(W.hat, X[, 1], col = as.factor(W))
#plot(W.hat, X[, 2], col = as.factor(W))

forest.Y  <- regression_forest(X, Y, tune.parameters = "all") # note that W is not used here 
Y.hat  <- predict(forest.Y)$predictions # this gives the conditional mean of Y or m(x)
#plot(Y, Y.hat)

forest.Y.varimp  <- variable_importance(forest.Y)
forest.Y.varimp

##              [,1]
##  [1,] 0.002939109
##  [2,] 0.464324588
##  [3,] 0.386426876
##  [4,] 0.040967280
##  [5,] 0.025417033
##  [6,] 0.053186112
##  [7,] 0.002563865
##  [8,] 0.001711964
##  [9,] 0.001464095
## [10,] 0.001727238
## [11,] 0.002546653
## [12,] 0.001280970
## [13,] 0.002841205
## [14,] 0.002328143
## [15,] 0.001422961
## [16,] 0.001131292
## [17,] 0.001542447
## [18,] 0.002144746
## [19,] 0.002829431
## [20,] 0.001203992

# selects the important variables
selected.vars  <- which(forest.Y.varimp / mean(forest.Y.varimp) > 0.2)
selected.vars

## [1] 2 3 4 5 6

# Trains a causal forest
tau.forest  <- causal_forest(X[, selected.vars], Y, W, 
                    W.hat = W.hat, Y.hat = Y.hat, # specify e(x) and m(x)
                    tune.parameters = "all")


# See if a causal forest succeeded in capturing heterogeneity by plotting
# the TOC and calculating a 95% CI for the AUTOC.
train <- sample(1:n, n / 2)
train.forest <- causal_forest(X[train, ], Y[train], W[train])
eval.forest <- causal_forest(X[-train, ], Y[-train], W[-train])
rate <- rank_average_treatment_effect(eval.forest,
                                      predict(train.forest, X[-train, ])$predictions)
rate

##      estimate    std.err             target
##  -0.002787873 0.04889062 priorities | AUTOC

plot(rate)

paste("AUTOC:", round(rate$estimate, 2), "+/", round(1.96 * rate$std.err, 2))

## [1] "AUTOC: 0 +/ 0.1"