# Trees

# Simulate a small regression data set whose response is quadratic in both
# predictors plus Gaussian noise.
# The sample size is named once so the three draws cannot drift out of sync.
# The order of the random draws (x1, x2, e) is kept fixed so that the seed
# reproduces the same values.
set.seed(47)
n_obs <- 50
x1 <- runif(n_obs, min = 0, max = 10)
x2 <- runif(n_obs, min = -10, max = 10)
e <- rnorm(n_obs, mean = 0, sd = 5)
response <- 20 + x1^2 + x2^2 + e

# Collect the predictors and the response into one tibble for plotting and
# modelling. The tibble is built column-wise: going through cbind() would
# first create a matrix, which forces every column to a single common type.
rfdata <- tibble(x1 = x1, x2 = x2, response = response)

# Predictor space: x1 against x2, with each point shaded by its response.
ggplot(data = rfdata, mapping = aes(x = x1, y = x2, color = response)) +
  geom_point(size = 2)

# Regression tree of response on x1 and x2, limited to depth 1 (one split).
# The seed is reset before fitting, presumably so that the cross-validated
# xerror / xstd columns printed by summary() are reproducible.
set.seed(47)
rfmodel1 <- rpart(
  formula = response ~ x1 + x2,
  data = rfdata,
  control = rpart.control(maxdepth = 1)
)
summary(rfmodel1)
## Call:
## rpart(formula = response ~ x1 + x2, data = rfdata, control = rpart.control(maxdepth = 1))
##   n= 50 
## 
##          CP nsplit rel error    xerror       xstd
## 1 0.5177082      0 1.0000000 1.0346449 0.18884709
## 2 0.0100000      1 0.4822918 0.5233518 0.08016401
## 
## Variable importance
##  x1 
## 100 
## 
## Node number 1: 50 observations,    complexity param=0.5177082
##   mean=88.76254, MSE=1705.718 
##   left son=2 (39 obs) right son=3 (11 obs)
##   Primary splits:
##       x1 < 7.551607  to the left,  improve=0.5177082, (0 missing)
##       x2 < -8.102921 to the right, improve=0.3737130, (0 missing)
## 
## Node number 2: 39 observations
##   mean=72.98061, MSE=806.5541 
## 
## Node number 3: 11 observations
##   mean=144.7167, MSE=879.7351
# Draw the fitted depth-1 tree.
rpart.plot(rfmodel1)

# Scatter of the predictors coloured by response, with the root split
# (x1 < 7.551607, drawn at 7.6) overlaid as a vertical line.
# xintercept is passed as a fixed parameter rather than inside aes(): a
# constant inside aes() is evaluated against every row of rfdata, so the same
# line would be drawn once per observation.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2) +
  geom_vline(xintercept = 7.6)

# Regression tree of response on x1 and x2, limited to depth 2.
# The seed is reset before fitting, presumably so that the cross-validated
# xerror / xstd columns printed by summary() are reproducible.
set.seed(47)
rfmodel2 <- rpart(
  formula = response ~ x1 + x2,
  data = rfdata,
  control = rpart.control(maxdepth = 2)
)
summary(rfmodel2)
## Call:
## rpart(formula = response ~ x1 + x2, data = rfdata, control = rpart.control(maxdepth = 2))
##   n= 50 
## 
##          CP nsplit rel error    xerror       xstd
## 1 0.5177082      0 1.0000000 1.0346449 0.18884709
## 2 0.1621953      1 0.4822918 0.5233518 0.08016401
## 3 0.0100000      2 0.3200965 0.4045381 0.05519120
## 
## Variable importance
## x1 x2 
## 76 24 
## 
## Node number 1: 50 observations,    complexity param=0.5177082
##   mean=88.76254, MSE=1705.718 
##   left son=2 (39 obs) right son=3 (11 obs)
##   Primary splits:
##       x1 < 7.551607  to the left,  improve=0.5177082, (0 missing)
##       x2 < -8.102921 to the right, improve=0.3737130, (0 missing)
## 
## Node number 2: 39 observations,    complexity param=0.1621953
##   mean=72.98061, MSE=806.5541 
##   left son=4 (32 obs) right son=5 (7 obs)
##   Primary splits:
##       x2 < -7.067915 to the right, improve=0.4397617, (0 missing)
##       x1 < 3.696997  to the left,  improve=0.2901931, (0 missing)
## 
## Node number 3: 11 observations
##   mean=144.7167, MSE=879.7351 
## 
## Node number 4: 32 observations
##   mean=64.17216, MSE=470.8777 
## 
## Node number 5: 7 observations
##   mean=113.2478, MSE=364.936
# Draw the fitted depth-2 tree.
rpart.plot(rfmodel2)

# Scatter of the predictors coloured by response, with the depth-2 partition:
#   * root split x1 < 7.551607, drawn as a vertical line at 7.6;
#   * split of the left region at x2 = -7.067915, drawn at -7.1.
# The boundaries are fixed values, so they are supplied as parameters /
# annotations instead of layers that inherit rfdata; otherwise each line is
# drawn once per observation. -Inf makes the horizontal boundary start at the
# left edge of the panel rather than at a hard-coded x value.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2) +
  geom_vline(xintercept = 7.6) +
  annotate("segment", x = -Inf, xend = 7.6, y = -7.1, yend = -7.1)

# Regression tree of response on x1 and x2, limited to depth 3.
# The seed is reset before fitting, presumably so that the cross-validated
# xerror / xstd columns printed by summary() are reproducible.
set.seed(47)
rfmodel3 <- rpart(
  formula = response ~ x1 + x2,
  data = rfdata,
  control = rpart.control(maxdepth = 3)
)
summary(rfmodel3)
## Call:
## rpart(formula = response ~ x1 + x2, data = rfdata, control = rpart.control(maxdepth = 3))
##   n= 50 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.51770823      0 1.0000000 1.0346449 0.18884709
## 2 0.16219526      1 0.4822918 0.5233518 0.08016401
## 3 0.09159405      2 0.3200965 0.4838951 0.07945320
## 4 0.01000000      3 0.2285025 0.3265122 0.04880900
## 
## Variable importance
## x1 x2 
## 77 23 
## 
## Node number 1: 50 observations,    complexity param=0.5177082
##   mean=88.76254, MSE=1705.718 
##   left son=2 (39 obs) right son=3 (11 obs)
##   Primary splits:
##       x1 < 7.551607  to the left,  improve=0.5177082, (0 missing)
##       x2 < -8.102921 to the right, improve=0.3737130, (0 missing)
## 
## Node number 2: 39 observations,    complexity param=0.1621953
##   mean=72.98061, MSE=806.5541 
##   left son=4 (32 obs) right son=5 (7 obs)
##   Primary splits:
##       x2 < -7.067915 to the right, improve=0.4397617, (0 missing)
##       x1 < 3.696997  to the left,  improve=0.2901931, (0 missing)
## 
## Node number 3: 11 observations
##   mean=144.7167, MSE=879.7351 
## 
## Node number 4: 32 observations,    complexity param=0.09159405
##   mean=64.17216, MSE=470.8777 
##   left son=8 (23 obs) right son=9 (9 obs)
##   Primary splits:
##       x1 < 5.50053   to the left,  improve=0.5184256, (0 missing)
##       x2 < 5.203514  to the left,  improve=0.2453159, (0 missing)
##   Surrogate splits:
##       x2 < 6.625161  to the left,  agree=0.781, adj=0.222, (0 split)
## 
## Node number 5: 7 observations
##   mean=113.2478, MSE=364.936 
## 
## Node number 8: 23 observations
##   mean=54.39855, MSE=237.4702 
## 
## Node number 9: 9 observations
##   mean=89.14914, MSE=199.399
# Draw the fitted depth-3 tree.
rpart.plot(rfmodel3)

# Scatter of the predictors coloured by response, with the depth-3 partition:
#   * root split x1 < 7.551607, drawn as a vertical line at 7.6;
#   * split at x2 = -7.067915 (drawn at -7.1) for x1 left of the root split;
#   * split at x1 = 5.50053 (drawn at 5.5) for x2 above -7.1.
# The boundaries are fixed values, so they are supplied as parameters /
# annotations instead of layers that inherit rfdata; otherwise each line is
# drawn once per observation. -Inf / Inf stand for the panel edges, replacing
# the hard-coded start at 0 and the out-of-range end point 11.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2) +
  geom_vline(xintercept = 7.6) +
  annotate("segment", x = 5.5, xend = 5.5, y = -7.1, yend = Inf) +
  annotate("segment", x = -Inf, xend = 7.6, y = -7.1, yend = -7.1)

# Regression tree of response on x1 and x2, limited to depth 4.
# The seed is reset before fitting, presumably so that the cross-validated
# xerror / xstd columns printed by summary() are reproducible.
set.seed(47)
rfmodel4 <- rpart(
  formula = response ~ x1 + x2,
  data = rfdata,
  control = rpart.control(maxdepth = 4)
)
summary(rfmodel4)
## Call:
## rpart(formula = response ~ x1 + x2, data = rfdata, control = rpart.control(maxdepth = 4))
##   n= 50 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.51770823      0 1.0000000 1.0346449 0.18884709
## 2 0.16219526      1 0.4822918 0.5233518 0.08016401
## 3 0.09159405      2 0.3200965 0.4838951 0.07945320
## 4 0.02354361      3 0.2285025 0.3265122 0.04880900
## 5 0.01000000      4 0.2049589 0.3185294 0.05248709
## 
## Variable importance
## x1 x2 
## 75 25 
## 
## Node number 1: 50 observations,    complexity param=0.5177082
##   mean=88.76254, MSE=1705.718 
##   left son=2 (39 obs) right son=3 (11 obs)
##   Primary splits:
##       x1 < 7.551607  to the left,  improve=0.5177082, (0 missing)
##       x2 < -8.102921 to the right, improve=0.3737130, (0 missing)
## 
## Node number 2: 39 observations,    complexity param=0.1621953
##   mean=72.98061, MSE=806.5541 
##   left son=4 (32 obs) right son=5 (7 obs)
##   Primary splits:
##       x2 < -7.067915 to the right, improve=0.4397617, (0 missing)
##       x1 < 3.696997  to the left,  improve=0.2901931, (0 missing)
## 
## Node number 3: 11 observations
##   mean=144.7167, MSE=879.7351 
## 
## Node number 4: 32 observations,    complexity param=0.09159405
##   mean=64.17216, MSE=470.8777 
##   left son=8 (23 obs) right son=9 (9 obs)
##   Primary splits:
##       x1 < 5.50053   to the left,  improve=0.5184256, (0 missing)
##       x2 < 5.203514  to the left,  improve=0.2453159, (0 missing)
##   Surrogate splits:
##       x2 < 6.625161  to the left,  agree=0.781, adj=0.222, (0 split)
## 
## Node number 5: 7 observations
##   mean=113.2478, MSE=364.936 
## 
## Node number 8: 23 observations,    complexity param=0.02354361
##   mean=54.39855, MSE=237.4702 
##   left son=16 (15 obs) right son=17 (8 obs)
##   Primary splits:
##       x2 < 4.075384  to the left,  improve=0.36763210, (0 missing)
##       x1 < 3.705048  to the left,  improve=0.08479596, (0 missing)
## 
## Node number 9: 9 observations
##   mean=89.14914, MSE=199.399 
## 
## Node number 16: 15 observations
##   mean=47.575, MSE=200.7863 
## 
## Node number 17: 8 observations
##   mean=67.19272, MSE=55.26009
# Draw the fitted depth-4 tree.
rpart.plot(rfmodel4)

# Scatter of the predictors coloured by response, with the depth-4 partition:
#   * root split x1 < 7.551607, drawn as a vertical line at 7.6;
#   * split at x2 = -7.067915 (drawn at -7.1) for x1 left of the root split;
#   * split at x1 = 5.50053 (drawn at 5.5) for x2 above -7.1;
#   * split at x2 = 4.075384 (drawn at 4.1) for x1 left of 5.5.
# The boundaries are fixed values, so they are supplied as parameters /
# annotations instead of layers that inherit rfdata; otherwise each line is
# drawn once per observation. -Inf / Inf stand for the panel edges, replacing
# the hard-coded start at 0 and the out-of-range end point 11.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2) +
  geom_vline(xintercept = 7.6) +
  annotate("segment", x = 5.5, xend = 5.5, y = -7.1, yend = Inf) +
  annotate("segment", x = -Inf, xend = 7.6, y = -7.1, yend = -7.1) +
  annotate("segment", x = -Inf, xend = 5.5, y = 4.1, yend = 4.1)