# Trees
# Simulate 50 observations: two uniform predictors and a response that is
# quadratic in both predictors plus additive Gaussian noise.
set.seed(47)  # fixed seed so the simulated values are reproducible

# Predictors: x1 is uniform on [0, 10], x2 is uniform on [-10, 10].
x1 <- runif(n = 50, min = 0, max = 10)
x2 <- runif(n = 50, min = -10, max = 10)

# Noise term: normal with mean 0 and standard deviation 5.
e <- rnorm(n = 50, mean = 0, sd = 5)

# Response surface: intercept 20 plus the squared predictors plus noise.
response <- 20 + x1^2 + x2^2 + e
# Collect the simulated columns into a tibble. The columns are passed to
# tibble() directly instead of going through cbind(), which first builds a
# matrix and would therefore force every column to a single common type.
rfdata <- tibble(x1 = x1, x2 = x2, response = response)

# Scatter plot of the two predictors, with points coloured by the response.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2)
# Regression tree for response on x1 and x2, limited to maxdepth = 1.
# The seed is reset before fitting, presumably so that the cross-validated
# xerror/xstd columns in the summary are reproducible -- confirm against the
# rpart documentation.
set.seed(47)
rfmodel1 <- rpart(
  formula = response ~ x1 + x2,
  data = rfdata,
  control = rpart.control(maxdepth = 1)
)

# Print the CP table, variable importance and per-node split details.
summary(rfmodel1)
## Call:
## rpart(formula = response ~ x1 + x2, data = rfdata, control = rpart.control(maxdepth = 1))
## n= 50
##
## CP nsplit rel error xerror xstd
## 1 0.5177082 0 1.0000000 1.0346449 0.18884709
## 2 0.0100000 1 0.4822918 0.5233518 0.08016401
##
## Variable importance
## x1
## 100
##
## Node number 1: 50 observations, complexity param=0.5177082
## mean=88.76254, MSE=1705.718
## left son=2 (39 obs) right son=3 (11 obs)
## Primary splits:
## x1 < 7.551607 to the left, improve=0.5177082, (0 missing)
## x2 < -8.102921 to the right, improve=0.3737130, (0 missing)
##
## Node number 2: 39 observations
## mean=72.98061, MSE=806.5541
##
## Node number 3: 11 observations
## mean=144.7167, MSE=879.7351
# Draw the fitted depth-1 tree.
rpart.plot(rfmodel1)

# Overlay the root split on the scatter plot. 7.6 is the rounded threshold of
# the first split (x1 < 7.551607 in the summary output).
# The intercept is passed as a fixed parameter rather than inside aes():
# a constant mapped through aes() is recycled over every row of rfdata, so
# the same line would be drawn once per observation.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2) +
  geom_vline(xintercept = 7.6)
# Regression tree for response on x1 and x2, limited to maxdepth = 2.
# The seed is reset before fitting, presumably so that the cross-validated
# xerror/xstd columns in the summary are reproducible -- confirm against the
# rpart documentation.
set.seed(47)
rfmodel2 <- rpart(
  formula = response ~ x1 + x2,
  data = rfdata,
  control = rpart.control(maxdepth = 2)
)

# Print the CP table, variable importance and per-node split details.
summary(rfmodel2)
## Call:
## rpart(formula = response ~ x1 + x2, data = rfdata, control = rpart.control(maxdepth = 2))
## n= 50
##
## CP nsplit rel error xerror xstd
## 1 0.5177082 0 1.0000000 1.0346449 0.18884709
## 2 0.1621953 1 0.4822918 0.5233518 0.08016401
## 3 0.0100000 2 0.3200965 0.4045381 0.05519120
##
## Variable importance
## x1 x2
## 76 24
##
## Node number 1: 50 observations, complexity param=0.5177082
## mean=88.76254, MSE=1705.718
## left son=2 (39 obs) right son=3 (11 obs)
## Primary splits:
## x1 < 7.551607 to the left, improve=0.5177082, (0 missing)
## x2 < -8.102921 to the right, improve=0.3737130, (0 missing)
##
## Node number 2: 39 observations, complexity param=0.1621953
## mean=72.98061, MSE=806.5541
## left son=4 (32 obs) right son=5 (7 obs)
## Primary splits:
## x2 < -7.067915 to the right, improve=0.4397617, (0 missing)
## x1 < 3.696997 to the left, improve=0.2901931, (0 missing)
##
## Node number 3: 11 observations
## mean=144.7167, MSE=879.7351
##
## Node number 4: 32 observations
## mean=64.17216, MSE=470.8777
##
## Node number 5: 7 observations
## mean=113.2478, MSE=364.936
# Draw the fitted depth-2 tree.
rpart.plot(rfmodel2)

# Overlay the two splits on the scatter plot, using thresholds rounded from
# the summary output:
#   * x1 < 7.551607  -> vertical line at x1 = 7.6
#   * x2 < -7.067915 -> horizontal segment at x2 = -7.1; it applies only to
#     the x1 < 7.6 side, so it stops at the vertical line.
# Both are drawn as single fixed annotations. A constant placed in aes(), or
# constant positions given to geom_segment() on a layer that inherits rfdata,
# would be repeated for every row and drawn once per observation.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2) +
  geom_vline(xintercept = 7.6) +
  annotate("segment", x = 0, y = -7.1, xend = 7.6, yend = -7.1)
# Regression tree for response on x1 and x2, limited to maxdepth = 3.
# The seed is reset before fitting, presumably so that the cross-validated
# xerror/xstd columns in the summary are reproducible -- confirm against the
# rpart documentation.
set.seed(47)
rfmodel3 <- rpart(
  formula = response ~ x1 + x2,
  data = rfdata,
  control = rpart.control(maxdepth = 3)
)

# Print the CP table, variable importance and per-node split details.
summary(rfmodel3)
## Call:
## rpart(formula = response ~ x1 + x2, data = rfdata, control = rpart.control(maxdepth = 3))
## n= 50
##
## CP nsplit rel error xerror xstd
## 1 0.51770823 0 1.0000000 1.0346449 0.18884709
## 2 0.16219526 1 0.4822918 0.5233518 0.08016401
## 3 0.09159405 2 0.3200965 0.4838951 0.07945320
## 4 0.01000000 3 0.2285025 0.3265122 0.04880900
##
## Variable importance
## x1 x2
## 77 23
##
## Node number 1: 50 observations, complexity param=0.5177082
## mean=88.76254, MSE=1705.718
## left son=2 (39 obs) right son=3 (11 obs)
## Primary splits:
## x1 < 7.551607 to the left, improve=0.5177082, (0 missing)
## x2 < -8.102921 to the right, improve=0.3737130, (0 missing)
##
## Node number 2: 39 observations, complexity param=0.1621953
## mean=72.98061, MSE=806.5541
## left son=4 (32 obs) right son=5 (7 obs)
## Primary splits:
## x2 < -7.067915 to the right, improve=0.4397617, (0 missing)
## x1 < 3.696997 to the left, improve=0.2901931, (0 missing)
##
## Node number 3: 11 observations
## mean=144.7167, MSE=879.7351
##
## Node number 4: 32 observations, complexity param=0.09159405
## mean=64.17216, MSE=470.8777
## left son=8 (23 obs) right son=9 (9 obs)
## Primary splits:
## x1 < 5.50053 to the left, improve=0.5184256, (0 missing)
## x2 < 5.203514 to the left, improve=0.2453159, (0 missing)
## Surrogate splits:
## x2 < 6.625161 to the left, agree=0.781, adj=0.222, (0 split)
##
## Node number 5: 7 observations
## mean=113.2478, MSE=364.936
##
## Node number 8: 23 observations
## mean=54.39855, MSE=237.4702
##
## Node number 9: 9 observations
## mean=89.14914, MSE=199.399
# Draw the fitted depth-3 tree.
rpart.plot(rfmodel3)

# Overlay the three splits on the scatter plot, using thresholds rounded from
# the summary output:
#   * x1 < 7.551607  -> vertical line at x1 = 7.6
#   * x1 < 5.50053   -> vertical segment at x1 = 5.5, only above x2 = -7.1
#     (yend = 11 lies just beyond the simulated x2 range of [-10, 10])
#   * x2 < -7.067915 -> horizontal segment at x2 = -7.1, left of x1 = 7.6
# Each boundary is drawn as a single fixed annotation. A constant placed in
# aes(), or constant positions given to geom_segment() on a layer that
# inherits rfdata, would be repeated for every row and drawn once per
# observation.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2) +
  geom_vline(xintercept = 7.6) +
  annotate("segment", x = 5.5, y = -7.1, xend = 5.5, yend = 11) +
  annotate("segment", x = 0, y = -7.1, xend = 7.6, yend = -7.1)
# Regression tree for response on x1 and x2, limited to maxdepth = 4.
# The seed is reset before fitting, presumably so that the cross-validated
# xerror/xstd columns in the summary are reproducible -- confirm against the
# rpart documentation.
set.seed(47)
rfmodel4 <- rpart(
  formula = response ~ x1 + x2,
  data = rfdata,
  control = rpart.control(maxdepth = 4)
)

# Print the CP table, variable importance and per-node split details.
summary(rfmodel4)
## Call:
## rpart(formula = response ~ x1 + x2, data = rfdata, control = rpart.control(maxdepth = 4))
## n= 50
##
## CP nsplit rel error xerror xstd
## 1 0.51770823 0 1.0000000 1.0346449 0.18884709
## 2 0.16219526 1 0.4822918 0.5233518 0.08016401
## 3 0.09159405 2 0.3200965 0.4838951 0.07945320
## 4 0.02354361 3 0.2285025 0.3265122 0.04880900
## 5 0.01000000 4 0.2049589 0.3185294 0.05248709
##
## Variable importance
## x1 x2
## 75 25
##
## Node number 1: 50 observations, complexity param=0.5177082
## mean=88.76254, MSE=1705.718
## left son=2 (39 obs) right son=3 (11 obs)
## Primary splits:
## x1 < 7.551607 to the left, improve=0.5177082, (0 missing)
## x2 < -8.102921 to the right, improve=0.3737130, (0 missing)
##
## Node number 2: 39 observations, complexity param=0.1621953
## mean=72.98061, MSE=806.5541
## left son=4 (32 obs) right son=5 (7 obs)
## Primary splits:
## x2 < -7.067915 to the right, improve=0.4397617, (0 missing)
## x1 < 3.696997 to the left, improve=0.2901931, (0 missing)
##
## Node number 3: 11 observations
## mean=144.7167, MSE=879.7351
##
## Node number 4: 32 observations, complexity param=0.09159405
## mean=64.17216, MSE=470.8777
## left son=8 (23 obs) right son=9 (9 obs)
## Primary splits:
## x1 < 5.50053 to the left, improve=0.5184256, (0 missing)
## x2 < 5.203514 to the left, improve=0.2453159, (0 missing)
## Surrogate splits:
## x2 < 6.625161 to the left, agree=0.781, adj=0.222, (0 split)
##
## Node number 5: 7 observations
## mean=113.2478, MSE=364.936
##
## Node number 8: 23 observations, complexity param=0.02354361
## mean=54.39855, MSE=237.4702
## left son=16 (15 obs) right son=17 (8 obs)
## Primary splits:
## x2 < 4.075384 to the left, improve=0.36763210, (0 missing)
## x1 < 3.705048 to the left, improve=0.08479596, (0 missing)
##
## Node number 9: 9 observations
## mean=89.14914, MSE=199.399
##
## Node number 16: 15 observations
## mean=47.575, MSE=200.7863
##
## Node number 17: 8 observations
## mean=67.19272, MSE=55.26009
# Draw the fitted depth-4 tree.
rpart.plot(rfmodel4)

# Overlay the four splits on the scatter plot, using thresholds rounded from
# the summary output:
#   * x1 < 7.551607  -> vertical line at x1 = 7.6
#   * x1 < 5.50053   -> vertical segment at x1 = 5.5, only above x2 = -7.1
#     (yend = 11 lies just beyond the simulated x2 range of [-10, 10])
#   * x2 < -7.067915 -> horizontal segment at x2 = -7.1, left of x1 = 7.6
#   * x2 < 4.075384  -> horizontal segment at x2 = 4.1, left of x1 = 5.5
# Each boundary is drawn as a single fixed annotation. A constant placed in
# aes(), or constant positions given to geom_segment() on a layer that
# inherits rfdata, would be repeated for every row and drawn once per
# observation.
ggplot(rfdata, aes(x = x1, y = x2)) +
  geom_point(aes(color = response), size = 2) +
  geom_vline(xintercept = 7.6) +
  annotate("segment", x = 5.5, y = -7.1, xend = 5.5, yend = 11) +
  annotate("segment", x = 0, y = -7.1, xend = 7.6, yend = -7.1) +
  annotate("segment", x = 0, y = 4.1, xend = 5.5, yend = 4.1)