El creador de este paquete fue Martin Borkovec. Niyaz Madin junto a él fueron los autores de el mismo. En adición, este paquete fue publicado en el 2019.
Ya hemos utilizado gráficos “geom_histogram”, “geom_boxplot” y “geom_violin”. Además, hemos utiizado los diagramas de “heatmap” para mostrar las relaciones entre las características numéricas.
Algunas veces sucede que la relación lineal general entre dos variables no es tan significativa hasta que encontramos la forma correcta de establecer una diferencia en nuestro “dataset”. Por consecuencia, la capacidad de otras características tiene el rol de modificar o controlar la relación entre nuestras variables deseadas.
El paquete ‘ggparty’ se introducé para ayudarnos con este problema. GGparty tiene como objetivo extender la funcionalidad de ggplot2 en el paquete partykit. Provee las herramientas necesarias para crear visualizaciones claramente estructuradas y altamente personalizables para modelos de regresión con estructura de árbol de la clase ‘party’.
library(ggparty)
library(ggplot2)
library(partykit)
library(grid)
library(libcoin)
library(mvtnorm)
library(dplyr)
library(party)
Para que los datos se representen como modelos de regresión con estructura de árbol se utilizan dos “basic building blocks”: “splits of class ‘partysplit’” y “nodes of class ‘partynode’”. Lo resultante se puede asocir a un objeto de clase ‘party’.
WeatherPlay
sp_o <- partysplit(1L, index = 1:3)
sp_h <- partysplit(3L, breaks = 75)
sp_w <- partysplit(4L, index = 1:2)
pn <- partynode(1L, split = sp_o, kids = list(
partynode(2L, split = sp_h, kids = list(
partynode(3L, info = "yes"),
partynode(4L, info = "no"))),
partynode(5L, info = "yes"),
partynode(6L, split = sp_w, kids = list(
partynode(7L, info = "yes"),
partynode(8L, info = "no")))))
py <- party(pn, WeatherPlay)
Al final, ‘party’ dividira los datos en la forma más básica de árbol
print(py)
plot(py)
geom_edge() dibuja los bordes entre los nodos
geom_edge_label() etiqueta los bordes con las divisiones correspondientes
geom_node_label() etiqueta los nodos con la variable dividida, información del nodo o cualquier otra cosa.
geom_node_plot() crea un ggplot personalizado en la ubicación del nodo
ggparty(py) +
geom_edge() +
geom_edge_label() +
geom_node_label(aes(label = splitvar),
ids = "inner") +
geom_node_label(aes(label = info),
ids = "terminal")
Cambiar el color y tamaño de los nodos
ggparty(py) +
geom_edge() +
geom_edge_label() +
# map color to level and size to nodesize for all nodes
geom_node_splitvar(aes(col = factor(level),
size = nodesize)) +
geom_node_info(aes(col = factor(level),
size = nodesize))
n1 <- partynode(id = 1L, split = sp_o, kids = lapply(2L:4L, partynode))
t2 <- party(n1,
data = WeatherPlay,
fitted = data.frame(
"(fitted)" = fitted_node(n1, data = WeatherPlay),
"(response)" = WeatherPlay$play,
check.names = FALSE),
terms = terms(play ~ ., data = WeatherPlay)
)
t2 <- as.constparty(t2)
t2
##
## Model formula:
## play ~ outlook + temperature + humidity + windy
##
## Fitted party:
## [1] root
## | [2] outlook in sunny: no (n = 5, err = 40.0%)
## | [3] outlook in overcast: yes (n = 4, err = 0.0%)
## | [4] outlook in rainy: yes (n = 5, err = 40.0%)
##
## Number of inner nodes: 1
## Number of terminal nodes: 3
Para visualizar la distribución de la variables usaremos la función geom_node_plot(). Nos permite mostrar los datos de cada nodo en su gráfica separada.
ggparty(t2) +
geom_edge() +
geom_edge_label() +
geom_node_splitvar() +
geom_node_plot(gglist = list(geom_bar(aes(x = "", fill = play),
position = position_fill()),
xlab("play")))
data("TeachingRatings", package = "AER")
head(TeachingRatings)
## minority age gender credits beauty eval division native tenure students
## 1 yes 36 female more 0.2899157 4.3 upper yes yes 24
## 2 no 59 male more -0.7377322 4.5 upper yes yes 17
## 3 no 51 male more -0.5719836 3.7 upper yes yes 55
## 4 no 40 female more -0.6779634 4.3 upper yes yes 40
## 5 no 31 female more 1.5097940 4.4 upper yes yes 42
## 6 no 62 male more 0.5885687 4.2 upper yes yes 182
## allstudents prof
## 1 43 1
## 2 20 2
## 3 55 3
## 4 46 4
## 5 48 5
## 6 282 6
TeachingRatings
tr <- subset(TeachingRatings, credits == "more")
tr_tree <- lmtree(eval ~ beauty | minority + age + gender + division + native +
tenure, data = tr, weights = students, caseweights = FALSE)
tr
Se utiliza “predict” para mostrar la “prediction line” o “trend line” de las gráficas. El “trend line” es una linea utilizada para mostrar el patrón o manifestación general de los valores.
ggparty(tr_tree) +
geom_edge() +
geom_edge_label() +
geom_node_splitvar() +
geom_node_plot(gglist = list(geom_point(aes(x = beauty,
y = eval,
col = tenure,
shape = minority),
alpha = 0.8),
theme_bw(base_size = 10)),
shared_axis_labels = TRUE,
legend_separator = TRUE,
# predict based on variable
predict = "beauty",
# graphical parameters for geom_line of predictions
predict_gpar = list(col = "blue",
size = 1.2)
)
Para hacer que nuestro “plot” más bonito, usamos el paquete de ggparty. Un patrón interesante ocurre en el nodo 8.
ggparty(tr_tree,
terminal_space = 0.3,
add_vars = list(p.value = "$node$info$p.value")) +
geom_edge(size = 1.5) +
geom_edge_label(colour = "grey", size = 3) +
geom_node_plot(gglist = list(geom_point(aes(x = beauty,
y = eval),
alpha = 0.5),
theme_bw(base_size = 5)),
scales = "fixed",
id = "terminal",
shared_axis_labels = T,
shared_legend = T,
legend_separator = T,
predict = "beauty",
predict_gpar = list(col = "blue",
size = 1.2)
) +
geom_node_label(aes(col = splitvar),
line_list = list(aes(label = paste("Node", id)),
aes(label = splitvar),
aes(label = paste("p =", formatC(p.value, format = "e", digits = 2)))),
line_gpar = list(list(size = 10, col = "black", fontface = "bold"),
list(size = 10),
list(size = 5)),
ids = "inner") +
geom_node_label(aes(label = paste0("Node ", id, ", N = ", nodesize)),
fontface = "bold",
ids = "terminal",
size = 2.5,
nudge_y = 0.02) +
theme(legend.position = "none")
data("CollegeDistance", package = "AER")
head(CollegeDistance)
cd <- subset(CollegeDistance)
cd_tree <- lmtree(score ~ education | income + gender +
urban, data = cd, weights = distance, caseweights = FALSE)
head(cd)
## gender ethnicity score fcollege mcollege home urban unemp wage distance
## 1 male other 39.15 yes no yes yes 6.2 8.09 0.2
## 2 female other 48.87 no no yes yes 6.2 8.09 0.2
## 3 male other 48.74 no no yes yes 6.2 8.09 0.2
## 4 male afam 40.40 no no yes yes 6.2 8.09 0.2
## 5 female other 40.48 no no no yes 5.6 8.09 0.4
## 6 male other 54.71 no no yes yes 5.6 8.09 0.4
## tuition education income region
## 1 0.88915 12 high other
## 2 0.88915 12 low other
## 3 0.88915 12 low other
## 4 0.88915 12 low other
## 5 0.88915 13 low other
## 6 0.88915 12 low other
ggparty(cd_tree) +
geom_edge() +
geom_edge_label() +
geom_node_splitvar() +
geom_node_plot(gglist = list(geom_point(aes(x = education,
y = score,
col = urban),
alpha = 0.8),
theme_bw(base_size = 10)),
shared_axis_labels = TRUE,
legend_separator = TRUE,
# predict based on variable
predict = "education",
# graphical parameters for geom_line of predictions
predict_gpar = list(col = "blue",
size = 1.2)
)
ggparty(cd_tree,
terminal_space = 0.3,
add_vars = list(p.value = "$node$info$p.value")) +
geom_edge(size = 1.5) +
geom_edge_label(colour = "mediumpurple2", size = 3) +
geom_node_plot(gglist = list(geom_point(aes(x = education,
y = score),
alpha = 0.5),
theme_bw(base_size = 5)),
scales = "fixed",
id = "terminal",
shared_axis_labels = T,
shared_legend = T,
legend_separator = T,
predict = "education",
predict_gpar = list(col = "deeppink3",
size = 1.2)
) +
geom_node_label(aes(col = splitvar),
line_list = list(aes(label = paste(id)),
aes(label = splitvar),
aes(label = paste("p =", formatC(p.value, format = "e", digits = 2)))),
line_gpar = list(list(size = 10, col = "black", fontface = "bold"),
list(size = 10),
list(size = 5)),
ids = "inner") +
geom_node_label(aes(label = paste0(id, ", N = ", nodesize)),
fontface = "bold",
ids = "terminal",
size = 2.5,
nudge_y = 0.02)+
theme(legend.position = "none")