daisy()
with the metric = c("gower")
from the cluster
package.
#----- -----# # , , , , library(dplyr) # set.seed(40) # # ; data.frame() # , 200 1 200 id.s <- c(1:200) %>% factor() budget.s <- sample(c("small", "med", "large"), 200, replace = T) %>% factor(levels=c("small", "med", "large"), ordered = TRUE) origins.s <- sample(c("x", "y", "z"), 200, replace = T, prob = c(0.7, 0.15, 0.15)) area.s <- sample(c("area1", "area2", "area3", "area4"), 200, replace = T, prob = c(0.3, 0.1, 0.5, 0.2)) source.s <- sample(c("facebook", "email", "link", "app"), 200, replace = T, prob = c(0.1,0.2, 0.3, 0.4)) ## — dow.s <- sample(c("mon", "tue", "wed", "thu", "fri", "sat", "sun"), 200, replace = T, prob = c(0.1, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2)) %>% factor(levels=c("mon", "tue", "wed", "thu", "fri", "sat", "sun"), ordered = TRUE) # dish.s <- sample(c("delicious", "the one you don't like", "pizza"), 200, replace = T) # data.frame() synthetic.customers <- data.frame(id.s, budget.s, origins.s, area.s, source.s, dow.s, dish.s) #----- -----# library(cluster) # # : daisy(), diana(), clusplot() gower.dist <- daisy(synthetic.customers[ ,2:7], metric = c("gower")) # class(gower.dist) ## ,
n
clusters, where n
is the number of observations: it is assumed that each of them is a separate cluster. Then the algorithm tries to find and group the most similar data points among themselves - this is how cluster formation begins.
# # — — #------------ ------------# divisive.clust <- diana(as.matrix(gower.dist), diss = TRUE, keep.diss = TRUE) plot(divisive.clust, main = "Divisive")
#------------ ------------# # # — , # (complete linkages) aggl.clust.c <- hclust(gower.dist, method = "complete") plot(aggl.clust.c, main = "Agglomerative, complete linkages")
# , # , , — # , , , , library(fpc) cstats.table <- function(dist, tree, k) { clust.assess <- c("cluster.number","n","within.cluster.ss","average.within","average.between", "wb.ratio","dunn2","avg.silwidth") clust.size <- c("cluster.size") stats.names <- c() row.clust <- c() output.stats <- matrix(ncol = k, nrow = length(clust.assess)) cluster.sizes <- matrix(ncol = k, nrow = k) for(i in c(1:k)){ row.clust[i] <- paste("Cluster-", i, " size") } for(i in c(2:k)){ stats.names[i] <- paste("Test", i-1) for(j in seq_along(clust.assess)){ output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j] } for(d in 1:k) { cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d] dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1) cluster.sizes[d, i] } } output.stats.df <- data.frame(output.stats) cluster.sizes <- data.frame(cluster.sizes) cluster.sizes[is.na(cluster.sizes)] <- 0 rows.all <- c(clust.assess, row.clust) # rownames(output.stats.df) <- clust.assess output <- rbind(output.stats.df, cluster.sizes)[ ,-1] colnames(output) <- stats.names[2:k] rownames(output) <- rows.all is.num <- sapply(output, is.numeric) output[is.num] <- lapply(output[is.num], round, 2) output } # : 7 # , stats.df.divisive <- cstats.table(gower.dist, divisive.clust, 7) stats.df.divisive
stats.df.aggl <-cstats.table(gower.dist, aggl.clust.c, 7) #
stats.df.aggl
# --------- ---------# # «» # , 7 library(ggplot2) # # ggplot(data = data.frame(t(cstats.table(gower.dist, divisive.clust, 15))), aes(x=cluster.number, y=within.cluster.ss)) + geom_point()+ geom_line()+ ggtitle("Divisive clustering") + labs(x = "Num.of clusters", y = "Within clusters sum of squares (SS)") + theme(plot.title = element_text(hjust = 0.5))
# ggplot(data = data.frame(t(cstats.table(gower.dist, aggl.clust.c, 15))), aes(x=cluster.number, y=within.cluster.ss)) + geom_point()+ geom_line()+ ggtitle("Agglomerative clustering") + labs(x = "Num.of clusters", y = "Within clusters sum of squares (SS)") + theme(plot.title = element_text(hjust = 0.5))
# ggplot(data = data.frame(t(cstats.table(gower.dist, divisive.clust, 15))), aes(x=cluster.number, y=avg.silwidth)) + geom_point()+ geom_line()+ ggtitle("Divisive clustering") + labs(x = "Num.of clusters", y = "Average silhouette width") + theme(plot.title = element_text(hjust = 0.5))
ggplot(data = data.frame(t(cstats.table(gower.dist, aggl.clust.c, 15))), aes(x=cluster.number, y=avg.silwidth)) + geom_point()+ geom_line()+ ggtitle("Agglomerative clustering") + labs(x = "Num.of clusters", y = "Average silhouette width") + theme(plot.title = element_text(hjust = 0.5))
k=30
I got the following graph:
library("ggplot2") library("reshape2") library("purrr") library("dplyr") # library("dendextend") dendro <- as.dendrogram(aggl.clust.c) dendro.col <- dendro %>% set("branches_k_color", k = 7, value = c("darkslategray", "darkslategray4", "darkslategray3", "gold3", "darkcyan", "cyan3", "gold3")) %>% set("branches_lwd", 0.6) %>% set("labels_colors", value = c("darkslategray")) %>% set("labels_cex", 0.5) ggd1 <- as.ggdend(dendro.col) ggplot(ggd1, theme = theme_minimal()) + labs(x = "Num. observations", y = "Height", title = "Dendrogram, k = 7")
# ( ) ggplot(ggd1, labels = T) + scale_y_reverse(expand = c(0.2, 0)) + coord_polar(theta="x")
# — # — # , clust.num <- cutree(aggl.clust.c, k = 7) synthetic.customers.cl <- cbind(synthetic.customers, clust.num) cust.long <- melt(data.frame(lapply(synthetic.customers.cl, as.character), stringsAsFactors=FALSE), id = c("id.s", "clust.num"), factorsAsStrings=T) cust.long.q <- cust.long %>% group_by(clust.num, variable, value) %>% mutate(count = n_distinct(id.s)) %>% distinct(clust.num, variable, value, count) # heatmap.c , — , , heatmap.c <- ggplot(cust.long.q, aes(x = clust.num, y = factor(value, levels = c("x","y","z", "mon", "tue", "wed", "thu", "fri","sat","sun", "delicious", "the one you don't like", "pizza", "facebook", "email", "link", "app", "area1", "area2", "area3", "area4", "small", "med", "large"), ordered = T))) + geom_tile(aes(fill = count))+ scale_fill_gradient2(low = "darkslategray1", mid = "yellow", high = "turquoise4") # cust.long.p <- cust.long.q %>% group_by(clust.num, variable) %>% mutate(perc = count / sum(count)) %>% arrange(clust.num) heatmap.p <- ggplot(cust.long.p, aes(x = clust.num, y = factor(value, levels = c("x","y","z", "mon", "tue", "wed", "thu", "fri","sat", "sun", "delicious", "the one you don't like", "pizza", "facebook", "email", "link", "app", "area1", "area2", "area3", "area4", "small", "med", "large"), ordered = T))) + geom_tile(aes(fill = perc), alpha = 0.85)+ labs(title = "Distribution of characteristics across clusters", x = "Cluster number", y = NULL) + geom_hline(yintercept = 3.5) + geom_hline(yintercept = 10.5) + geom_hline(yintercept = 13.5) + geom_hline(yintercept = 17.5) + geom_hline(yintercept = 21.5) + scale_fill_gradient2(low = "darkslategray1", mid = "yellow", high = "turquoise4") heatmap.p
Source: https://habr.com/ru/post/461741/