# Load the Pokemon dataset, then show its dimensions and its first rows.
data <- read.csv(file = "../data/Pokemon.csv")
dim(data)
head(data)

# Count the distinct values of the `number` column, then list the generations
# that occur (unique(data$generation) would list them as well).
length(unique(data$number))
levels(as.factor(data$generation))

# Find the numbers that appear on more than one row.
# duplicated() flags every repeat of an already-seen value in a single pass,
# so the column is no longer rescanned once per distinct number (the previous
# lapply() + sum(data$number == n) approach was quadratic and stopped with an
# error as soon as the column held an NA).
# The result keeps the order of first appearance; when nothing is duplicated it
# is an empty vector rather than NULL.
ids <- unique(data$number)
dbl <- ids[ids %in% data$number[duplicated(data$number)]]
dbl

# Look at the rows behind three of the repeated numbers.
data[data$number == 27, ]
data[data$number == 445, ]
data[data$number == 844, ]

# Rows flagged as legendary: how many there are (wow, 118 of them after
# all...), followed by the first few.
legendary_rows <- data[data$legendary, ]
dim(legendary_rows)
head(legendary_rows)

# Are legendary Pokemon more powerful?
# Distribution of the legendary Pokemon within the list sorted by decreasing
# power: order the rows by decreasing `total`, then histogram the ranks
# (1 = highest total) at which a legendary row occurs.
options(repr.plot.width = 15, repr.plot.height = 10)
legendary_by_power <- data[order(data$total, decreasing = TRUE), "legendary"]
# which() yields the positions of the TRUE entries directly, replacing the
# (1:nrow(data))[mask] construct, which misbehaves on an empty table.
hist(which(legendary_by_power), xlab = "Rang", ylab = "Compte", main = "")
# Conclusion: yes, on the whole.

# Keep only the six base statistics as clustering features.
stat_cols <- c("hp", "attack", "defense", "sp_attack", "sp_defense", "speed")
data_clust <- data[, stat_cols]
rownames(data_clust) <- data$name  # useful for the plots

# Hierarchical clustering on the pairwise distances between rows.
distances <- dist(data_clust)
# Ward linkage: usually gives good results.
h <- hclust(distances, method = "ward.D")
# Label the leaves with the first 5 characters of each name only (the full
# names make the dendrogram very ugly).
short_names <- substr(data$name, 1, 5)
plot(h, labels = short_names)

library(factoextra)
# Cut the dendrogram into 2 groups and display the resulting partition.
cl2_h <- cutree(h, k = 2)
partition2 <- list(data = data_clust, cluster = cl2_h)
fviz_cluster(partition2)

# Output printed when factoextra is loaded:
# Loading required package: ggplot2
#
# Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Compare the `total` distribution of the two hierarchical groups side by side.
par(mfrow = c(1, 2))
totals_group1 <- data[cl2_h == 1, "total"]
totals_group2 <- data[cl2_h == 2, "total"]
hist(totals_group1, main = "Groupe 1 à gauche")
hist(totals_group2, main = "Groupe 2 à droite")

# Same dendrogram, now cut into 3 groups.
cl3_h <- cutree(h, k = 3)
fviz_cluster(list(data = data_clust, cluster = cl3_h))

# Show the rows of two specific Pokemon by name.
# %in% never returns NA, so the logical mask can index the rows directly.
data[data$name %in% c("Shuckle", "Deoxys Attack Forme"), ]

library(cluster)
# Partition into 2 groups with PAM, then with kmeans, plotting each result.
cl2_p <- pam(data_clust, k = 2)
fviz_cluster(cl2_p)
cl2_k <- kmeans(data_clust, centers = 2)
fviz_cluster(cl2_k, data = data_clust)

# Same comparison with 3 groups.
cl3_p <- pam(data_clust, k = 3)
fviz_cluster(cl3_p)
cl3_k <- kmeans(data_clust, centers = 3)
fviz_cluster(cl3_k, data = data_clust)

# Histograms of the 3-group labels: PAM on the left, kmeans on the right.
par(mfrow = c(1, 2))
pam_labels <- cl3_p$clustering
kmeans_labels <- cl3_k$cluster
hist(pam_labels)
hist(kmeans_labels)

# First, raise nstart to improve the kmeans result:
cl2_k <- kmeans(data_clust, centers = 2, nstart = 10)
cl3_k <- kmeans(data_clust, centers = 3, nstart = 10)

# As for PAM, its help page (?pam) states
# "By default, when ‘medoids’ are not specified, the algorithm first looks for a good initial set of medoids".
# So let us trust it.

library(clusterCrit)
int_crit <- "Dunn"

# intCriteria() requires a matrix of reals as input (not very flexible...).
# Convert the feature data frame to a double matrix directly instead of
# running apply() over a data frame.
mdata <- as.matrix(data_clust, rownames.force = FALSE)
storage.mode(mdata) <- "double"

# Partitions in column-major order: (kmeans, pam, hclust) for 2 groups, then
# the same three methods for 3 groups.
partitions <- list(cl2_k$cluster, cl2_p$clustering, cl2_h,
                   cl3_k$cluster, cl3_p$clustering, cl3_h)
# intCriteria() returns a list; extract its single value so that the result is
# a plain numeric matrix rather than a matrix of lists.
intcrit_data <- matrix(
  vapply(partitions,
         function(p) intCriteria(mdata, p, int_crit)[[1]],
         numeric(1)),
  nrow = 3, ncol = 2,
  dimnames = list(c("kmeans", "pam", "hclust"), c("2", "3")))
# beside = TRUE draws one bar per method; stacked bars would add up index
# values of different methods, which has no meaning. The column names label
# the two bar groups with their number of clusters.
barplot(intcrit_data, beside = TRUE, main = "Index values",
        xlab = "Number of clusters",
        col = c("darkblue", "red", "darkgreen"),
        legend.text = rownames(intcrit_data))

ext_crit <- "Rand"

# Pairs of partitions to compare, in column-major order: the three pairwise
# comparisons for 2 groups, then the same three for 3 groups.
pairs <- list(
  list(cl2_k$cluster, cl2_p$clustering),
  list(cl2_p$clustering, cl2_h),
  list(cl2_h, cl2_k$cluster),
  list(cl3_k$cluster, cl3_p$clustering),
  list(cl3_p$clustering, cl3_h),
  list(cl3_h, cl3_k$cluster))
# extCriteria() returns a list; extract its single value so that the result is
# a plain numeric matrix rather than a matrix of lists.
extcrit_data <- matrix(
  vapply(pairs,
         function(p) extCriteria(p[[1]], p[[2]], ext_crit)[[1]],
         numeric(1)),
  nrow = 3, ncol = 2,
  dimnames = list(c("kmeans vs pam", "pam vs hclust", "hclust vs kmeans"),
                  c("2", "3")))
# Each bar measures the agreement between two methods, so the legend names the
# pair being compared rather than a single method. beside = TRUE keeps the
# three comparisons as separate bars instead of stacking them.
barplot(extcrit_data, beside = TRUE, main = "Index values",
        xlab = "Number of clusters",
        col = c("darkblue", "red", "darkgreen"),
        legend.text = rownames(extcrit_data))

# Download four artificial 2-D benchmark datasets (ARFF files). `skip` jumps
# over each file's header lines; the third column holds the reference labels
# and is split off into targetN, leaving the two coordinates in dataN.
benchmark_url <- "https://raw.githubusercontent.com/deric/clustering-benchmark/master/src/main/resources/datasets/artificial/"

# stringsAsFactors = FALSE keeps the label column as character on every R
# version; with a factor column the "noise" -> "9" replacement below would
# produce NA and as.integer() would return level codes instead of labels.
data1 <- read.table(paste0(benchmark_url, "cluto-t7-10k.arff"),
                    skip = 13, sep = ",", stringsAsFactors = FALSE)
target1 <- data1[, 3]  # 0, 1, ..., 8, noise
target1[target1 == "noise"] <- "9"  # an integer vector is preferred
target1 <- as.integer(target1)
data1 <- data1[, -3]

data2 <- read.table(paste0(benchmark_url, "3-spiral.arff"),
                    skip = 12, sep = ",")
target2 <- data2[, 3]  # 1, 2, 3
data2 <- data2[, -3]

data3 <- read.table(paste0(benchmark_url, "diamond9.arff"),
                    skip = 9, sep = ",")
target3 <- data3[, 3]  # 0, 1, ..., 8
data3 <- data3[, -3]

data4 <- read.table(paste0(benchmark_url, "target.arff"),
                    skip = 18, sep = ",")
target4 <- data4[, 3]  # 1, 2 (3, 4, 5, 6: isolated points)
data4 <- data4[, -3]

# Groupings to recover: colour every point by its reference label.
# Labels that start at 0 are shifted by one to index the colour vector.
par(mfrow = c(2, 2))
colors1 <- rainbow(10)[target1 + 1]
colors2 <- rainbow(3)[target2]
colors3 <- rainbow(9)[target3 + 1]
colors4 <- rev(rainbow(6))[target4]
plot(data1, col = colors1)
plot(data2, col = colors2)
plot(data3, col = colors3)
plot(data4, col = colors4)

# Here we go!
# Clustering helper: run kmeans with k centers and 10 random starts on `data`
# and return the cluster label of every row.
cl <- function(data, k) {
  fit <- kmeans(data, centers = k, nstart = 10)
  fit$cluster
}
# Cluster the four benchmark datasets (data1..data4, read from the calling
# environment) and draw them in a 2x2 grid, one colour per cluster.
#
# cluster_fn: function(data, k) returning one integer label in 1..k per row.
#   Defaults to whatever `cl` is currently defined as, so plotAll() keeps
#   working after `cl` is redefined.
# Returns NULL invisibly; called for its plots.
plotAll <- function(cluster_fn = cl) {
  # Restore the previous panel layout when the function exits.
  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)

  # rainbow(k) gives each of the k clusters its own colour. Plain integer
  # colours index the default 8-colour palette, so with 9 or 10 clusters the
  # last ones would reuse the colours of the first ones.
  plot_partition <- function(points, k) {
    plot(points, col = rainbow(k)[cluster_fn(points, k)])
  }

  plot_partition(data1, 10)
  plot_partition(data2, 3)
  plot_partition(data3, 9)
  plot_partition(data4, 6)
  invisible(NULL)
}
plotAll()

# Same plots with PAM as the clustering method.
cl <- function(data, k) {
  fit <- pam(data, k)
  fit$clustering
}
plotAll()  # computing time is already much (much) longer! ...as mentioned in class

# Hierarchical clustering with Ward linkage, cut into k groups.
cl <- function(data, k) {
  tree <- hclust(dist(data), method = "ward.D")
  cutree(tree, k)
}
plotAll()

# Hierarchical clustering with single linkage, cut into k groups.
cl <- function(data, k) {
  tree <- hclust(dist(data), method = "single")
  cutree(tree, k)
}
plotAll()

	number	name	type1	type2	total	hp	attack	defense	sp_attack	sp_defense	speed	generation	legendary
	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<lgl>
1	1	Bulbasaur	Grass	Poison	318	45	49	49	65	65	45	1	FALSE
2	2	Ivysaur	Grass	Poison	405	60	62	63	80	80	60	1	FALSE
3	3	Venusaur	Grass	Poison	525	80	82	83	100	100	80	1	FALSE
4	3	Mega Venusaur	Grass	Poison	625	80	100	123	122	120	80	1	FALSE
5	3	Gigantamax Venusaur	Grass	Poison	525	80	82	83	100	100	80	1	FALSE
6	4	Charmander	Fire		309	39	52	43	60	50	65	1	FALSE

	number	name	type1	type2	total	hp	attack	defense	sp_attack	sp_defense	speed	generation	legendary
	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<lgl>
41	27	Sandshrew	Ground		300	50	75	85	20	30	40	1	FALSE
42	27	Alolan Sandshrew	Ice	Steel	300	50	75	90	10	35	40	7	FALSE

	number	name	type1	type2	total	hp	attack	defense	sp_attack	sp_defense	speed	generation	legendary
	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<lgl>
539	445	Garchomp	Dragon	Ground	600	108	130	95	80	85	102	4	FALSE
540	445	Mega Garchomp	Dragon	Ground	700	108	170	115	120	95	92	4	FALSE

	number	name	type1	type2	total	hp	attack	defense	sp_attack	sp_defense	speed	generation	legendary
	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<lgl>
997	844	Sandaconda	Ground		510	72	107	125	65	70	71	8	FALSE
998	844	Gigantamax Sandaconda	Ground		510	72	107	125	65	70	71	8	FALSE

	number	name	type1	type2	total	hp	attack	defense	sp_attack	sp_defense	speed	generation	legendary
	<int>	<chr>	<chr>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<lgl>
195	144	Articuno	Ice	Flying	580	90	85	100	95	125	85	1	TRUE
196	144	Galarian Articuno	Psychic	Flying	580	90	85	85	125	100	95	8	TRUE
197	145	Zapdos	Electric	Flying	580	90	90	85	125	90	100	1	TRUE
198	145	Galarian Zapdos	Fighting	Flying	580	90	125	90	85	90	100	8	TRUE
199	146	Moltres	Fire	Flying	580	90	100	90	125	85	90	1	TRUE
200	146	Galarian Moltres	Dark	Flying	580	90	85	90	100	125	90	8	TRUE

Exercice 1

Clustering hiérarchique

PAM, kmeans

Stabilité

Exercice 2