import pandas as pd
# Load the regional births table from the Excel workbook into a DataFrame.
naissances = pd.read_excel("naissances_regions.xls")
# Bare expression: shows the whole table as the cell output.
naissances
| | id_region | region | naissances | prematures | moinsde25 | pourcentage_prematures | pourcentage_moinsde25 |
---|---|---|---|---|---|---|---|
0 | 2 | Boucle du Mouhoun | 69030 | 1325 | 6449 | 1.919455 | 9.342315 |
1 | 3 | Cascades | 26235 | 693 | 3074 | 2.641509 | 11.717172 |
2 | 1 | Centre | 63706 | 1630 | 7849 | 2.558629 | 12.320661 |
3 | 4 | Centre est | 52247 | 1041 | 5056 | 1.992459 | 9.677111 |
4 | 5 | Centre-Nord | 50927 | 899 | 4914 | 1.765272 | 9.649106 |
5 | 6 | Centre-Ouest | 50657 | 1129 | 5856 | 2.228715 | 11.560100 |
6 | 7 | Centre-Sud | 24780 | 408 | 2354 | 1.646489 | 9.499596 |
7 | 8 | Est | 58496 | 1047 | 5208 | 1.789866 | 8.903173 |
8 | 9 | Hauts-Bassins | 70738 | 6189 | 6839 | 8.749187 | 9.668071 |
9 | 10 | Nord | 52753 | 880 | 4473 | 1.668152 | 8.479139 |
10 | 11 | Plateau Central | 32410 | 4750 | 2626 | 14.655970 | 8.102438 |
11 | 12 | Sahel | 35469 | 736 | 3521 | 2.075051 | 9.926978 |
12 | 13 | Sud-Ouest | 22887 | 412 | 2181 | 1.800149 | 9.529427 |
# Preview the first rows of the table.
naissances.head()
| | id_region | region | naissances | prematures | moinsde25 | pourcentage_prematures | pourcentage_moinsde25 |
---|---|---|---|---|---|---|---|
0 | 2 | Boucle du Mouhoun | 69030 | 1325 | 6449 | 1.919455 | 9.342315 |
1 | 3 | Cascades | 26235 | 693 | 3074 | 2.641509 | 11.717172 |
2 | 1 | Centre | 63706 | 1630 | 7849 | 2.558629 | 12.320661 |
3 | 4 | Centre est | 52247 | 1041 | 5056 | 1.992459 | 9.677111 |
4 | 5 | Centre-Nord | 50927 | 899 | 4914 | 1.765272 | 9.649106 |
# (number of rows, number of columns) of the table.
naissances.shape
(13, 7)
Le dataframe comporte 13 individus (les régions) et 7 variables.
# Feature table: keep only the three count columns used by the PCA and the
# clustering steps further down.
X = naissances[["naissances", "prematures", "moinsde25"]]
X.head()
| | naissances | prematures | moinsde25 |
---|---|---|---|
0 | 69030 | 1325 | 6449 |
1 | 26235 | 693 | 3074 |
2 | 63706 | 1630 | 7849 |
3 | 52247 | 1041 | 5056 |
4 | 50927 | 899 | 4914 |
# Region names as a plain Python list, in the same row order as X; used to
# label points and dendrogram leaves.
regions = list(naissances["region"])
regions
['Boucle du Mouhoun', 'Cascades', 'Centre', 'Centre est', 'Centre-Nord', 'Centre-Ouest', 'Centre-Sud', 'Est', 'Hauts-Bassins', 'Nord', 'Plateau Central', 'Sahel', 'Sud-Ouest']
from sklearn.decomposition import PCA
# Instantiation: keep the first two principal components.
pca = PCA(n_components=2)
# Fit the PCA on X and project the individuals (regions) onto those components.
# NOTE(review): X is passed as raw counts without standardization, so the
# column with the largest variance may dominate the axes — confirm this is intended.
components = pca.fit_transform(X)
# Check the container type returned by fit_transform.
type(components)
numpy.ndarray
import matplotlib.pyplot as plt
# Scatter the regions on the first two principal components and label each point.
fig = plt.figure(figsize=(15, 15))
plt.scatter(components[:, 0], components[:, 1])
# Walk coordinates and names in lockstep; the annotate call must be indented
# under the loop so that every region gets its label.
for x, y, region in zip(components[:, 0], components[:, 1], regions):
    # Shift the text 10 points above the marker so it does not cover it.
    plt.annotate(region, (x, y), textcoords="offset points", xytext=(0, 10))
plt.grid()
Il semble y avoir des groupes qui se forment : hypothèse à confirmer par une classification.
from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative clustering of the regions: average linkage on Euclidean
# distances computed from the feature table X.
Z = linkage(X, metric="euclidean", method="average")
# Horizontal dendrogram with one leaf per region name.
dendrogram(Z, orientation="right", labels=regions)
plt.show()
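Si on se fie à la coloration du dendrogramme, on devrait retenir deux groupes ; la suite de l'analyse en retient néanmoins trois (`t=3` pour la CAH, `n_clusters=3` pour les k-means).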
from scipy.cluster.hierarchy import fcluster
# Cut the hierarchy Z into flat clusters, asking for at most 3 of them
# (criterion='maxclust' with t=3).
groupes_cah = fcluster(Z, t=3, criterion='maxclust')
groupes_cah
array([3, 1, 3, 2, 2, 2, 1, 3, 3, 2, 1, 1, 1], dtype=int32)
from sklearn.cluster import KMeans
# Instantiation with a fixed seed: k-means starts from randomly chosen
# centroids, so without random_state the label numbering (and possibly the
# partition itself) can change from one run to the next. n_init is spelled out
# because its default value is not the same in every scikit-learn release.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
# Fit on the same feature table that was given to the hierarchical clustering.
kmeans.fit(X)
# Results: cluster index assigned to each row of X.
groupes_km = kmeans.labels_
groupes_km
array([2, 0, 2, 1, 1, 1, 0, 1, 2, 1, 0, 0, 0])
from sklearn.metrics.cluster import adjusted_rand_score
# Agreement between the hierarchical and the k-means partitions. The adjusted
# Rand index compares groupings, so it is unaffected by how each method
# happens to number its clusters.
adjusted_rand_score(groupes_cah, groupes_km)
0.7814251401120896
import matplotlib.pyplot as plt
# Same PCA scatter as before, with each point coloured by its hierarchical
# clustering group.
fig = plt.figure(figsize=(15, 15))
plt.scatter(components[:, 0], components[:, 1], c=groupes_cah)
# Walk coordinates and names in lockstep; the annotate call must be indented
# under the loop so that every region gets its label.
for x, y, region in zip(components[:, 0], components[:, 1], regions):
    # Shift the text 10 points above the marker so it does not cover it.
    plt.annotate(region, (x, y), textcoords="offset points", xytext=(0, 10))
plt.grid()
import matplotlib.pyplot as plt
# Same PCA scatter as before, with each point coloured by its k-means group.
fig = plt.figure(figsize=(15, 15))
plt.scatter(components[:, 0], components[:, 1], c=groupes_km)
# Walk coordinates and names in lockstep; the annotate call must be indented
# under the loop so that every region gets its label.
for x, y, region in zip(components[:, 0], components[:, 1], regions):
    # Shift the text 10 points above the marker so it does not cover it.
    plt.annotate(region, (x, y), textcoords="offset points", xytext=(0, 10))
plt.grid()