01用户聚类分群

2022-12-11  本文已影响0人  Jachin111

导入库和数据

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
# Initialise plotly's offline notebook mode before any plotly figure is drawn
py.offline.init_notebook_mode(connected=True)
# Load the customer data set; the CSV is read relative to the working directory
df = pd.read_csv("Mall_Customers.csv")
# Preview the first rows of the data
df.head()
image.png

数据EDA

# Data exploration: dimensions of the data set as (rows, columns)
df.shape
image.png
# Number of missing values in each column
df.isnull().sum()
image.png
# Data type of each column
df.dtypes
image.png
# Descriptive statistics (count, mean, std, min, quartiles, max) of the data
df.describe()
image.png
# Use the "fivethirtyeight" matplotlib style for the plots that follow
plt.style.use("fivethirtyeight")

# Names of every column from the third one onward -- the fields analysed below
# (the original note says these are the 3 key fields)
cols = list(df.columns[2:])
cols
image.png

3个属性直方图

# Distribution of each analysed column: a 20-bin histogram with a KDE overlay,
# one panel per column in a single row.
plt.figure(1, figsize=(15, 6))
# Subplot spacing is a figure-level setting, so it is applied once up front
# rather than on every loop iteration.
plt.subplots_adjust(hspace=0.5, wspace=0.5)

for n, col in enumerate(cols, start=1):
    plt.subplot(1, 3, n)
    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is the maintained equivalent (density-normalised bars plus a KDE curve).
    sns.histplot(df[col], bins=20, kde=True, stat="density")
    plt.title(f'Distplot of {col}')
plt.show()
image.png

性别因素

# Number of customers per gender, drawn as horizontal bars
plt.figure(num=1, figsize=(12, 5))
sns.countplot(data=df, y="Gender")
plt.show()
image.png
# Pairwise plots of every column except the customer id, coloured by gender
features = df.drop(columns=["CustomerID"])
sns.pairplot(features, hue="Gender", aspect=1.5)
plt.show()
image.png
# Age versus annual income, drawn as one scatter series per gender
plt.figure(1, figsize=(15, 6))

for gender in ["Male", "Female"]:
    # x/y are column names resolved against the per-gender subset passed as data
    plt.scatter(x="Age", y="Annual Income (k$)",
                data=df[df["Gender"] == gender],
                s=200, alpha=0.5, label=gender)

plt.xlabel("Age")
plt.ylabel("Annual Income(k$)")
plt.title("Age vs Annual Income w.r.t Gender")
# Each series carries label=gender; without a legend the two colours have no key
plt.legend()
plt.show()
image.png
# Annual income versus spending score, drawn as one scatter series per gender
plt.figure(1, figsize=(15, 6))

for gender in ["Male", "Female"]:
    # x/y are column names resolved against the per-gender subset passed as data
    plt.scatter(x="Annual Income (k$)", y="Spending Score (1-100)",
                data=df[df["Gender"] == gender],
                s=200, alpha=0.5, label=gender)

plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.title("Annual Income vs Spending Score w.r.t Gender")
# Each series carries label=gender; without a legend the two colours have no key
plt.legend()
plt.show()
image.png
# Per-gender distribution of each analysed column: a violin plot with the
# individual observations overlaid as a swarm plot, one panel per column
plt.figure(num=1, figsize=(15, 7))

for n, col in enumerate(cols, start=1):
    plt.subplot(1, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.violinplot(data=df, x=col, y="Gender", palette="vlag")
    sns.swarmplot(data=df, x=col, y="Gender")

    # Only the first panel keeps its y-axis label
    if n == 1:
        plt.ylabel("Gender")
    else:
        plt.ylabel('')

    # Only the middle panel carries the shared title
    if n == 2:
        plt.title("Violinplots & Swarmplots")
    else:
        plt.title('')

plt.show()
image.png

属性相关性分析

# Regression plot for every ordered pair of the three numeric columns,
# laid out as a 3x3 grid (x varies by row, y by column)
cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

plt.figure(num=1, figsize=(15, 6))

for row, x in enumerate(cols):
    for column, y in enumerate(cols):
        # 1-based subplot position, counted row by row
        n = row * len(cols) + column + 1
        plt.subplot(3, 3, n)
        plt.subplots_adjust(hspace=0.5, wspace=0.5)
        sns.regplot(data=df, x=x, y=y, color="#AE213D")
        # Keep at most the first two words of the column name as the y label,
        # which drops the trailing unit/range suffix
        plt.ylabel(" ".join(y.split()[:2]))

plt.show()
image.png

两个属性间的聚类

# Choosing K: fit K-Means for K = 1..10 on (Age, Spending Score) and record
# the inertia of each fit for the elbow plot.

# Feature matrix with one row per customer and the two selected columns
df1 = df[['Age', 'Spending Score (1-100)']].values
inertia = []

for k in range(1, 11):
    algorithm = KMeans(n_clusters=k,
                       init="k-means++",
                       n_init=10,
                       max_iter=300,
                       tol=0.0001,
                       random_state=111,
                       # "full" was renamed to "lloyd" in scikit-learn 1.1 and
                       # the old name is rejected by newer releases
                       algorithm="lloyd")
    algorithm.fit(df1)
    # inertia_ is the sum of squared distances of the samples to their
    # closest cluster centre
    inertia.append(algorithm.inertia_)

inertia

[171535.50000000003,
75949.15601023012,
45840.67661610871,
28165.583566629342,
23830.033602505435,
19576.013221355326,
15514.193134351033,
13020.333585858589,
11480.04547827173,
10187.84610136452]

# Elbow plot: inertia as a function of the number of clusters K
plt.figure(1, figsize=(15, 6))
# Derive the K axis from the recorded values so it always matches their count
ks = np.arange(1, len(inertia) + 1)
plt.plot(ks, inertia, 'o')
plt.plot(ks, inertia, '-', alpha=0.5)

plt.xlabel("Choice of K")
plt.ylabel("Inertia")
plt.show()
image.png
# Clustering model: K-Means with 4 clusters fitted on the df1 feature matrix
algorithm = KMeans(
    n_clusters=4,
    init="k-means++",
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm="elkan",
)
algorithm.fit(df1)

# Cluster index assigned to each row, and the coordinates of each cluster centre
labels1, centroids1 = algorithm.labels_, algorithm.cluster_centers_

print("labels1:", labels1)
print("centroids1:", centroids1)

labels1: [0 2 1 2 0 2 1 2 1 2 1 2 1 2 1 2 0 0 1 2 0 2 1 2 1 2 1 0 1 2 1 2 1 2 1 2 1
2 1 2 3 2 3 0 1 0 3 0 0 0 3 0 0 3 3 3 3 3 0 3 3 0 3 3 3 0 3 3 0 0 3 3 3 3
3 0 3 0 0 3 3 0 3 3 0 3 3 0 0 3 3 0 3 0 0 0 3 0 3 0 0 3 3 0 3 0 3 3 3 3 3
0 0 0 0 0 3 3 3 3 0 0 0 2 0 2 3 2 1 2 1 2 0 2 1 2 1 2 1 2 1 2 0 2 1 2 3 2
1 2 1 2 1 2 1 2 1 2 1 2 3 2 1 2 1 2 1 2 1 0 1 2 1 2 1 2 1 2 1 2 1 2 1 2 0
2 1 2 1 2 1 2 1 2 1 2 1 2 1 2]
centroids1: [[27.61702128 49.14893617]
[43.29166667 15.02083333]
[30.1754386 82.35087719]
[55.70833333 48.22916667]]

# Show the clustering result: wrap the feature matrix back into a DataFrame
feature_names = ["Age", "Spending Score (1-100)"]
df3 = pd.DataFrame(data=df1, columns=feature_names)
df3
image.png
# Attach the cluster label of each row as a new "Labels" column
df3["Labels"] = labels1
df3
image.png
# Interactive scatter of the two features, coloured by cluster label
px.scatter(
    df3,
    x="Age",
    y="Spending Score (1-100)",
    color="Labels",
    color_continuous_scale="rainbow",
)
image.png
上一篇下一篇

猜你喜欢

热点阅读