01用户聚类分群
2022-12-11 本文已影响0人
Jachin111
导入库和数据
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
# Switch plotly to offline notebook mode before any figure is rendered.
py.offline.init_notebook_mode(connected=True)
# Load the customer table from a CSV file in the working directory.
df = pd.read_csv("Mall_Customers.csv")
# Preview the first rows.
df.head()
image.png
数据EDA
# Data exploration: (rows, columns) of the loaded table
df.shape
image.png
# Missing values: count of nulls per column
df.isnull().sum()
image.png
# Data type of each column
df.dtypes
image.png
# Descriptive statistics
df.describe()
image.png
# Set the matplotlib plotting style used by the figures below.
plt.style.use("fivethirtyeight")

# Pick the three fields the analysis focuses on: every column from the
# third one onward.
cols = list(df.columns[2:])
cols
image.png
3个属性直方图
# One histogram (with density curve) per analysed column, laid out in a
# single row of three subplots.
plt.figure(1, figsize=(15, 6))
# Subplot spacing is a figure-level setting, so it is applied once up front
# instead of on every loop iteration.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for n, col in enumerate(cols, start=1):
    plt.subplot(1, 3, n)
    # NOTE: distplot is deprecated in seaborn >= 0.11 (histplot/displot are
    # the replacements); it is kept here to stay compatible with the older
    # library versions the rest of this script targets.
    sns.distplot(df[col], bins=20)
    plt.title(f'Distplot of {col}')
plt.show()
image.png
性别因素
# Head count per gender, drawn with the category on the y axis.
plt.figure(1, figsize=(12, 5))
sns.countplot(data=df, y="Gender")
plt.show()
image.png
# Pairwise distributions of the remaining columns, coloured by gender.
# The CustomerID column is dropped before plotting.
pair_data = df.drop(columns=["CustomerID"])
sns.pairplot(pair_data, hue="Gender", aspect=1.5)
plt.show()
image.png
# Age vs. annual income, one scatter series per gender.
plt.figure(1, figsize=(15, 6))
for gender in ["Male", "Female"]:
    # `data=` lets x/y be given as column names of the filtered frame.
    plt.scatter(x="Age", y="Annual Income (k$)",
                data=df[df["Gender"] == gender],
                s=200, alpha=0.5, label=gender)
plt.xlabel("Age")
plt.ylabel("Annual Income(k$)")
plt.title("Age vs Annual Income w.r.t Gender")
# The series carry `label=gender`; without a legend the two genders cannot
# be told apart in the figure.
plt.legend()
plt.show()
image.png
# Annual income vs. spending score, one scatter series per gender.
plt.figure(1, figsize=(15, 6))
for gender in ["Male", "Female"]:
    # `data=` lets x/y be given as column names of the filtered frame.
    plt.scatter(x="Annual Income (k$)", y="Spending Score (1-100)",
                data=df[df["Gender"] == gender],
                s=200, alpha=0.5, label=gender)
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.title("Annual Income vs Spending Score w.r.t Gender")
# The series carry `label=gender`; without a legend the two genders cannot
# be told apart in the figure.
plt.legend()
plt.show()
image.png
# Distribution of each analysed column per gender: a violin plot with the
# individual observations overlaid as a swarm plot.
plt.figure(1, figsize=(15, 7))
# Subplot spacing is a figure-level setting, so it is applied once up front.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for n, col in enumerate(cols, start=1):
    plt.subplot(1, 3, n)
    sns.violinplot(x=col, y="Gender", data=df, palette="vlag")
    sns.swarmplot(x=col, y="Gender", data=df)
    # Only the left-most panel keeps its y label and only the middle panel
    # carries the shared title.
    plt.ylabel("Gender" if n == 1 else '')
    plt.title("Violinplots & Swarmplots" if n == 2 else '')
plt.show()
image.png
属性相关性分析
# 3x3 grid of regression plots: every analysed column against every other
# (including itself).
cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
plt.figure(1, figsize=(15, 6))
# Subplot spacing is a figure-level setting, so it is applied once up front.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
n = 0
for x in cols:
    for y in cols:
        n += 1  # running subplot index, 1..9, filled row by row
        plt.subplot(3, 3, n)
        sns.regplot(x=x, y=y, data=df, color="#AE213D")
        # Shorten multi-word column names to their first two words so the
        # y label fits the small panel.
        words = y.split()
        plt.ylabel(words[0] + " " + words[1] if len(words) > 1 else y)
plt.show()
image.png
两个属性间的聚类
# Choosing K: fit K-Means for K = 1..10 on (Age, Spending Score) and record
# the inertia of each fit for an elbow plot.
df1 = df[['Age', 'Spending Score (1-100)']].values
inertia = []
for k in range(1, 11):
    # algorithm="elkan" matches the final model fitted later in this script.
    # The previous value "full" is deprecated/removed in newer scikit-learn
    # releases; "elkan" is an exact variant, so the curve should be unchanged
    # up to floating-point noise.
    algorithm = KMeans(n_clusters=k,
                       init="k-means++",
                       n_init=10,
                       max_iter=300,
                       tol=0.0001,
                       random_state=111,
                       algorithm="elkan")
    algorithm.fit(df1)
    # inertia_: sum of squared distances from each sample to its closest
    # cluster centre.
    inertia.append(algorithm.inertia_)
inertia
[171535.50000000003,
75949.15601023012,
45840.67661610871,
28165.583566629342,
23830.033602505435,
19576.013221355326,
15514.193134351033,
13020.333585858589,
11480.04547827173,
10187.84610136452]
# Elbow plot: inertia as a function of the number of clusters K, drawn as
# markers plus a faint connecting line.
plt.figure(1, figsize=(15, 6))
k_values = np.arange(1, 11)
plt.plot(k_values, inertia, 'o')
plt.plot(k_values, inertia, '-', alpha=0.5)
plt.xlabel("Choose of K")
plt.ylabel("Inertia")  # was misspelled "Interia"
plt.show()
image.png
# Clustering model: fit K-Means with 4 clusters on df1 and keep the
# per-sample labels and the cluster centres.
kmeans_params = dict(
    n_clusters=4,
    init="k-means++",
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm="elkan",
)
algorithm = KMeans(**kmeans_params)
algorithm.fit(df1)
labels1, centroids1 = algorithm.labels_, algorithm.cluster_centers_
print("labels1:", labels1)
print("centroids1:", centroids1)
labels1: [0 2 1 2 0 2 1 2 1 2 1 2 1 2 1 2 0 0 1 2 0 2 1 2 1 2 1 0 1 2 1 2 1 2 1 2 1
2 1 2 3 2 3 0 1 0 3 0 0 0 3 0 0 3 3 3 3 3 0 3 3 0 3 3 3 0 3 3 0 0 3 3 3 3
3 0 3 0 0 3 3 0 3 3 0 3 3 0 0 3 3 0 3 0 0 0 3 0 3 0 0 3 3 0 3 0 3 3 3 3 3
0 0 0 0 0 3 3 3 3 0 0 0 2 0 2 3 2 1 2 1 2 0 2 1 2 1 2 1 2 1 2 0 2 1 2 3 2
1 2 1 2 1 2 1 2 1 2 1 2 3 2 1 2 1 2 1 2 1 0 1 2 1 2 1 2 1 2 1 2 1 2 1 2 0
2 1 2 1 2 1 2 1 2 1 2 1 2 1 2]
centroids1: [[27.61702128 49.14893617]
[43.29166667 15.02083333]
[30.1754386 82.35087719]
[55.70833333 48.22916667]]
# Show the clustering result: wrap the feature array back into a DataFrame
df3 = pd.DataFrame(df1,columns=["Age","Spending Score (1-100)"])
df3
image.png
# Attach the K-Means label of each row as a new "Labels" column
df3["Labels"] = labels1
df3
image.png
# Scatter plot of the two features, coloured by the "Labels" column using the "rainbow" colour scale
px.scatter(df3,x="Age",y="Spending Score (1-100)",color="Labels",color_continuous_scale="rainbow")
image.png