kmeans实例及代码
2019-02-13 本文已影响0人
morie_li
聚类与决策树不同:决策树属于有监督学习,而聚类属于无监督学习。也就是说数据样本只有特征x,没有给定y。聚类的目的是找到样本特征潜在的类别,将同类别的样本放在一起。
kmeans的具体逻辑如下:
1.随机选取k个簇心;
2.对于每一个样例,计算它到各个簇心的距离,并将其归入距离最近的簇;
3.循环完所有的样例后,重新计算每个簇的簇心;
4.重复第二步和第三步,直到簇心不再变化或达到最大迭代次数。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the data set
def loaddate(filename):
    """Read a tab-separated text file of numbers into a DataFrame.

    Each non-blank line becomes one row; every tab-separated field on the
    line is converted with ``float``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` of floats with default integer row and
        column labels.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    datamat = []
    # The context manager closes the handle even if parsing fails.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            # Build a real list: on Python 3 ``map`` is lazy and would be
            # stored in the frame as an opaque object, not as numbers.
            datamat.append([float(field) for field in line.split('\t')])
    return pd.DataFrame(datamat)
# Euclidean distance
def distance(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    The result is the square root of the sum of squared element-wise
    differences of the two inputs.
    """
    diff = vecA - vecB
    squared = np.power(diff, 2)
    total = np.sum(squared)
    return np.sqrt(total)
# Generate random initial centroids
def getcenter(data, k):
    """Draw ``k`` random centroids inside the bounding box of ``data``.

    For every feature column, ``k`` values are sampled uniformly between
    that column's minimum and maximum.

    Args:
        data: ``pandas.DataFrame`` with one row per sample and one column
            per feature.
        k: Number of centroids to generate.

    Returns:
        A ``pandas.DataFrame`` of shape ``(k, n_features)`` that carries
        the same column labels as ``data``.
    """
    _, n = data.shape
    # Size the table from the data instead of assuming two features.
    values = np.zeros((k, n))
    for i in range(n):
        column = data.iloc[:, i]
        minJ = float(np.min(column))
        rangJ = float(np.max(column) - minJ)
        # A 1-D draw matches the shape of a single column.
        values[:, i] = minJ + rangJ * np.random.rand(k)
    # Reusing the data's column labels keeps label-aligned arithmetic
    # between samples and centroids meaningful.
    return pd.DataFrame(values, columns=data.columns)
# Assign every sample to a cluster and recompute the centroids
def kmeans(data, k, maxiter):
    """Run k-means on ``data`` and return the final centroids.

    Starting from centroids produced by ``getcenter``, each pass assigns
    every sample to its nearest centroid (by ``distance``) and then moves
    each non-empty cluster's centroid to the mean of its members. Passes
    repeat until no assignment changes or ``maxiter`` passes have run.

    Args:
        data: ``pandas.DataFrame`` with one row per sample and one column
            per feature.
        k: Number of clusters.
        maxiter: Maximum number of passes.

    Returns:
        The centroid ``pandas.DataFrame`` returned by ``getcenter``,
        updated in place with the final centroid coordinates.
    """
    ceter = getcenter(data, k)
    # Work on plain arrays so distances are computed positionally rather
    # than through pandas label alignment.
    points = data.to_numpy(dtype=float)
    m = points.shape[0]
    # -1 means "not assigned yet". Starting at 0 would hide first-pass
    # assignments to cluster 0 and could end the loop prematurely.
    labels = np.full(m, -1, dtype=int)
    itercount = 0
    clusterchange = True
    while itercount < maxiter and clusterchange:
        itercount += 1
        clusterchange = False
        centers = ceter.to_numpy(dtype=float)
        for i in range(m):
            minindex = 0
            mindist = np.inf
            for j in range(k):
                dis = distance(points[i], centers[j])
                # Strict comparison keeps the lowest index on ties.
                if dis < mindist:
                    minindex = j
                    mindist = dis
            if labels[i] != minindex:
                clusterchange = True
                labels[i] = minindex
        for cent in range(k):
            members = points[labels == cent]
            # An empty cluster keeps its previous centroid.
            if members.shape[0] > 0:
                ceter.iloc[cent, :] = members.mean(axis=0)
    return ceter
if __name__ == '__main__':
    # Cluster the sample data set into two groups, using at most 5 passes.
    samples = loaddate('testSet.txt')
    centroids = kmeans(samples, 2, 5)

    # Samples are drawn as blue stars, centroids as red circles.
    sample_x, sample_y = samples.iloc[:, 0], samples.iloc[:, 1]
    center_x, center_y = centroids.iloc[:, 0], centroids.iloc[:, 1]
    plt.scatter(sample_x, sample_y, marker='*', c='b')
    plt.scatter(center_x, center_y, marker='o', c='r')
    plt.show()