python与机器学习聚类:31省市居民家庭消费水平和学生月上网
2021-10-05 本文已影响0人
Cache_wood
31省市居民家庭消费水平
#建立工程,导入sklearn相关包
import numpy as np
from sklearn.cluster import KMeans
#加载数据,创建k-means算法实例,并进行训练,获得标签
def loadData(filepath):
fr = open(filepath,'r+')
lines = fr.readlines()
retData = []
retCityName = []
for line in lines:
items = line.strip().split(",")
retCityName.append(items[0])
retData.append([float(items[i]) for i in range(1,len(items))])
return retData,retCityName
if __name__ == '__main__':
data,cityName = loadData('E:\coding\sklearn\city.txt')
km = KMeans(n_clusters=4)
label = km.fit_predict(data)
expenses = np.sum(km.cluster_centers_,axis=1)
#print(expense)
#将城市按label分成设定的簇,将每个簇的城市和平均花费输出
CityCluster = [[],[],[],[]]
for i in range(len(cityName)):
CityCluster[label[i]].append(cityName[i])
#输出标签,查看结果
for i in range(len(CityCluster)):
print("Expenses:%.2f" % expenses[i])
print(CityCluster[i])
- 聚成四类
Expenses:5678.62
['天津', '浙江', '福建', '重庆', '西藏']
Expenses:4512.27
['江苏', '安徽', '湖南', '湖北', '广西', '海南', '四川', '云南']
Expenses:7754.66
['北京', '上海', '广东']
Expenses:3788.76
['河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江', '江西', '山东', '河南', '贵州', '陕西', '甘肃', '青海', '宁夏', '新疆']
- 聚成三类
Expenses:5113.54
['天津', '江苏', '浙江', '福建', '湖南', '广西', '海南', '重庆', '四川', '云南', '西藏']
Expenses:3827.87
['河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江', '安徽', '江西', '山东', '河南', '湖北', '贵州', '陕西', '甘肃', '青海', '宁夏', '新疆']
Expenses:7754.66
['北京', '上海', '广东']
学生月上网时间分布
#建立工程,导入sklearn相关包
import numpy as np
import sklearn.cluster as skc
from sklearn import metrics
import matplotlib.pyplot as plt
#读入数据并进行处理
mac2id = dict()
onlinetimes = []
f = open('E:\coding\sklearn\TestData.txt',encoding='utf-8')
for line in f:
mac = line.split(',')[2]
onlinetime = int(line.split(',')[6])
starttime = int(line.split(',')[4].split(' ')[1].split(':')[0])
if mac not in mac2id:
mac2id[mac] = len(onlinetimes)
onlinetimes.append((starttime,onlinetime))
else:
onlinetimes[mac2id[mac]] = [(starttime,onlinetime)]
real_X = np.array(onlinetimes).reshape((-1,2))
#上网时间聚类,创建DBSCAN算法实例,并进行训练,获得标签
X = real_X[:,0:1]
db = skc.DBSCAN(eps=0.01,min_samples=20).fit(X)
labels = db.labels_
#打印数据被记上的标签,计算标签为-1,即噪声数据的比例
print('Labels:')
print(labels)
ratio = len(labels[labels[:]==-1])/len(labels)
print('Noise ratio:',format(ratio,'.2%'))
#计算簇的个数并打印,评价聚类效果
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estamated number of clusters: %d'% n_clusters_)
print('Silhouette Coefficient: %0.3f' %metrics.silhouette_score(X,labels))
for i in range(n_clusters_):
print('Cluster ',i,':')
print(list(X[labels == i].flatten()))
#画直方图,分析实验结果
plt.hist(X,24)
plt.show()
#每个数据被划分的簇的分类
Labels:
[ 0 -1 0 1 -1 1 0 1 2 -1 1 0 1 1 3 -1 -1 3 -1 1 1 -1 1 3
4 -1 1 1 2 0 2 2 -1 0 1 0 0 0 1 3 -1 0 1 1 0 0 2 -1
1 3 1 -1 3 -1 3 0 1 1 2 3 3 -1 -1 -1 0 1 2 1 -1 3 1 1
2 3 0 1 -1 2 0 0 3 2 0 1 -1 1 3 -1 4 2 -1 -1 0 -1 3 -1
0 2 1 -1 -1 2 1 1 2 0 2 1 1 3 3 0 1 2 0 1 0 -1 1 1
3 -1 2 1 3 1 1 1 2 -1 5 -1 1 3 -1 0 1 0 0 1 -1 -1 -1 2
2 0 1 1 3 0 0 0 1 4 4 -1 -1 -1 -1 4 -1 4 4 -1 4 -1 1 2
2 3 0 1 0 -1 1 0 0 1 -1 -1 0 2 1 0 2 -1 1 1 -1 -1 0 1
1 -1 3 1 1 -1 1 1 0 0 -1 0 -1 0 0 2 -1 1 -1 1 0 -1 2 1
3 1 1 -1 1 0 0 -1 0 0 3 2 0 0 5 -1 3 2 -1 5 4 4 4 -1
5 5 -1 4 0 4 4 4 5 4 4 5 5 0 5 4 -1 4 5 5 5 1 5 5
0 5 4 4 -1 4 4 5 4 0 5 4 -1 0 5 5 5 -1 4 5 5 5 5 4
4]
#噪声数据的比例
Noise ratio: 22.15%
#簇的个数
Estamated number of clusters: 6
#聚类效果评价指标
Silhouette Coefficient: 0.710
Cluster 0 :
[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22]
[23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23]
Cluster 2 :
[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
Cluster 3 :
[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21]
Cluster 4 :
[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
Cluster 5 :
[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
kmeans实现图像分割
#建立工程并导入sklearn包
import numpy as np
import PIL.Image as image
from sklearn.cluster import KMeans
#加载图片并进行预处理
def loadData(filePath):
f = open(filePath,'rb')
data = []
img = image.open(f)
m,n = img.size
for i in range(m):
for j in range(n):
x,y,z = img.getpixel((i,j))
data.append([x/256.0,y/256.0,z/256.0])
f.close()
return np.mat(data),m,n
#加载kmeans聚类算法
km = KMeans(n_clusters=3)
imgData,row,col = loadData('sklearn/bull.jpg')
#对像素点进行聚类并输出
label = km.fit_predict(imgData)
label = label.reshape([row,col])
pic_new = image.new("L",(row,col))
for i in range(row):
for j in range(col):
pic_new.putpixel((i,j),int(256/(label[i][j]+1)))
pic_new.save("sklearn/result-bull.jpg","JPEG")