机器学习 Day 8 | K-NN算法的简单实现(2)
2018-08-15 本文已影响8人
raphah
承接《机器学习第七天》,本文逐步分析昨日的数字识别代码。
KNN算法代码实现(Python):
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    The distance metric is the Euclidean distance.

    Args:
        inX: Feature vector of the sample to classify (in the digit demo,
            the vector built from one image text file). It must broadcast
            against the rows of ``dataSet``.
        dataSet: ``numpy.ndarray`` with one training sample per row
            (m x 1024 in the digit demo).
        labels: Class label of every training row; ``len(labels) == m``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training rows is clamped to that number. k strongly
            affects the result: a small k gives a small approximation error
            but a large estimation error, a large k the opposite. Error
            counts measured by the original author on the digit data:
            k=2 -> 12, k=3 -> 11, k=4 -> 11, k=5 -> 17.

    Returns:
        The label with the most votes among the k nearest rows. When several
        labels share the highest vote count, the one encountered first while
        walking the neighbours from nearest to farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    # shape[0] is the length of the first dimension, i.e. the number of
    # training samples.
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one training sample")
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    # Asking for more neighbours than there are samples simply means
    # "let every sample vote".
    k = min(k, dataSetSize)

    # Broadcasting subtracts inX from every row, exactly as if inX had been
    # repeated dataSetSize times. Example: inX = [1, 2, 3] against 3 rows
    # behaves like [[1 2 3], [1 2 3], [1 2 3]] - dataSet.
    diffMat = inX - dataSet

    # Euclidean distance from inX to every training row.
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # argsort returns the row indices ordered from smallest to largest
    # distance.
    sortedDistIndicies = distances.argsort()

    # Count the labels of the k closest rows. The dict keeps insertion order,
    # so labels are recorded from nearest to farthest.
    classCount = {}
    for rowIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[rowIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first key holding the highest count, which is the same
    # winner a stable descending sort by count would put first.
    return max(classCount, key=classCount.get)
def _parse_digit_label(fileName):
    """Return the digit encoded in a sample file name such as ``5_135.txt``.

    Returns None when the part of the name before the first ``.`` and ``_``
    is not an integer (for example ``.DS_Store``), so callers can skip it.
    """
    stem = fileName.split('.')[0]  # take off the extension
    try:
        return int(stem.split('_')[0])
    except ValueError:
        return None


def handwritingClassTest(
        trainingDir='/Users/xiehao/Desktop/MachineLearning-master/input/2.KNN/trainingDigits/',
        testDir='/Users/xiehao/Desktop/MachineLearning-master/input/2.KNN/testDigits/',
        k=3):
    """Run the k-NN digit classifier over a test directory and print results.

    Every file in ``trainingDir`` becomes one row of a training matrix, and
    every file in ``testDir`` is classified with ``classify0``. One line is
    printed per test sample, followed by the total error count and error
    rate.

    Args:
        trainingDir: Directory with the training samples. File names carry
            the true digit before the first underscore, e.g. ``5_135.txt``.
        testDir: Directory with the test samples, named the same way.
        k: Number of neighbours passed to ``classify0``.

    Directory entries whose name does not yield a digit label are skipped
    rather than aborting the run. ``img2vector`` is defined elsewhere; it is
    assumed to return the image as 1024 values (32*32 flattened) -- confirm
    against its definition.
    """
    import os

    # 1. Load the training data: keep only entries that carry a label.
    trainingSamples = []
    for fileNameStr in listdir(trainingDir):
        label = _parse_digit_label(fileNameStr)
        if label is not None:
            trainingSamples.append((fileNameStr, label))

    # Number of training samples.
    m = len(trainingSamples)
    # hwLabels[i] is the digit of training row i.
    hwLabels = [label for _, label in trainingSamples]
    # m x 1024 matrix, one flattened image per row.
    trainingMat = zeros((m, 1024))
    for i, (fileNameStr, _) in enumerate(trainingSamples):
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    # 2. Classify every labelled test sample and count the mistakes.
    errorCount = 0
    mTest = 0
    for fileNameStr in listdir(testDir):
        classNumStr = _parse_digit_label(fileNameStr)
        if classNumStr is None:
            continue
        mTest += 1
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1

    print("\nthe total number of errors is: %d" % errorCount)
    # With no test samples the rate is reported as 0.0 instead of dividing
    # by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate)