K-近邻法

2018-04-21  本文已影响0人  strealingFire

简单地说，k-近邻算法通过测量不同特征值之间的距离来进行分类。

from numpy import *

import operator

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbors.

    Args:
        inX: Feature vector to classify (sequence or 1-D array whose length
            equals the number of columns in ``dataSet``).
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbors that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` training samples
        closest to ``inX`` (Euclidean distance). On a tie, the label that was
        encountered first among the nearest neighbors wins.

    Raises:
        IndexError: If ``k`` is larger than the number of training samples.
    """
    dataSetSize = dataSet.shape[0]

    # Euclidean distance from inX to every training sample.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5

    # Indices of the training samples, ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    # Tally the labels of the k nearest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Rank and return only AFTER all k votes are counted; doing this inside
    # the loop would return after the first neighbor and ignore k entirely.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]

# Example: classify the point [1, 1] using its 3 nearest neighbors.
# NOTE(review): `group` and `labels` are not defined anywhere in the visible
# part of this file, so this call presumably fails with NameError unless a
# sample data set is defined first -- confirm and define them before this line.
classify0([1,1],group,labels,3)


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line supplies the three feature values in its first three
    fields and an integer class label in its last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float NumPy array of features and ``classLabelVector`` is
        a list of ``n`` integer labels, ``n`` being the number of non-blank
        lines in the file.

    Raises:
        ValueError: If a feature is not a valid float or the label is not a
            valid integer.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no sample, so drop them instead of failing on int('').
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # Pre-allocate the returned matrix: one row per sample, three features.
    returnMat = zeros((len(rows), 3))
    classLabelVector = []

    for index, listFromLine in enumerate(rows):
        # Convert explicitly rather than relying on implicit str -> float
        # casting during array assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))

    return returnMat, classLabelVector

# Load the dating data set when the module is executed: a feature matrix plus
# the matching list of integer class labels.
datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')


def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Uses min-max normalization: ``(value - column_min) / (column_max -
    column_min)``.

    Args:
        dataSet: 2-D NumPy array, one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``:
        ``normDataSet`` is the normalized array with the same shape as
        ``dataSet``; ``ranges`` is the per-column ``max - min``; ``minVals``
        is the per-column minimum. A column whose values are all equal has a
        range of 0 and is normalized to all zeros.
    """
    # min(0) / max(0) reduce along axis 0, giving one value per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # A constant column has range 0; divide by 1 there so the column becomes
    # 0 instead of NaN. The returned `ranges` stays the true max - min.
    safeRanges = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges

    return normDataSet, ranges, minVals


# Normalize the loaded feature matrix; also keep the per-column ranges and
# minimums that autoNorm returns alongside it.
normMat ,ranges,minVals = autoNorm(datingDataMat)


def datingClassTest():
    """Estimate the classifier's error rate on a hold-out part of the data.

    Loads 'datingTestSet2.txt', normalizes the features, uses the first 10%
    of the samples as test vectors and the remaining samples as the training
    set, classifies each test vector with k=3, and prints every prediction
    followed by the overall error rate.

    Returns:
        None. Results are written to standard output.
    """
    hoRatio = 0.10  # fraction of the samples held out for testing

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    if numTestVecs == 0:
        # Fewer than 1/hoRatio samples: nothing is held out, and the error
        # rate below would divide by zero.
        print("the total error rate is: undefined (no test vectors)")
        return

    errorCount = 0.0
    for i in range(numTestVecs):
        # Train on rows [numTestVecs, m), test on row i from the held-out part.
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m],
                                     3)
        # Apply the format with %, otherwise print() shows the raw "%d"
        # placeholders followed by a tuple.
        print("the classifier came back with: %d,the real answer is : %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    print("the total error rate is:", (errorCount / float(numTestVecs)))


def classifyPerson():
    """Interactively predict how much the user will like a person.

    Prompts for three numeric features on standard input, normalizes them
    with the minimums and ranges of the data in 'datingTestSet2.txt',
    classifies the result with k=3 and prints the matching description.

    Returns:
        None. The prediction is written to standard output.

    Raises:
        ValueError: If an entered value cannot be parsed as a float.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']

    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year ?"))
    iceCream = float(input("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Feature order here is miles, game-time percentage, ice cream.
    # NOTE(review): presumably this matches the column order of the data
    # file -- confirm against datingTestSet2.txt.
    inArr = array([ffMiles, percentTats, iceCream])

    # Normalize the input with the training set's statistics before classifying.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)

    # Class labels are assumed to be 1-based (1..3) in datingTestSet2.txt,
    # while resultList is 0-based, hence the -1. Without it, label 3 raises
    # IndexError and labels 1 and 2 print the wrong description.
    print("You will probably like this person:", resultList[classifierResult - 1])

上一篇 下一篇

猜你喜欢

热点阅读