机器学习之KNN分类算法
2018-11-26 本文已影响5人
Athenaearl
优缺点和适用范围:
优点:
- 高精确性
- 对异常值不敏感
缺点:
- 内存消耗大
- 计算费时
适用的情况:
- 数值型数据和非数值型数据都可使用
原理
dating_data_mat = array([[1.0, 1.0], [1.1, 1.2], [1.2, 1.1], [1.3, 0.9], [0.1, 0.2], [0.1, 0.1], [0.0, 0.1], [0.1, 0.3]])
dating_labels = [1, 1, 1, 1, 2, 2, 2, 2]
fig = plt.figure() # create a new figure
ax = fig.add_subplot(111) # 349 代表将整个画布分成3行4列,只使用第9块,111 相当于使用整个画布
# 第0列作为x轴,第1列作为y轴, 根据标签换颜色 15 代表点的大小,1 就非常小
ax.scatter(dating_data_mat[:, 0], dating_data_mat[:, 1], 15*array(dating_labels), 15*array(dating_labels))
plt.show() # 画
得到图:
得到的散点图
这种情况下,如果新来一个点 (0.3, 0.1),它究竟应该属于哪一个区?
KNN的想法就是,这个点距离哪一个区中的点更近就属于哪一个区
更具体的说法:
- 计算新来的点到训练样本中的每一个点的距离
- 根据距离,将训练样本中的所有数据进行从小到大排序
- 取前k个,看这k个大多数属于哪一个区的,这就是新来的点所属的区
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
# def create_data_set():
# group = array([[1.0, 1.1, 1.0], [1.0, 1.0, 1.1], [0, 0, 0], [0, 0.1, 0.1]]) # sample , array is defined in numpy
# labels = ['A', 'A', 'B', 'B']
# return group, labels # python can return more than one parameter
def classify0(in_x, data_set, labels, k):
    """Classify *in_x* by majority vote among its k nearest neighbours.

    Distances are plain Euclidean distances to every row of the
    training matrix.

    in_x: 1-D feature vector to classify.
    data_set: 2-D array, one training sample per row.
    labels: class label of each training row (same length as data_set).
    k: number of nearest neighbours to poll.
    Returns the label with the most votes among the k neighbours.
    """
    n_samples = data_set.shape[0]
    # Difference between in_x and every training row, then the
    # per-row Euclidean distance: sqrt(sum of squared differences).
    deltas = tile(in_x, (n_samples, 1)) - data_set
    distances = ((deltas ** 2).sum(axis=1)) ** 0.5
    # argsort() yields row indices ordered nearest-first.
    nearest = distances.argsort()
    # Tally the labels of the k closest training samples.
    votes = {}
    for idx in nearest[:k]:
        label = labels[idx]
        votes[label] = votes.get(label, 0) + 1
    # Highest vote count wins; among tied counts the label that was
    # encountered first (i.e. belongs to a nearer neighbour) is returned.
    return max(votes, key=votes.get)
# get data from text
def file2matrix(filename, num_features=3):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each line must contain *num_features* numeric feature columns followed
    by an integer class label, all separated by tabs.

    filename: path of the text file to read.
    num_features: number of leading feature columns (default 3, matching
        the dating data set used in this chapter).
    Returns (return_mat, class_label_vector): an (n, num_features) float
    matrix and a list of int labels, one entry per line.
    """
    # "with" guarantees the file handle is closed even if a line fails to
    # parse (the original code never closed it).
    with open(filename) as fr:
        lines = fr.readlines()
    return_mat = zeros((len(lines), num_features))
    class_label_vector = []
    for index, line in enumerate(lines):
        fields = line.strip().split('\t')
        # numpy coerces the string fields to floats on assignment.
        return_mat[index, :] = fields[0:num_features]
        class_label_vector.append(int(fields[-1]))  # label is the last column
    return return_mat, class_label_vector
# dating_data_mat = array([[1.0, 1.0], [1.1, 1.2], [1.2, 1.1],[1.3, 0.9],[0.1, 0.2],[0.1, 0.1],[0.0, 0.1],[0.1, 0.3]])
# dating_labels = [1, 1, 1, 1, 2, 2, 2, 2]
# fig = plt.figure() # create a new figure
# ax = fig.add_subplot(111) # 349 代表将整个画布分成3行4列,只使用第9块,111 相当于使用整个画布
# # 第0列作为x轴,第1列作为y轴, 根据标签换颜色 15 代表点的大小,1 就非常小
# ax.scatter(dating_data_mat[:, 0], dating_data_mat[:, 1], 15*array(dating_labels), 15*array(dating_labels))
# plt.show() # 画
# 归一化数据,如果没有这一步骤,不同列数据的差值大小不同,比如一列差100很正常,另一列差0.1都是大的
# 如果这样,差100的列显然在计算中占据了更大的比重,但是我们并不想这样,因此需要数据归一化,即让差值都在0~1之间
def auto_norm(data_set):
    """Linearly rescale every column of *data_set* into [0, 1].

    Without this step a column with a large numeric range would dominate
    the Euclidean distance over a small-range column, so all features are
    mapped to (value - column_min) / (column_max - column_min).

    data_set: 2-D numeric array, one sample per row.
    Returns (norm_data_set, ranges, min_vals): the normalized matrix plus
    the per-column ranges and minima needed to rescale future samples.
    NOTE(review): a constant column (range 0) produces a divide-by-zero
    here, same as the original — callers must ensure columns vary.
    """
    min_vals = data_set.min(0)  # column-wise minima
    max_vals = data_set.max(0)  # column-wise maxima
    ranges = max_vals - min_vals  # per-column spread
    # NumPy broadcasting applies the per-column shift and scale to every
    # row directly; no tile() copies, and no throwaway zeros() allocation
    # (the original allocated a zero matrix it immediately overwrote).
    norm_data_set = (data_set - min_vals) / ranges
    return norm_data_set, ranges, min_vals
# 检验分类器的分类效果
def dating_class_test():
    """Hold out 10% of the dating samples and print the classifier's error rate.

    Reads 'datingTestSet2.txt', normalizes the features, then classifies
    each held-out row against the remaining 90% with k=3, printing every
    prediction and the overall error rate.
    """
    hold_out_ratio = 0.10  # fraction of samples reserved for testing
    features, labels = file2matrix('datingTestSet2.txt')
    normed, ranges, min_vals = auto_norm(features)
    total = normed.shape[0]
    n_test = int(total * hold_out_ratio)
    errors = 0.0
    # Rows [n_test:total] form the training pool, so no held-out row is
    # ever compared against itself.
    for i in range(n_test):
        predicted = classify0(normed[i, :], normed[n_test:total, :], labels[n_test: total], 3)
        print("the classfier came back with: %d, the real answer is : %d" % (predicted, labels[i]))
        if predicted != labels[i]:
            errors += 1.0
    print("the total error rate is : %f" % (errors/float(n_test)))
此为示例代码,来自《机器学习实战》by Peter Harrington