第四章:分类
2018-10-24 本文已影响1人
无赖宵小
特征值选取
1、选择特征值
2、对特征值相关度进行评分
3、将数据进行标准化(最常用的方法是将所有数据都转化为 0 到 1 之间的值,或使用标准分 z-score —— 即分值偏离均值的程度,公式如下)
4、最邻近分类算法
修正的标准分
标准分的问题在于它会受异常值的影响
修正标准分计算方法:将标准分公式中的均值改为中位数,将标准差改为绝对偏差
中位数指的是将所有数据进行排序,取中间的那个值。如果数据量是偶数,则取中间两个数值的均值。
def getMedian(self, alist):
    """Return the median of alist; an empty list yields []."""
    if alist == []:
        return []
    ordered = sorted(alist)
    count = len(ordered)
    middle = count // 2
    if count % 2 == 1:
        # Odd count: the single middle element is the median.
        return ordered[middle]
    # Even count: the median is the mean of the two middle elements.
    return (ordered[middle] + ordered[middle - 1]) / 2.0
def getAbsoluteStandardDeviation(self, alist, median):
    """Return the absolute standard deviation (mean absolute deviation)
    of alist around median.

    Raises ZeroDivisionError if alist is empty.
    """
    # Renamed accumulator so the builtin sum() is not shadowed; then the
    # builtin itself can do the accumulation at C speed.
    return sum(abs(item - median) for item in alist) / len(alist)
def normalizeColumn(self, columnNumber):
    """Normalize column columnNumber of self.data in place using the
    modified z-score: (value - median) / absolute standard deviation.

    Appends the column's (median, asd) pair to self.medianAndDeviation
    so later query vectors can be normalized identically.
    """
    # Extract the column's values into a flat list.
    col = [v[1][columnNumber] for v in self.data]
    # BUG FIX: the original pasted these two statements onto one line
    # ("median = self.getMedian(col)asd = ..."), a syntax error.
    median = self.getMedian(col)
    asd = self.getAbsoluteStandardDeviation(col, median)
    self.medianAndDeviation.append((median, asd))
    for v in self.data:
        v[1][columnNumber] = (v[1][columnNumber] - median) / asd
最邻近分类算法
def manhattan(vector1, vector2):
    """Return the Manhattan (L1) distance between two equal-length vectors."""
    # Removed the unused local 'total' from the original.
    distance = 0
    for i in range(len(vector1)):
        distance += abs(vector1[i] - vector2[i])
    return distance
def computeNearestNeighbor(itemName, itemVector, items):
    """Return (distance, name) pairs for every other item, nearest first."""
    # Build the distance list in one pass, skipping the item itself.
    neighbors = [(manhattan(itemVector, items[name]), name)
                 for name in items if name != itemName]
    # Smallest distance first.
    neighbors.sort()
    return neighbors
def classify(user, itemName, itemVector):
    """Predict user's rating of itemName from its nearest neighbor.

    Relies on the module-level 'items' and 'users' dictionaries.
    """
    neighbors = computeNearestNeighbor(itemName, itemVector, items)
    nearestName = neighbors[0][1]
    return users[user][nearestName]
关于标准化
“正规化”:将值的范围缩小到 0 和 1 之间
“标准化”:将特征值转换为均值为 0 的一组数,其中每个数表示偏离均值的程度(即标准偏差或绝对偏差)
# -*- coding:utf-8 -*-
'''
Created on 2018年11月27日
@author: KingSley
'''
from tkinter.tix import COLUMN
class Classifier:
    """Nearest-neighbor classifier over tab-separated training data.

    The first line of the training file labels each column: 'num' (a
    numeric feature), 'class' (the label), or 'comment' (text kept but
    ignored).  Numeric columns are normalized with the modified z-score
    (median and absolute standard deviation) so no single feature
    dominates the Manhattan distance.
    """

    def __init__(self, filename):
        # One (median, asd) pair per numeric column, in column order.
        self.medianAndDeviation = []
        # Read the training data; 'with' guarantees the file is closed
        # even if parsing below raises.
        with open(filename) as f:
            lines = f.readlines()
        self.format = lines[0].strip().split('\t')
        self.data = []
        for line in lines[1:]:
            fields = line.strip().split('\t')
            ignore = []
            vector = []
            for i in range(len(fields)):
                if self.format[i] == 'num':
                    vector.append(float(fields[i]))
                elif self.format[i] == 'comment':
                    ignore.append(fields[i])
                elif self.format[i] == 'class':
                    classification = fields[i]
            self.data.append((classification, vector, ignore))
        self.rawData = list(self.data)
        # Length of the numeric feature vector.
        self.vlen = len(self.data[0][1])
        # Normalize every numeric column in place.
        for i in range(self.vlen):
            self.normalizeColumn(i)

    def getMedian(self, alist):
        """Return the median of alist; an empty list yields []."""
        if alist == []:
            return []
        blist = sorted(alist)
        length = len(alist)
        if length % 2 == 1:
            # Odd number of elements: return the middle one.
            return blist[int(((length + 1) / 2) - 1)]
        else:
            # Even number of elements: average the two middle ones.
            v1 = blist[int(length / 2)]
            v2 = blist[(int(length / 2) - 1)]
            return (v1 + v2) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Return the mean absolute deviation of alist around median."""
        # 'total' instead of the original 'sum', which shadowed the builtin.
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)

    def normalizeColumn(self, columnNumber):
        """Normalize column columnNumber of self.data in place using the
        modified z-score: (value - median) / absolute standard deviation."""
        # Extract the column's values into a flat list.
        col = [v[1][columnNumber] for v in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        # Remember the parameters so query vectors normalize identically.
        self.medianAndDeviation.append((median, asd))
        for v in self.data:
            v[1][columnNumber] = (v[1][columnNumber] - median) / asd

    def normalizeVector(self, v):
        """Return a copy of v normalized with each column's stored
        (median, asd) parameters from training time."""
        vector = list(v)
        for i in range(len(vector)):
            (median, asd) = self.medianAndDeviation[i]
            vector[i] = (vector[i] - median) / asd
        return vector

    def manhattan(self, vector1, vector2):
        """Return the Manhattan (L1) distance between two vectors."""
        return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))

    def nearestNeighbor(self, itemVector):
        """Return (distance, item) for the training item nearest to the
        already-normalized itemVector."""
        return min([(self.manhattan(itemVector, item[1]), item)
                    for item in self.data])

    def classify(self, itemVector):
        """Predict the class of the raw (un-normalized) itemVector."""
        return self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]
def unitTest():
    """Sanity-check the Classifier against the athletes training set."""
    classifier = Classifier('athletesTrainingSet.txt')
    br = ('Basketball', [72, 162], ['Brittainey Raven'])
    nl = ('Gymnastics', [61, 76], ['Viktoria Komova'])
    cl = ("Basketball", [74, 190], ['Crystal Langhorne'])
    # Normalization must reproduce the stored (already normalized) vectors.
    br_norm = classifier.normalizeVector(br[1])
    nl_norm = classifier.normalizeVector(nl[1])
    cl_norm = classifier.normalizeVector(cl[1])
    assert br_norm == classifier.data[1][1]
    assert nl_norm == classifier.data[-1][1]
    print('normalizeVector fn OK')
    # Distance: a known value to another vector, and zero to itself.
    assert round(classifier.manhattan(cl_norm, classifier.data[1][1]), 5) == 1.16823
    assert classifier.manhattan(br_norm, classifier.data[1][1]) == 0
    assert classifier.manhattan(nl_norm, classifier.data[-1][1]) == 0
    print('Manhattan distance fn OK')
    # Every athlete in the training set should be her own nearest neighbor.
    result = classifier.nearestNeighbor(br_norm)
    assert result[1][2] == br[2]
    result = classifier.nearestNeighbor(nl_norm)
    assert result[1][2] == nl[2]
    # Crystal Langhorne's nearest neighbor is Jennifer Lacy.
    assert classifier.nearestNeighbor(cl_norm)[1][2][0] == "Jennifer Lacy"
    print("Nearest Neighbor fn OK")
    # Classification should recover each athlete's sport.
    assert classifier.classify(br[1]) == 'Basketball'
    assert classifier.classify(cl[1]) == 'Basketball'
    assert classifier.classify(nl[1]) == 'Gymnastics'
    print('Classify fn OK')
def test(training_filename, test_filename):
    """Train on training_filename, classify every row of test_filename,
    print a +/- line per row, and report the accuracy percentage."""
    classifier = Classifier(training_filename)
    with open(test_filename) as f:
        lines = f.readlines()
    numCorrect = 0.0
    for line in lines:
        fields = line.strip().split('\t')
        vector = []
        classInColumn = -1
        # Pull out the numeric features and locate the label column.
        for i, columnKind in enumerate(classifier.format):
            if columnKind == 'num':
                vector.append(float(fields[i]))
            elif columnKind == 'class':
                classInColumn = i
        theClass = classifier.classify(vector)
        if theClass == fields[classInColumn]:
            numCorrect += 1
            prefix = '+'
        else:
            prefix = '-'
        print("%s %12s %s" % (prefix, theClass, line))
    print("%4.2f%% correct" % (numCorrect * 100 / len(lines)))
# Run the classifier on each training/test dataset pair; the data files
# must exist in the current working directory.
test('athletesTrainingSet.txt', 'athletesTestSet.txt')
test("irisTrainingSet.data", "irisTestSet.data")
test("mpgTrainingSet.txt", "mpgTestSet.txt")
参考原文作者:Ron Zacharski [CC BY-NC 3.0] https://github.com/egrcc/guidetodatamining
参考原文 http://guidetodatamining.com/
参考译文来自 @egrcc 的 https://github.com/egrcc/guidetodatamining
0