数据挖掘

第四章:分类

2018-10-24  本文已影响1人  无赖宵小

特征值选取

1、选择特征值

2、对特征值相关度进行评分

3、将数据进行标准化(最常用的方法是将所有数据都转化为 0 到 1 之间的值,或使用标准分 z-score —— 分值偏离均值的程度,公式如下)

标准化

$$z = \frac{x - \bar{x}}{s}$$

其中 $\bar{x}$ 为均值,$s$ 为标准差。

4、最邻近分类算法

修正的标准分

标准分的问题在于它会受异常值的影响

修正标准分计算方法:将标准分公式中的均值改为中位数,将标准差改为绝对偏差

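修正的标准分

$$\text{修正标准分} = \frac{x - \tilde{x}}{\mathrm{asd}}, \qquad \mathrm{asd} = \frac{1}{n}\sum_{i=1}^{n}\lvert x_i - \tilde{x}\rvert$$

其中 $\tilde{x}$ 为中位数,$\mathrm{asd}$ 为绝对偏差。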

中位数指的是将所有数据进行排序,取中间的那个值。如果数据量是偶数,则取中间两个数值的均值。

def getMedian(self, alist):
    """Return the median of the values in alist.

    The input is not modified; a sorted copy is used. For an odd number
    of values the middle element is returned unchanged. For an even
    number the mean of the two middle elements is returned as a float.
    An empty list yields an empty list.
    """
    if alist == []:
        return []
    ordered = sorted(alist)
    # middle is the upper-middle index; is_odd is 1 when a single middle exists.
    middle, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[middle]
    return (ordered[middle] + ordered[middle - 1]) / 2.0

def getAbsoluteStandardDeviation(self, alist, median):
    """Return the mean absolute distance of the values in alist from median.

    Raises ZeroDivisionError when alist is empty, because the total is
    divided by len(alist).
    """
    total = 0
    # Accumulate left to right, one value at a time.
    for value in alist:
        total = total + abs(value - median)
    return total / len(alist)

def normalizeColumn(self, columnNumber):
    """Normalize column columnNumber of self.data in place.

    Each entry of self.data is indexed as v[1][columnNumber], i.e. the
    feature vector is the second element of the entry. Every value in the
    column is replaced by (value - median) / asd, where asd is the absolute
    standard deviation returned by self.getAbsoluteStandardDeviation.
    The (median, asd) pair is appended to self.medianAndDeviation.

    Raises ZeroDivisionError if asd is 0 (every value equals the median).
    """
    # Pull the whole column out so the statistics are computed once.
    col = [v[1][columnNumber] for v in self.data]
    median = self.getMedian(col)
    asd = self.getAbsoluteStandardDeviation(col, median)
    self.medianAndDeviation.append((median, asd))
    for v in self.data:
        v[1][columnNumber] = (v[1][columnNumber] - median) / asd

最邻近分类算法

def manhattan(vector1, vector2):
    """Return the Manhattan (L1) distance between two vectors.

    Adds abs(vector1[i] - vector2[i]) for every index i of vector1, so
    vector2 must be at least as long as vector1 (otherwise IndexError);
    any extra trailing elements of vector2 are ignored.
    """
    distance = 0
    for i, value in enumerate(vector1):
        distance += abs(value - vector2[i])
    return distance

def computeNearestNeighbor(itemName, itemVector, items):
    """Return (distance, name) pairs for every other item, nearest first.

    items maps an item name to its vector. The entry whose key equals
    itemName is skipped. Distances come from manhattan(); pairs are sorted
    ascending, so ties on distance are ordered by name.
    """
    ranked = sorted(
        (manhattan(itemVector, items[candidate]), candidate)
        for candidate in items
        if candidate != itemName
    )
    return ranked

def classify(user, itemName, itemVector):
    """Return the rating user gave to the item nearest to itemName.

    Relies on the module-level names items (passed on to
    computeNearestNeighbor) and users (indexed as users[user][item]);
    neither is defined in this snippet.
    """
    neighbors = computeNearestNeighbor(itemName, itemVector, items)
    # Each entry is a (distance, name) pair; the first one is the closest.
    _, closest = neighbors[0]
    return users[user][closest]

关于标准化

“正规化”:将值的范围缩小到 0 和 1 之间

正规化

$$x' = \frac{x - \min}{\max - \min}$$

“标准化”:将特征值转换为均值为 0 的一组数,其中每个数表示偏离均值的程度(即标准偏差或绝对偏差)


# -*- coding:utf-8 -*-

'''
Created on 2018年11月27日

@author: KingSley
'''
from tkinter.tix import COLUMN

class Classifier:
    """Nearest-neighbor classifier trained from a tab-separated file.

    The first line of the training file lists the role of each column,
    separated by tabs: 'num' (numeric feature), 'class' (the label) or
    'comment' (kept with the instance but not used for distances). Columns
    with any other role are skipped. Each following line is one instance.

    Attributes:
        format: list of column roles read from the first line.
        data: list of (classification, vector, ignore) tuples whose vectors
            hold the normalized numeric features.
        rawData: same tuples, but with the vectors as read from the file.
        medianAndDeviation: one (median, absolute standard deviation) pair
            per numeric column, in column order.
        vlen: number of numeric features per instance.
    """

    def __init__(self, filename):
        """Load the training set from filename and normalize every column.

        Raises IndexError if the file is empty or has no instance lines,
        and ValueError if a 'num' field cannot be parsed as a float.
        """
        self.medianAndDeviation = []
        # The context manager closes the file even if reading fails.
        with open(filename) as f:
            lines = f.readlines()
        self.format = lines[0].strip().split('\t')
        self.data = []
        for line in lines[1:]:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. at the end of the file) is not an instance.
                continue
            fields = stripped.split('\t')
            ignore = []
            vector = []
            # Reset per row so a row without a class column cannot inherit
            # the label of the previous row.
            classification = None
            for i, field in enumerate(fields):
                role = self.format[i]
                if role == 'num':
                    vector.append(float(field))
                elif role == 'comment':
                    ignore.append(field)
                elif role == 'class':
                    classification = field
            self.data.append((classification, vector, ignore))
        # Copy each vector: normalizeColumn rewrites the vectors in self.data
        # in place, and rawData must keep the values as read from the file.
        self.rawData = [(label, list(vector), ignore)
                        for (label, vector, ignore) in self.data]
        # Length of an instance vector, taken from the first instance.
        self.vlen = len(self.data[0][1])
        # Normalize every numeric column.
        for i in range(self.vlen):
            self.normalizeColumn(i)

    def getMedian(self, alist):
        """Return the median of alist, or [] when alist is empty.

        For an even number of values the mean of the two middle values is
        returned as a float. alist itself is left unsorted.
        """
        if len(alist) == 0:
            return []
        blist = sorted(alist)
        middle, is_odd = divmod(len(blist), 2)
        if is_odd:
            # Odd count: the single middle element.
            return blist[middle]
        # Even count: mean of the two middle elements.
        return (blist[middle] + blist[middle - 1]) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Return the mean absolute distance of alist's values from median.

        Raises ZeroDivisionError when alist is empty.
        """
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)

    def normalizeColumn(self, columnNumber):
        """Normalize column columnNumber of self.data in place.

        Every value becomes (value - median) / asd, and the (median, asd)
        pair is appended to self.medianAndDeviation so that new vectors can
        be normalized the same way later.

        Raises ZeroDivisionError if asd is 0 (every value equals the median).
        """
        # Pull the whole column out so the statistics are computed once.
        col = [v[1][columnNumber] for v in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        self.medianAndDeviation.append((median, asd))
        for v in self.data:
            v[1][columnNumber] = (v[1][columnNumber] - median) / asd

    def normalizeVector(self, v):
        """Return a normalized copy of v using the stored column statistics.

        v is not modified. Element i is normalized with
        self.medianAndDeviation[i].
        """
        vector = list(v)
        for i, value in enumerate(vector):
            (median, asd) = self.medianAndDeviation[i]
            vector[i] = (value - median) / asd
        return vector

    def manhattan(self, vector1, vector2):
        """Return the Manhattan distance between vector1 and vector2.

        Elements are paired positionally; pairing stops at the shorter
        vector.
        """
        return sum(abs(v1 - v2) for v1, v2 in zip(vector1, vector2))

    def nearestNeighbor(self, itemVector):
        """Return (distance, instance) for the instance closest to itemVector.

        itemVector is compared as given, so it must already be normalized.
        Ties on distance are broken by comparing the instance tuples.
        """
        return min((self.manhattan(itemVector, item[1]), item)
                   for item in self.data)

    def classify(self, itemVector):
        """Return the class of the instance nearest to the raw itemVector."""
        return self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]
    

def unitTest():
    """Sanity-check Classifier against 'athletesTrainingSet.txt'.

    Runs four groups of assertions in order (vector normalization,
    Manhattan distance, nearest neighbor, classification) and prints an
    OK line after each group. Any failed check raises AssertionError.
    """
    model = Classifier('athletesTrainingSet.txt')
    # Reference instances laid out as (class, feature vector, [name]).
    raven = ('Basketball', [72, 162], ['Brittainey Raven'])
    komova = ('Gymnastics', [61, 76], ['Viktoria Komova'])
    langhorne = ("Basketball", [74, 190], ['Crystal Langhorne'])

    # Normalizing a raw vector must reproduce the stored normalized rows.
    ravenNorm = model.normalizeVector(raven[1])
    komovaNorm = model.normalizeVector(komova[1])
    langhorneNorm = model.normalizeVector(langhorne[1])
    assert ravenNorm == model.data[1][1]
    assert komovaNorm == model.data[-1][1]
    print('normalizeVector fn OK')

    # Distances: one known value, and zero from an instance to itself.
    toSecondRow = model.manhattan(langhorneNorm, model.data[1][1])
    assert round(toSecondRow, 5) == 1.16823
    assert model.manhattan(ravenNorm, model.data[1][1]) == 0
    assert model.manhattan(komovaNorm, model.data[-1][1]) == 0
    print('Manhattan distance fn OK')

    # Brittainey Raven's nearest neighbor should be herself.
    closest = model.nearestNeighbor(ravenNorm)
    assert closest[1][2] == raven[2]
    # Viktoria Komova's nearest neighbor should be herself.
    closest = model.nearestNeighbor(komovaNorm)
    assert closest[1][2] == komova[2]
    # Crystal Langhorne's nearest neighbor is Jennifer Lacy.
    assert model.nearestNeighbor(langhorneNorm)[1][2][0] == "Jennifer Lacy"
    print("Nearest Neighbor fn OK")

    # classify() takes raw vectors and must name the right sport.
    assert model.classify(raven[1]) == 'Basketball'
    assert model.classify(langhorne[1]) == 'Basketball'
    assert model.classify(komova[1]) == 'Gymnastics'
    print('Classify fn OK')
    
def test(training_filename, test_filename):
    """Train on training_filename and report accuracy on test_filename.

    The test file uses the same tab-separated column layout as the training
    file but has no header line. For every instance one line is printed:
    '+' or '-' (correct or wrong), the predicted class, and the original
    line. The line is printed with its own trailing newline, so rows are
    separated by a blank line. Finally the percentage of correct
    predictions is printed. Blank lines in the test file are skipped.

    Returns None; all results go to standard output.
    """
    classifier = Classifier(training_filename)
    # The context manager closes the file even if reading fails.
    with open(test_filename) as f:
        lines = f.readlines()

    # The column layout depends only on the format, so resolve it once.
    numericColumns = []
    classInColumn = -1
    for i, role in enumerate(classifier.format):
        if role == 'num':
            numericColumns.append(i)
        elif role == 'class':
            classInColumn = i

    numCorrect = 0.0
    numTested = 0
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        data = stripped.split('\t')
        vector = [float(data[i]) for i in numericColumns]
        theClass = classifier.classify(vector)
        numTested += 1
        if theClass == data[classInColumn]:
            numCorrect += 1
            prefix = '+'
        else:
            prefix = '-'
        print("%s  %12s  %s" % (prefix, theClass, line))
    # An empty test set reports 0% instead of dividing by zero.
    accuracy = numCorrect * 100 / numTested if numTested else 0.0
    print("%4.2f%% correct" % accuracy)
        

# Evaluate the classifier on each (training set, test set) pair in turn.
for _training_file, _test_file in (
    ('athletesTrainingSet.txt', 'athletesTestSet.txt'),
    ("irisTrainingSet.data", "irisTestSet.data"),
    ("mpgTrainingSet.txt", "mpgTestSet.txt"),
):
    test(_training_file, _test_file)

参考原文作者:Ron Zacharski(CC BY-NC 3.0)https://github.com/egrcc/guidetodatamining

参考原文 http://guidetodatamining.com/

参考译文来自 @egrcc https://github.com/egrcc/guidetodatamining
0

上一篇 下一篇

猜你喜欢

热点阅读