论文算法

k-Means++/FCM/凝聚层次聚类/DBSCAN

2016-05-21  本文已影响1049人  胡哈哈哈

参考自初识聚类算法:K均值、凝聚层次聚类和DBSCAN模糊聚类FCM算法

聚类的目的

将数据划分为若干个簇,簇内相似性大,簇间相似性小,聚类效果好。用于从数据中提取信息和规律。

聚类的概念

簇的分类

常用的聚类分析算法

聚类算法的基本思想

(1) 基本k均值聚类(hard c-means, HCM)

方法很简单,首先给出初始的几个簇中心。将所有元素按照到簇中心最近的归属原则,归属到各个簇。然后对各个簇求解新的簇中心(元素集合质心)。重复上述步骤直到质心不再明显变化后,即完成聚类。

采用何种距离可按照数据性质或项目要求。距离的分类可以参考A-star算法概述及其在游戏开发中的应用分析中提到的曼哈顿距离、对角线距离、欧几里得距离等。实际上相当于求解一个全局状态函数的最小值问题,状态函数是各个元素到最近簇中心的距离之和。

该算法的特点有如下几点:

k-Means++

python代码

此代码使用的是k-means++算法,采用约定的方法使得到的初始聚类中心能够在后面的迭代过程中收敛到最优解。

import math
import collections
import random
import copy
import pylab

try:
    import psyco
    psyco.full()
except ImportError:
    pass

FLOAT_MAX = 1e100

class Point:
    __slots__ = ["x", "y", "group"]
    def __init__(self, x = 0, y = 0, group = 0):
        self.x, self.y, self.group = x, y, group

def generatePoints(pointsNumber, radius):
    points = [Point() for _ in xrange(pointsNumber)]
    for point in points:
        r = random.random() * radius
        angle = random.random() * 2 * math.pi
        point.x = r * math.cos(angle)
        point.y = r * math.sin(angle)
    return points

def solveDistanceBetweenPoints(pointA, pointB):
    return (pointA.x - pointB.x) * (pointA.x - pointB.x) + (pointA.y - pointB.y) * (pointA.y - pointB.y)

def getNearestCenter(point, clusterCenterGroup):
    minIndex = point.group
    minDistance = FLOAT_MAX
    for index, center in enumerate(clusterCenterGroup):
        distance = solveDistanceBetweenPoints(point, center)
        if (distance < minDistance):
            minDistance = distance
            minIndex = index
    return (minIndex, minDistance)

def kMeansPlusPlus(points, clusterCenterGroup):
    clusterCenterGroup[0] = copy.copy(random.choice(points))
    distanceGroup = [0.0 for _ in xrange(len(points))]
    sum = 0.0
    for index in xrange(1, len(clusterCenterGroup)):
        for i, point in enumerate(points):
            distanceGroup[i] = getNearestCenter(point, clusterCenterGroup[:index])[1]
            sum += distanceGroup[i]
        sum *= random.random()
        for i, distance in enumerate(distanceGroup):
            sum -= distance;
            if sum < 0:
                clusterCenterGroup[index] = copy.copy(points[i])
                break
    for point in points:
        point.group = getNearestCenter(point, clusterCenterGroup)[0]
    return

def kMeans(points, clusterCenterNumber):
    clusterCenterGroup = [Point() for _ in xrange(clusterCenterNumber)]
    kMeansPlusPlus(points, clusterCenterGroup)
    clusterCenterTrace = [[clusterCenter] for clusterCenter in clusterCenterGroup]
    tolerableError, currentError = 5.0, FLOAT_MAX
    count = 0
    while currentError >= tolerableError:
        count += 1
        countCenterNumber = [0 for _ in xrange(clusterCenterNumber)]
        currentCenterGroup = [Point() for _ in xrange(clusterCenterNumber)]
        for point in points:
            currentCenterGroup[point.group].x += point.x
            currentCenterGroup[point.group].y += point.y
            countCenterNumber[point.group] += 1
        for index, center in enumerate(currentCenterGroup):
            center.x /= countCenterNumber[index]
            center.y /= countCenterNumber[index]
        currentError = 0.0
        for index, singleTrace in enumerate(clusterCenterTrace):
            singleTrace.append(currentCenterGroup[index])
            currentError += solveDistanceBetweenPoints(singleTrace[-1], singleTrace[-2])
            clusterCenterGroup[index] = copy.copy(currentCenterGroup[index])
        for point in points:
            point.group = getNearestCenter(point, clusterCenterGroup)[0]
    return clusterCenterGroup, clusterCenterTrace

def showClusterAnalysisResults(points, clusterCenterTrace):
    colorStore = ['or', 'og', 'ob', 'oc', 'om', 'oy', 'ok']
    pylab.figure(figsize=(9, 9), dpi = 80)
    for point in points:
        color = ''
        if point.group >= len(colorStore):
            color = colorStore[-1]
        else:
            color = colorStore[point.group]
        pylab.plot(point.x, point.y, color)
    for singleTrace in clusterCenterTrace:
        pylab.plot([center.x for center in singleTrace], [center.y for center in singleTrace], 'k')
    pylab.show()

def main():
    clusterCenterNumber = 5
    pointsNumber = 2000
    radius = 10
    points = generatePoints(pointsNumber, radius)
    _, clusterCenterTrace = kMeans(points, clusterCenterNumber)
    showClusterAnalysisResults(points, clusterCenterTrace)

main()

(1)Extra 基于模糊数学的c均值聚类(FCM)

模糊c均值聚类(fuzzy c-means clustering)与硬划分k均值聚类相同,都是一种基于划分的聚类分析方法,但FCM是HCM的自然进阶版。与k均值聚类不同的是,模糊c均值聚类的点按照不同的隶属度ui隶属于不同的聚类中心vi,聚类的过程类似k均值聚类。(详见:模糊聚类FCM算法)

聚类步骤:

FCM

python代码

import math
import collections
import random
import copy
import pylab

try:
    import psyco
    psyco.full()
except ImportError:
    pass

FLOAT_MAX = 1e100

class Point:
    __slots__ = ["x", "y", "group", "membership"]
    def __init__(self, clusterCenterNumber, x = 0, y = 0, group = 0):
        self.x, self.y, self.group = x, y, group
        self.membership = [0.0 for _ in xrange(clusterCenterNumber)]

def generatePoints(pointsNumber, radius, clusterCenterNumber):
    points = [Point(clusterCenterNumber) for _ in xrange(2 * pointsNumber)]
    count = 0
    for point in points:
        count += 1
        r = random.random() * radius
        angle = random.random() * 2 * math.pi
        point.x = r * math.cos(angle)
        point.y = r * math.sin(angle)
        if count == pointsNumber - 1:
            break
    for index in xrange(pointsNumber, 2 * pointsNumber):
        points[index].x = 2 * radius * random.random() - radius
        points[index].y = 2 * radius * random.random() - radius
    return points
    

def solveDistanceBetweenPoints(pointA, pointB):
    return (pointA.x - pointB.x) * (pointA.x - pointB.x) + (pointA.y - pointB.y) * (pointA.y - pointB.y)

def getNearestCenter(point, clusterCenterGroup):
    minIndex = point.group
    minDistance = FLOAT_MAX
    for index, center in enumerate(clusterCenterGroup):
        distance = solveDistanceBetweenPoints(point, center)
        if (distance < minDistance):
            minDistance = distance
            minIndex = index
    return (minIndex, minDistance)

def kMeansPlusPlus(points, clusterCenterGroup):
    clusterCenterGroup[0] = copy.copy(random.choice(points))
    distanceGroup = [0.0 for _ in xrange(len(points))]
    sum = 0.0
    for index in xrange(1, len(clusterCenterGroup)):
        for i, point in enumerate(points):
            distanceGroup[i] = getNearestCenter(point, clusterCenterGroup[:index])[1]
            sum += distanceGroup[i]
        sum *= random.random()
        for i, distance in enumerate(distanceGroup):
            sum -= distance;
            if sum < 0:
                clusterCenterGroup[index] = copy.copy(points[i])
                break
    return

def fuzzyCMeansClustering(points, clusterCenterNumber, weight):
    clusterCenterGroup = [Point(clusterCenterNumber) for _ in xrange(clusterCenterNumber)]
    kMeansPlusPlus(points, clusterCenterGroup)
    clusterCenterTrace = [[clusterCenter] for clusterCenter in clusterCenterGroup]
    tolerableError, currentError = 1.0, FLOAT_MAX
    while currentError >= tolerableError:
        for point in points:
            getSingleMembership(point, clusterCenterGroup, weight)
        currentCenterGroup = [Point(clusterCenterNumber) for _ in xrange(clusterCenterNumber)]
        for centerIndex, center in enumerate(currentCenterGroup):
            upperSumX, upperSumY, lowerSum = 0.0, 0.0, 0.0
            for point in points:
                membershipWeight = pow(point.membership[centerIndex], weight)
                upperSumX += point.x * membershipWeight
                upperSumY += point.y * membershipWeight
                lowerSum += membershipWeight
            center.x = upperSumX / lowerSum
            center.y = upperSumY / lowerSum
        # update cluster center trace
        currentError = 0.0
        for index, singleTrace in enumerate(clusterCenterTrace):
            singleTrace.append(currentCenterGroup[index])
            currentError += solveDistanceBetweenPoints(singleTrace[-1], singleTrace[-2])
            clusterCenterGroup[index] = copy.copy(currentCenterGroup[index])
    for point in points:
        maxIndex, maxMembership = 0, 0.0
        for index, singleMembership in enumerate(point.membership):
            if singleMembership > maxMembership:
                maxMembership = singleMembership
                maxIndex = index
        point.group = maxIndex
    return clusterCenterGroup, clusterCenterTrace

def getSingleMembership(point, clusterCenterGroup, weight):
    distanceFromPoint2ClusterCenterGroup = [solveDistanceBetweenPoints(point, clusterCenterGroup[index]) for index in xrange(len(clusterCenterGroup))]
    for centerIndex, singleMembership in enumerate(point.membership):
        sum = 0.0
        isCoincide = [False, 0]
        for index, distance in enumerate(distanceFromPoint2ClusterCenterGroup):
            if distance == 0:
                isCoincide[0] = True
                isCoincide[1] = index
                break
            sum += pow(float(distanceFromPoint2ClusterCenterGroup[centerIndex] / distance), 1.0 / (weight - 1.0))
        if isCoincide[0]:
            if isCoincide[1] == centerIndex:
                point.membership[centerIndex] = 1.0
            else:
                point.membership[centerIndex] = 0.0
        else:
            point.membership[centerIndex] = 1.0 / sum

def showClusterAnalysisResults(points, clusterCenterTrace):
    colorStore = ['or', 'og', 'ob', 'oc', 'om', 'oy', 'ok']
    pylab.figure(figsize=(9, 9), dpi = 80)
    for point in points:
        color = ''
        if point.group >= len(colorStore):
            color = colorStore[-1]
        else:
            color = colorStore[point.group]
        pylab.plot(point.x, point.y, color)
    for singleTrace in clusterCenterTrace:
        pylab.plot([center.x for center in singleTrace], [center.y for center in singleTrace], 'k')
    pylab.show()

def main():
    clusterCenterNumber = 5
    pointsNumber = 2000
    radius = 10
    weight = 2
    points = generatePoints(pointsNumber, radius, clusterCenterNumber)
    _, clusterCenterTrace = fuzzyCMeansClustering(points, clusterCenterNumber, weight)
    showClusterAnalysisResults(points, clusterCenterTrace)

main()

该算法的特点有如下几点:

(2) 凝聚层次聚类

初始状态各个元素各自为簇,每次合并簇间距离最小的簇。直到簇个数满足要求或合并超过90%。类似huffman树算法和查并集。上述距离的定义也有几种分类:包括簇间元素的最小距离,簇间元素的最大距离,和簇质心距离。

该算法的特点有如下几点:

凝聚层次聚类

python代码

import math
import collections
import random
import copy
import pylab

try:
    import psyco
    psyco.full()
except ImportError:
    pass

FLOAT_MAX = 1e100

class Point:
    __slots__ = ["x", "y", "group"]
    def __init__(self, x = 0, y = 0, group = 0):
        self.x, self.y, self.group = x, y, group

def generatePoints(pointsNumber, radius):
    points = [Point() for _ in xrange(4 * pointsNumber)]
    originX = [-radius, -radius, radius, radius]
    originY = [-radius, radius, -radius, radius]
    count = 0
    countCenter = 0
    for index, point in enumerate(points):
        count += 1
        r = random.random() * radius
        angle = random.random() * 2 * math.pi
        point.x = r * math.cos(angle) + originX[countCenter]
        point.y = r * math.sin(angle) + originY[countCenter]
        point.group = index
        if count >= pointsNumber * (countCenter + 1):
            countCenter += 1    
    return points

def solveDistanceBetweenPoints(pointA, pointB):
    return (pointA.x - pointB.x) * (pointA.x - pointB.x) + (pointA.y - pointB.y) * (pointA.y - pointB.y)

def getDistanceMap(points):
    distanceMap = {}
    for i in xrange(len(points)):
        for j in xrange(i + 1, len(points)):
            distanceMap[str(i) + '#' + str(j)] = solveDistanceBetweenPoints(points[i], points[j])
    distanceMap = sorted(distanceMap.iteritems(), key=lambda dist:dist[1], reverse=False)
    return distanceMap

def agglomerativeHierarchicalClustering(points, distanceMap, mergeRatio, clusterCenterNumber):
    unsortedGroup = {index: 1 for index in xrange(len(points))}
    for key, _ in distanceMap:
        lowIndex, highIndex = int(key.split('#')[0]), int(key.split('#')[1])
        if points[lowIndex].group != points[highIndex].group:
            lowGroupIndex = points[lowIndex].group
            highGroupIndex = points[highIndex].group
            unsortedGroup[lowGroupIndex] += unsortedGroup[highGroupIndex]
            del unsortedGroup[highGroupIndex]
            for point in points:
                if point.group == highGroupIndex:
                    point.group = lowGroupIndex
        if len(unsortedGroup) <= int(len(points) * mergeRatio):
            break
    sortedGroup = sorted(unsortedGroup.iteritems(), key=lambda group: group[1], reverse=True)
    topClusterCenterCount = 0
    print sortedGroup, len(sortedGroup)
    for key, _ in sortedGroup:
        topClusterCenterCount += 1
        for point in points:
            if point.group == key:
                point.group = -1 * topClusterCenterCount
        if topClusterCenterCount >= clusterCenterNumber:
            break
    return points


def showClusterAnalysisResults(points):
    colorStore = ['or', 'og', 'ob', 'oc', 'om', 'oy', 'ok']
    pylab.figure(figsize=(9, 9), dpi = 80)
    for point in points:
        color = ''
        if point.group < 0:
            color = colorStore[-1 * point.group - 1]
        else:
            color = colorStore[-1]
        pylab.plot(point.x, point.y, color)
    pylab.show()

def main():
    clusterCenterNumber = 4
    pointsNumber = 500
    radius = 10
    mergeRatio = 0.025
    points = generatePoints(pointsNumber, radius)
    distanceMap = getDistanceMap(points)
    points = agglomerativeHierarchicalClustering(points, distanceMap, mergeRatio, clusterCenterNumber)
    showClusterAnalysisResults(points)

main()

(3) DBscan

DBscan是一种基于密度的聚类算法。因此首先应定义密度的概念。密度是以一个点为中心2*EPs边长的正方形区域内点的个数。并将不同密度的点划归为不同类型的点:

具体操作:

该算法的特点有如下几点:

dbscan

python代码

import math
import collections
import random
import copy
import pylab

try:
    import psyco
    psyco.full()
except ImportError:
    pass

FLOAT_MAX = 1e100

CORE_POINT_TYPE = -2
BOUNDARY_POINT_TYPE = 1 #ALL NONE-NEGATIVE INTEGERS CAN BE BOUNDARY POINT TYPE
OTHER_POINT_TYPE = -1

class Point:
    __slots__ = ["x", "y", "group", "pointType"]
    def __init__(self, x = 0, y = 0, group = 0, pointType = -1):
        self.x, self.y, self.group, self.pointType = x, y, group, pointType

def generatePoints(pointsNumber, radius):
    points = [Point() for _ in xrange(4 * pointsNumber)]
    originX = [-radius, -radius, radius, radius]
    originY = [-radius, radius, -radius, radius]
    count = 0
    countCenter = 0
    for index, point in enumerate(points):
        count += 1
        r = random.random() * radius
        angle = random.random() * 2 * math.pi
        point.x = r * math.cos(angle) + originX[countCenter]
        point.y = r * math.sin(angle) + originY[countCenter]
        point.group = index
        if count >= pointsNumber * (countCenter + 1):
            countCenter += 1    
    return points

def solveDistanceBetweenPoints(pointA, pointB):
    return (pointA.x - pointB.x) * (pointA.x - pointB.x) + (pointA.y - pointB.y) * (pointA.y - pointB.y)

def isInPointBoundary(centerPoint, customPoint, halfScale):
    return customPoint.x <= centerPoint.x + halfScale and customPoint.x >= centerPoint.x - halfScale and customPoint.y <= centerPoint.y + halfScale and customPoint.y >= centerPoint.y - halfScale

def getPointsNumberWithinBoundary(points, halfScale):
    pointsIndexGroupWithinBoundary = [[] for _ in xrange(len(points))]
    for centerIndex, centerPoint in enumerate(points):
        for index, customPoint in enumerate(points):
            if centerIndex != index and isInPointBoundary(centerPoint, customPoint, halfScale):
                pointsIndexGroupWithinBoundary[centerIndex].append(index)
    return pointsIndexGroupWithinBoundary

def decidePointsType(points, pointsIndexGroupWithinBoundary, minPointsNumber):
    for index, customPointsGroup in enumerate(pointsIndexGroupWithinBoundary):
        if len(customPointsGroup) >= minPointsNumber:
            points[index].pointType = CORE_POINT_TYPE
    for index, customPointsGroup in enumerate(pointsIndexGroupWithinBoundary):
        if len(customPointsGroup) < minPointsNumber:
            for customPointIndex in customPointsGroup:
                if points[customPointIndex].pointType == CORE_POINT_TYPE:
                    points[index].pointType = customPointIndex

def mergeGroup(points, fromIndex, toIndex):
    for point in points:
        if point.group == fromIndex:
            point.group = toIndex

def dbscan(points, pointsIndexGroupWithinBoundary, clusterCenterNumber):
    countGroupsNumber = {index: 1 for index in xrange(len(points))}
    for index, point in enumerate(points):
        if point.pointType == CORE_POINT_TYPE:
            for customPointIndex in pointsIndexGroupWithinBoundary[index]:
                if points[customPointIndex].pointType == CORE_POINT_TYPE and points[customPointIndex].group != point.group:
                    countGroupsNumber[point.group] += countGroupsNumber[points[customPointIndex].group]
                    del countGroupsNumber[points[customPointIndex].group]
                    mergeGroup(points, points[customPointIndex].group, point.group)
        #point.pointType >= 0 means it is BOUNDARY_POINT_TYPE
        elif point.pointType >= 0:
            corePointGroupIndex = points[point.pointType].group
            countGroupsNumber[corePointGroupIndex] += countGroupsNumber[point.group]
            del countGroupsNumber[point.group]
            point.group = corePointGroupIndex
    countGroupsNumber = sorted(countGroupsNumber.iteritems(), key=lambda group: group[1], reverse=True)
    count = 0
    for key, _ in countGroupsNumber:
        count += 1
        for point in points:
            if point.group == key:
                point.group = -1 * count
        if count >= clusterCenterNumber:
            break

def showClusterAnalysisResults(points):
    colorStore = ['or', 'og', 'ob', 'oc', 'om', 'oy', 'ok']
    pylab.figure(figsize=(9, 9), dpi = 80)
    for point in points:
        color = ''
        if point.group < 0:
            color = colorStore[-1 * point.group - 1]
        else:
            color = colorStore[-1]
        pylab.plot(point.x, point.y, color)
    pylab.show()

def main():
    clusterCenterNumber = 4
    pointsNumber = 500
    radius = 10
    Eps = 2
    minPointsNumber = 18
    points = generatePoints(pointsNumber, radius)
    pointsIndexGroupWithinBoundary = getPointsNumberWithinBoundary(points, Eps)
    decidePointsType(points, pointsIndexGroupWithinBoundary, minPointsNumber)
    dbscan(points, pointsIndexGroupWithinBoundary, clusterCenterNumber)
    showClusterAnalysisResults(points)

main()

后记

在学习和分析过程中发现几点待解决的问题:

上一篇下一篇

猜你喜欢

热点阅读