Reinventing the Wheel: CART Regression and Pruning

2017-05-18  Alistair
#ID3 is mainly used for classification; here is a CART regression example
from numpy import *

#split the samples on a continuous feature and return the two halves
def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]
    return mat0, mat1
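
#As a quick sanity check (a toy example of mine, not from the original post):
#splitting the 4x4 identity matrix on feature 1 at 0.5 should send the single
#row with a 1 in that column to mat0 and the other three rows to mat1.
testMat = mat(eye(4))
mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
print(mat0) #[[0. 1. 0. 0.]] -- the row where column 1 > 0.5
print(mat1) #the remaining three rows, where column 1 <= 0.5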

def regLeaf(dataSet): #leaf value: the mean of y in this node
    return mean(dataSet[:,-1])

def regErr(dataSet): #total squared error: variance * number of samples
    return var(dataSet[:,-1]) * shape(dataSet)[0]
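
#Side note (my own check, not from the original post): var * m is exactly the
#total squared error around the mean, i.e. the error a constant leaf would incur.
y = mat([[1.0], [2.0], [3.0]])
print(regErr(y)) #2.0
print(sum(power(y[:,-1] - mean(y[:,-1]), 2))) #also 2.0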

#try every feature/value split and return the best feature-value pair; ops presets the stopping thresholds: tolS is the minimum error reduction a split must achieve,
#tolN is the minimum sample count per branch; refusing to split below these thresholds is pre-pruning
def chooseBestSplit(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
    tolS = ops[0]; tolN = ops[1]
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1: #stop if all y values are identical: nothing to split
        return None, leafType(dataSet) #feature None marks a leaf; leafType(dataSet) is its value
    m,n = shape(dataSet)
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0 #initialize the best error to infinity and the best feature/value to 0
    for featIndex in range(n-1):
        for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]): #each distinct value of this feature; the matrix type isn't hashable, so convert via .A to an array and then to a list
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue #skip splits that leave too few samples on either side
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    if (S - bestS) < tolS: #the best split doesn't reduce the error enough: make a leaf
        return None, leafType(dataSet)
    mat0,mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): #a branch would be too small: make a leaf
        return None,leafType(dataSet)
    return bestIndex, bestValue

#grow the tree recursively; the left branch holds the samples where the split test is true (feature > value), the right branch the rest
def createTree(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None: return val #no worthwhile split: return a leaf value
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
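
#A minimal usage sketch on synthetic piecewise-constant data (data and variable
#names below are mine, for illustration). With tolS = 0.1 the tree should stop
#after the single split near x = 0.5.
random.seed(0)
x = random.rand(200, 1)
y = (x > 0.5) * 1.0 + random.randn(200, 1) * 0.05
myDat = mat(hstack((x, y)))
myTree = createTree(myDat, ops = (0.1, 10))
print(myTree) #expect spInd 0, spVal near 0.5, left leaf near 1.0, right near 0.0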


#post-pruning
def isTree(obj):
    return (type(obj).__name__ == 'dict')

def getMean(tree): #collapse a subtree to a single value by recursively averaging its leaves
    if isTree(tree['right']):
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):
        tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right']) / 2.0

def prune(tree, testData):
    if shape(testData)[0] == 0: return getMean(tree) #no test data reaches this node: collapse the subtree to its mean
    if (isTree(tree['right']) or isTree(tree['left'])):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)
    if not isTree(tree['left']) and not isTree(tree['right']): #both children are leaves: check whether merging them lowers the test error
        lSet, rSet =  binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'], 2)) + \
                          sum(power(rSet[:,-1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:,-1] - treeMean, 2)) #power squares element-wise; compare the test error with and without the merge
        if errorMerge < errorNoMerge:
            print('merging')
            return treeMean
        else:return tree
    else:return tree
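
#A demo of post-pruning (again my own illustrative data): grow a deliberately
#overfit tree with tolS = 0 on one noisy sample, then prune it against an
#independent test sample; splits that don't help on the test data get merged.
def noisySample(n):
    x = random.rand(n, 1)
    y = (x > 0.5) * 1.0 + random.randn(n, 1) * 0.5
    return mat(hstack((x, y)))

random.seed(1)
trainMat = noisySample(200)
testMat = noisySample(100)
bigTree = createTree(trainMat, ops = (0, 1)) #no pre-pruning: badly overfit
prunedTree = prune(bigTree, testMat) #prints 'merging' as branches collapse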

#model tree: each leaf stores a linear model of the features instead of a constant
def linearSolve(dataSet):
    m, n = shape(dataSet)
    X = mat(ones((m,n))); Y = mat(ones((m,1)))
    X[:,1:n] = dataSet[:,0:n-1] #column 0 of X stays all ones for the intercept term
    Y = dataSet[:,-1]
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('singular matrix, cannot invert; try increasing tolN')
    ws = xTx.I * (X.T * Y) #solve the normal equations for the least-squares weights
    return ws, X, Y

def modelLeaf(dataSet): #leaf value for a model tree: the fitted weight vector
    ws, X, Y = linearSolve(dataSet)
    return ws

def modelErr(dataSet): #split error for a model tree: the sum of squared residuals of the linear fit
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat, 2))
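
#A model-tree sketch (illustrative data of mine): two linear segments meeting
#near x = 0.5; each leaf stores the weight vector ws from linearSolve instead
#of a constant.
random.seed(2)
x = random.rand(200, 1)
y = where(x < 0.5, 1.0 + 2.0 * x, 4.0 - 3.0 * x) + random.randn(200, 1) * 0.05
modelDat = mat(hstack((x, y)))
modelTree = createTree(modelDat, modelLeaf, modelErr, ops = (1, 10))
print(modelTree) #one split near 0.5; each leaf ws is a 2x1 [intercept, slope]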