机器学习

机器学习实战第3章-决策树

2017-03-08  本文已影响66人  异想派

1、trees.py

import sys 
reload(sys) 
sys.setdefaultencoding("utf-8") 
import operator

from math import log
def calcshannonent(dataset):
    numentries=len(dataset)
    labelcounts={}
    for featvec in dataset:
        currentlabel=featvec[-1]
        if currentlabel not in labelcounts.keys():
            labelcounts[currentlabel]=0
        labelcounts[currentlabel]+=1
        shannonent=0
    for key in labelcounts:
        prob=float(labelcounts[key])/numentries
        shannonent-=prob*log(prob,2)
    return shannonent    

def createdataset():
    dataset=[[1,1,'yes'],[1,1,'yes'],[1,0,'no'],[0,1,'no'],[0,1,'no']]
    labels=['no surfacing','flippers']
    return dataset,labels

def splitdataset(dataset,axis,value):
    retdataset=[]
    for featvec in dataset:
        if featvec[axis]==value:
            reducedfeatvec=featvec[:axis]
            reducedfeatvec.extend(featvec[axis+1:])
            retdataset.append(reducedfeatvec)
    return retdataset

def choosebestfeaturetosplit(dataset):
    numberfeature=len(dataset[0])-1
    baseentropy=calcshannonent(dataset)
    bestinfogain=0.0;bestfeature=-1
    for i in range(numberfeature):
        featlist=[example[i] for example in dataset]
        uniquevals=set(featlist)
        newentropy=0.0
        for value in uniquevals:
            subdataset=splitdataset(dataset,i,value)
            prob=len(subdataset)/float(len(dataset))
            newentropy+=prob*calcshannonent(subdataset)
        infogain=baseentropy-newentropy
        if infogain>bestinfogain:
            bestinfogain=infogain
            bestfeature=i
    return bestfeature

def majoritycnt(classlist):
    classcount={}
    for vote in classlist:
        if vote not in classcount.keys:
            classcount[vote]=0
        classcount[vote]+=1
    sortedclasscount=sorted(classcount.iteritems(),\
        key=operator.itemgetter(1),reverse=True)
    return sortedclasscount[0][0]

def createtree(dataset,labels):
    classlist=[example[-1] for example in dataset]
    if classlist.count(classlist[0])==len(classlist):
        return classlist[0]
    if len(dataset[0])==1:
        return majoritycnt
    bestfeat=choosebestfeaturetosplit(dataset)
    bestfeaturelabel=labels[bestfeat]
    mytree={bestfeaturelabel:{}}
    del (labels[bestfeat])
    featvalues=[example[bestfeat] for example in dataset]
    uniquevals=set(featvalues)
    for value in uniquevals:
        sublabels=labels[:]
        mytree[bestfeaturelabel][value]=createtree(splitdataset(dataset,bestfeat,value),sublabels)
    return mytree

if __name__=='__main__':
    mydat,labels=createdataset()
    #mydat[0][-1]='maybe'
    #print calcshannonent(mydat)
    #print choosebestfeaturetosplit(mydat)  
    mytree=createtree(mydat,labels)
    print mytree

上一篇下一篇

猜你喜欢

热点阅读