决策树对西瓜数据集2.0二分类

2017-04-07 本文已影响1971人异想派

西瓜数据集.jpg

@生成分类字典

# -*- coding: UTF-8 -*- 

#设置默认编码，否则中文会乱码
import sys 
reload(sys) 
sys.setdefaultencoding('utf-8') 
from math import log

#1、获取样例集和属性列表
def filetodataset(filename):   
    fr=open(filename,'r')
    all_lines=fr.readlines()   #list形式,每行为1个str
    featname=all_lines[0].strip().split(',')  #list形式
    featname=featname[:-1]
    dictcategory={}
    dataset=[]
    for sample in all_lines[1:]:
        sample=sample.strip().split(',')   #以逗号为分割符拆分列表
        dataset.append(sample)
    return dataset,featname

#2、计算香农商
def calcent(dataset):
    dictcategory={}
    for i in dataset:
        category=i[-1]
        if category not in dictcategory:
            dictcategory[category]=0
        dictcategory[category]+=1
    num=len(dataset)
    shannon=0
    for i in dictcategory:
        prob=float(dictcategory[i])/num
        shannon-=prob*log(prob,2)
    return shannon

#3、对特定属性选择特定取值后，将满足该条件的剩余数据集组合留待计算香农商
def splitdataset(dataset,axis,value):
    subdataset=[]
    for sample in dataset:
        if sample[axis]==value:
            reducedfeatvec=sample[:axis]
            reducedfeatvec.extend(sample[axis+1:])
            subdataset.append(reducedfeatvec)
    return subdataset

#4、选择最佳的划分属性
def choosebestfeaturetosplit(dataset):
    attrnum=len(dataset[0])     #计算属性个数
    baseshannon=calcent(dataset) #计算整个样本集的香农商
    bestinfogain=0.0 ; bestfeature=-1
    for i in range(attrnum-1):
        featlist=[example[i] for example in dataset]  #取出特定属性的所有值。dataset包含了类，但不影响，因为取不到
        unifeat=set(featlist)   #每个属性所含的值
        attrshannon=0
        for value in unifeat:
            subdataset=splitdataset(dataset,i,value)
            shannon=calcent(subdataset)  #每个属性值取每个值的香农商
            prob=len(subdataset)/float(len(dataset))
            attrshannon+=prob*shannon
        infogain=baseshannon-attrshannon
        if infogain>bestinfogain:
            bestinfogain=infogain
            bestfeature=i
    return bestfeature


#5、返回样例中类最多的那个类别
def majorclass(data):
    aa=[sample[-1] for sample in data]   #获取每个样例最后的类别
    bb={}
    for i in aa:
        bb[i]=aa.count(i)
    #将字典bb降序排列，书中用的另一种方式
    bb= sorted(bb.iteritems(), key=lambda d:d[1], reverse = True)
    return bb


#6、生成决策树
def createtree(mydata,labels):  #labels为属性标签
    #情况1、当所有样例的类别一致时，返回类别
    samplelabel=[sample[-1] for sample in mydata]
    usamplelabel=list(set(samplelabel))
    if len(usamplelabel)==1:
        return usamplelabel[0]

    #情况2、当属性已经用完，则选择类别最多的显示
    if len(mydata[0])==1:
        return majorclass(mydata)

    #情况3：选择最佳划分属性进行划分
    bestfeature=choosebestfeaturetosplit(mydata)
    bestfeaturelabel=labels[bestfeature]
    mytree={bestfeaturelabel:{}}
    del labels[bestfeature]

    featurevalue=[sample[bestfeature] for sample in mydata]
    ufeaturevalue=set(featurevalue)
    for value in ufeaturevalue:
        sublabels=labels[:]
        mytree[bestfeaturelabel][value]=createtree(splitdataset(mydata,bestfeature,value),sublabels)
    return mytree


if __name__=='__main__':
    import json
    filename='/Users/enniu/Desktop/jqxx/xiguaset.txt'
    mydata,featname=filetodataset(filename)
    #shannon=calcent(mydata)
    #choosebestfeaturetosplit(mydata)
    mytree=createtree(mydata,featname)
    print json.dumps(mytree, ensure_ascii=False)   #直接打印字典，里面含有中文，控制台信息输出窗口按照ascii编码输出utf8编码的字符串。

结果如下：

{"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
说明
1、在结点上下游（递归）属性只出现一次，因为后面算法会剔除掉。同个属性可能出现在不同分叉路

2、与机器学习书相比P78，少了个色泽浅白为好瓜的判断

参考：
如何实现并应用决策树算法？
python 字典中有中文写入文件后变成编码

@绘制树形图


# -*- coding:utf-8 -*-

import sys 
reload(sys) 
sys.setdefaultencoding('utf-8')
import matplotlib.pyplot as plt
import json
#mytree={"纹理": {"模糊": "否", "清晰": {"根蒂": {"稍蜷": {"色泽": {"乌黑": {"触感": {"软粘": "否", "硬滑": "是"}}, "青绿": "是"}}, "蜷缩": "是", "硬挺": "否"}}, "稍糊": {"触感": {"软粘": "是", "硬滑": "否"}}}}
anothertree={'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
#anothertree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}
#print json.dumps(mytree,ensure_ascii=False)

#计算叶节点数目
def calculateleaf(mytree):
    numleaf=0
    firststr=mytree.keys()[0]  #获取字典第一个键值
    seconddict=mytree[firststr]
    for key in seconddict.keys():
        if type(seconddict[key]).__name__=='dict':
            numleaf+= calculateleaf(seconddict[key])
        else:
            numleaf+=1
    return numleaf

#计算数的层数
def calculatedepth(mytree):
    maxdepth=0
    firststr=mytree.keys()[0]
    seconddict=mytree[firststr]
    for key in seconddict.keys():
        #print key,
        if type(seconddict[key]).__name__=='dict':
            numdepth=1+calculatedepth(seconddict[key])
        else:
            numdepth=1   #到叶节点后，计算树深度的变量+1
        if numdepth>maxdepth:
            maxdepth=numdepth
        #print numdepth,maxdepth
    return maxdepth

def plotmidtext(cntrpt,parentpt,txtstring):
    xmid=(parentpt[0]-cntrpt[0])/2.0+cntrpt[0]
    ymid=(parentpt[1]-cntrpt[1])/2.0+cntrpt[1]
    createplot.ax1.text(xmid,ymid,txtstring)

decisionnode=dict(boxstyle="sawtooth",fc="0.8")
leafnode=dict(boxstyle="round4",fc="0.8")
arrow_args=dict(arrowstyle="<-")

def plotnode(nodetext,centerpt,parentpt,nodetype):
    createplot.ax1.annotate(nodetext,xy=parentpt,xytext=centerpt,arrowprops=arrow_args,\
        xycoords='axes fraction',va='center',ha='center',bbox=nodetype)


def plottree(mytree,parentpt,nodetxt):
    numleafs=calculateleaf(mytree)
    depth=calculatedepth(mytree)
    firststr=mytree.keys()[0]
    cntrpt=(plottree.xoff+(1.0+float(numleafs))/2.0/plottree.totalw,plottree.yoff)
    print '子节点坐标:',cntrpt
    plotmidtext(cntrpt,parentpt,nodetxt)  #自定义函数
    plotnode(firststr,cntrpt,parentpt,decisionnode) #刚开始根节点与子节点是连在一起的？
    print '绘制连接箭头',cntrpt,parentpt
    seconddict=mytree[firststr]
    plottree.yoff=plottree.yoff-1.0/(1.0*plottree.totald) #控制宽度
    print 'y轴值:',plottree.yoff
    for key in seconddict.keys():
        if type(seconddict[key]).__name__=='dict':
            print '***sandy***',plottree.xoff  #经过else的判断后已变为1/6
            plottree(seconddict[key],cntrpt,str(key))
            print '***lam***',plottree.xoff
        else:
            plottree.xoff=plottree.xoff+1.0/plottree.totalw
            plotnode(seconddict[key],(plottree.xoff,plottree.yoff),cntrpt,leafnode)
            print '灯灯hoho',(plottree.xoff,plottree.yoff),cntrpt
            plotmidtext((plottree.xoff,plottree.yoff),cntrpt,str(key))
    #plottree.yoff=plottree.yoff+1.0/plottree.totald

def createplot(intree):
    fig=plt.figure(1,facecolor='white')
    fig.clf()
    axprops=dict(xticks=[0,0.2,0.4,0.6,0.8,1],yticks=[0,0.2,0.4,0.6,0.8,1])
    createplot.ax1=plt.subplot(111,frameon=True,**axprops)  #把**axprops去掉亦可，默认显示刻度
    plottree.totalw=float(calculateleaf(intree))
    plottree.totald=float(calculatedepth(intree))
    plottree.xoff=-0.5/plottree.totalw
    plottree.yoff=1.0
    plottree(intree,(0.5,1.0),'')
    plt.show()

if __name__=='__main__':
    createplot(anothertree)

@@递归探讨

当碰到递归时，沿着递归执行到最终结果（即最后停止递归的地方），然后再依次往上层执行

# -*- coding: UTF-8 -*- 
def calculatedepth(mytree):
    maxdepth=0
    firststr=mytree.keys()[0]
    seconddict=mytree[firststr]
    for key in seconddict.keys():
        print key
        if type(seconddict[key]).__name__=='dict':
            print '**'
            numdepth=1+calculatedepth(seconddict[key])
            print '第1种情况',numdepth
        else:
            numdepth=1   #到叶节点后，计算树深度的变量+1
            print '第2种情况',numdepth
        if numdepth>maxdepth:
            maxdepth=numdepth
        print (numdepth,maxdepth)
    return maxdepth

mytree={'no surfacing': {1: {'flippers': {0: 'no', 1: 'yes'}},0: 'no'}}

if __name__=='__main__':
    a=calculatedepth(mytree)

隐形眼镜数据集.png

决策树对西瓜数据集2.0二分类

猜你喜欢

热点阅读