[kaggle系列 二] 使用决策树判断是否能从泰坦尼克号生还
题目
连接:https://www.kaggle.com/c/titanic
简析
上一篇用了贝叶斯分类器,这次用决策树和随机森林试一试,不过最终的得分没有贝叶斯分类器高,好吧,说实话,感觉再用几个不同的机器学习方法应该结果也差不多,现在主要是试水,先搞懂基础的算法,然后再通过数据的处理与分析去优化结果。
决策树
我个人认为,决策树应该是比较好理解的机器学习算法了。其中心思想就是ifelse,存在很多个条件的时候,如果第一个条件是A,第二个条件是B…………就选择方案C。是一个很自然的方法,我们平常生活中也可能很常用,比如下图就是一个屌丝假日的日常决策树:
看起来是很简单的,但是要怎么应用到机器学习上,让它成为一个分类器呢?以本例中的问题来说,我知道每一个人的生还情况,还有他的各种属性(特征),我们要根据这些特征,来生成一颗决策树,最终到达叶子节点的时候,我们就知道对于某一系列的属性,最终是生还还是死亡,大概就是要生成这样一棵树(进行了简化):
中间还可以加很多其他属性,比如舱位信息之类的,最终,在筛完所有信息以后,你就可以通过当前训练数据给出一个概率,表示当前叶结点生还的概率。
接下来的问题是如何实现,选择条件的顺序是否影响结果,是不是可以随便选择条件呢?当然这里树的结构肯定会影响最终的结果,如何去构造一棵树呢?
这里需要用到一个信息熵的概念。熵这个东西应该大家都听说过,熵表明的是一个事物的混乱程度,熵越大,混乱程度越高,熵越小,表明混乱程度越低。信息熵的概念也是一样的,就是用来表明信息的混乱程度,我们选择一个树根的时候,最好的情况肯定是通过这个属性把数据分成几类以后,这些数据的熵越小越好,因为越小代表越有序,分类越清晰。那么我们要做的就是计算每个条件作为当前的根结点的信息熵,最终选一个最小的分类方法作为根节点,并以此类推,直到叶节点~
信息熵的计算公式如下:
其中,m是最终的分类结果,在本例中,就是生还与否两个类,pi是这个决策(分类)发生的概率。
随机森林
随机森林就是决策树的加强版,决策树这种方法,虽然有信息熵作为划分方法,但是实际上,如果划分到最精细的一层,那么就会出现过拟合的问题,泛化能力就比较差,往往训练数据上表现的比较好,对于新的数据,准确度就会变低。在我写的决策树代码就出现了这个问题,当时没多想,直接分到最后一层,结果准确率只有0.57。但是如果不分得更精细,准确度也不够高。可能需要进行大量的测试,才能找到一个平衡点,既不至于过拟合,也不至于欠拟合导致准确率太低。
随机森林提供了一个比较通用的解决方法,就是随机生成多个比较浅的决策树,当进行拟合的时候,让多个决策树进行投票,最终哪个分类的票高就决定是哪个类。
代码与结果
首先是决策树的代码,说实话,这代码写的比较丑,写起来不是很顺手,边学边写,逻辑搞的有点乱。最终准确率只有0.57416,这个我认为是划分过于精细导致模型过拟合了,在适当的分支进行剪枝效果可能会更好,当然,也有可能哪里写出了点小bug(笑)。
import csv
import os
import random
import math
class Node:
def __init__(self):
self.attr_name = ""
self.value_type = ""
self.classifier = None
self.childrens = []
self.entropy = 0
def getNext(self, value):
result = 0
if self.value_type == 'disperse_data':
pos = 0
if self.classifier.has_key(value):
pos = self.classifier[value]
else:
pos = random.randint(0, len(self.childrens) - 1)
result = self.childrens[pos]
elif self.value_type == 'continuity_data':
if value <= self.classifier:
result = self.childrens[0]
else:
result = self.childrens[1]
return result
def readData(fileName):
result = {}
with open(fileName,'rb') as f:
rows = csv.reader(f)
for row in rows:
if result.has_key('attr_list'):
for i in range(len(result['attr_list'])):
key = result['attr_list'][i]
if not result.has_key(key):
result[key] = []
result[key].append(row[i])
else:
result['attr_list'] = row
return result
def writeData(fileName, data):
csvFile = open(fileName, 'w')
writer = csv.writer(csvFile)
n = len(data)
for i in range(n):
writer.writerow(data[i])
csvFile.close()
def convertData(dataList):
hashTable = {}
count = 0
for i in range(len(dataList)):
if not hashTable.has_key(dataList[i]):
hashTable[dataList[i]] = count
count += 1
dataList[i] = str(hashTable[dataList[i]])
def convertValueData(dataList):
sumValue = 0.0
count = 0
for i in range(len(dataList)):
if dataList[i] == "":
continue
sumValue += float(dataList[i])
count += 1
dataList[i] = float(dataList[i])
avg = sumValue / count
for i in range(len(dataList)):
if dataList[i] == "":
dataList[i] = avg
def dataPredeal(data):
useDataList = ['Sex','Pclass', 'SibSp','Parch','Embarked']
result = {}
convertValueData(data["Age"])
result['Age'] = data['Age']
for i in range(len(useDataList)):
attrName = useDataList[i]
convertData(data[attrName])
result[attrName] = data[attrName]
return result
def calEntropy(dataList, labelList, isContinuity):
if not isContinuity:
count = 0.0
attrCount = {}
for i in range(len(dataList)):
key = dataList[i]
label = labelList[i]
count += 1
if not attrCount.has_key(key):
attrCount[key] = {'0':0.0,'1':0.0}
if not attrCount[key].has_key(label):
attrCount[key][label] = 0.0
attrCount[key][label] += 1.0
entropy = 0
for key in attrCount:
p0 = attrCount[key]['0']/(attrCount[key]['0'] + attrCount[key]['1'])
p1 = attrCount[key]['1']/(attrCount[key]['0'] + attrCount[key]['1'])
v0 = 0 if p0 == 0 else p0*math.log(p0,2)
v1 = 0 if p1 == 0 else p1*math.log(p1,2)
temp = (attrCount[key]['0'] + attrCount[key]['1']) / count * (v0 + v1)
entropy -= temp
return entropy, None
else:
ageList = set([dataList[i] for i in range(len(dataList))])
ageList = list(ageList)
ageList.sort()
minEntropy = 1
targetAge = 0
for i in range(len(ageList) - 1):
avgAge = (ageList[i] + ageList[i + 1]) / 2
count = 0.0
left_sum = {'0':0.0,'1':0.0}
right_sum = {'0':0.0,'1':0.0}
for j in range(len(dataList)):
if dataList[j] <= avgAge:
left_sum[labelList[j]] += 1.0
else:
right_sum[labelList[j]] += 1.0
count += 1.0
pl = (left_sum['0'] + left_sum['1']) / count
pl0 = left_sum['0']/(left_sum['0'] + left_sum['1'])
pl1 = 1.0 - pl0
pr = (right_sum['0'] + right_sum['1']) / count
pr0 = right_sum['0']/(right_sum['0'] + right_sum['1'])
pr1 = 1.0 - pr0
vl0 = 0 if pl0 == 0 else pl0*math.log(pl0,2)
vl1 = 0 if pl1 == 0 else pl1*math.log(pl1,2)
vr0 = 0 if pr0 == 0 else pr0*math.log(pr0,2)
vr1 = 0 if pr1 == 0 else pr1*math.log(pr1,2)
entropy = - pl*(vl0 + vl1) - pr*(vr0 + vr1)
if entropy < minEntropy:
minEntropy = entropy
targetAge = avgAge
return minEntropy, targetAge
def checkFinal(data,labelList, root):
diff_count = 0
hash_key = {}
attrName = ""
for key in data:
if not hash_key.has_key(key):
hash_key[key] = True
diff_count += 1
attrName = key
if diff_count > 1:
break
if diff_count > 1:
return False
root.attr_name = attrName
root.value_type = 'continuity_data' if attrName == 'Age' else 'disperse_data'
ageBoundary = None
if attrName == 'Age':
entropy,ageBoundary = calEntropy(data[attrName], labelList, True)
statistics = {}
for i in range(len(data[attrName])):
key = data[attrName][i]
if ageBoundary != None:
key = 0 if key <= ageBoundary else 1
if not statistics.has_key(key):
statistics[key] = [0.0,0.0]
pos = int(labelList[i])
statistics[key][pos] += 1.0
root.classifier = ageBoundary if attrName == 'Age' else {}
root.childrens = [] if attrName != 'Age' else [0,0]
count = 0
for key in statistics:
if ageBoundary == None:
if not root.classifier.has_key(key):
root.classifier[key] = count
root.childrens.append(0)
count += 1
root.childrens[root.classifier[key]] = 0 if statistics[key][0] > statistics[key][1] else 1
else:
root.childrens[key] = 0 if statistics[key][0] > statistics[key][1] else 1
return True
def deepPrint(deep, info):
s = ''
for i in range(deep):
s += ' '
s += 'deep:' + str(deep) + ' attr:' + info
print s
def buildTree(data, labelList, deep=0):
root = Node()
if checkFinal(data, labelList, root) == True:
#deepPrint(deep, root.attr_name)
return root
minEntropy = 1
targetAttrName = ''
continuityValueBoundary = None
for key in data:
entropy, targetAge = calEntropy(data[key], labelList, key == 'Age')
if entropy < minEntropy:
minEntropy = entropy
targetAttrName = key
continuityValueBoundary = targetAge
root.attr_name = targetAttrName
#deepPrint(deep, root.attr_name)
if continuityValueBoundary != None:
root.value_type = 'continuity_data'
root.classifier = continuityValueBoundary
root.childrens = [0,0]
else:
root.value_type = 'disperse_data'
root.classifier = {}
root.childrens = []
subDatas = {}
for i in range(len(data[targetAttrName])):
key = data[targetAttrName][i]
if continuityValueBoundary != None:
if key <= root.classifier:
key = 0
else:
key = 1
if not subDatas.has_key(key):
subDatas[key] = {'data':{},'labelList':[]}
for k in data:
if k != targetAttrName:
if not subDatas[key]['data'].has_key(k):
subDatas[key]['data'][k] = []
subDatas[key]['data'][k].append(data[k][i])
subDatas[key]['labelList'].append(labelList[i])
count = 0
for key in subDatas:
child = buildTree(subDatas[key]['data'], subDatas[key]['labelList'], deep+1)
if root.value_type == 'continuity_data':
root.childrens[key] = child
else:
root.classifier[key] = count
root.childrens.append(child)
count += 1
return root
def train(train_data):
x = dataPredeal(train_data)
tree = buildTree(x, train_data['Survived'])
return tree
def fit(tree, test_data, pos):
result = tree
while(result != 0 and result != 1):
result = result.getNext(test_data[result.attr_name][pos])
return [test_data['PassengerId'][pos],result]
def run():
dataRoot = '../../kaggledata/titanic/'
train_data = readData(dataRoot + 'train.csv')
test_data = readData(dataRoot + 'test.csv')
tree = train(train_data)
result_list = []
result_list.append(['PassengerId', 'Survived'])
for i in range(len(test_data['PassengerId'])):
result_list.append(fit(tree, test_data, i))
writeData(dataRoot + 'result.csv', result_list)
run()
下面的代码用了sklearn库里的随机森林的方法,还是很方便的,效果也还行,准确率有0.74641,果然三个臭皮匠干死诸葛亮~
import csv
import os
import random
import math
from sklearn.ensemble import RandomForestClassifier
def readData(fileName):
result = {}
with open(fileName,'rb') as f:
rows = csv.reader(f)
for row in rows:
if result.has_key('attr_list'):
for i in range(len(result['attr_list'])):
key = result['attr_list'][i]
if not result.has_key(key):
result[key] = []
result[key].append(row[i])
else:
result['attr_list'] = row
return result
def writeData(fileName, data):
csvFile = open(fileName, 'w')
writer = csv.writer(csvFile)
n = len(data)
for i in range(n):
writer.writerow(data[i])
csvFile.close()
def convertData(dataList):
hashTable = {}
count = 0
for i in range(len(dataList)):
if not hashTable.has_key(dataList[i]):
hashTable[dataList[i]] = count
count += 1
dataList[i] = str(hashTable[dataList[i]])
def convertValueData(dataList):
sumValue = 0.0
count = 0
for i in range(len(dataList)):
if dataList[i] == "":
continue
sumValue += float(dataList[i])
count += 1
dataList[i] = float(dataList[i])
avg = sumValue / count
for i in range(len(dataList)):
if dataList[i] == "":
dataList[i] = avg
def dataPredeal(data):
useDataList = ['Sex','Pclass', 'SibSp','Parch','Embarked']
convertValueData(data["Age"])
for i in range(len(useDataList)):
attrName = useDataList[i]
convertData(data[attrName])
def train(train_data):
dataPredeal(train_data)
useList = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
x = []
y = []
for i in range(len(train_data['Survived'])):
item = []
for j in range(len(useList)):
item.append(train_data[useList[j]][i])
x.append(item)
y.append(train_data['Survived'][i])
clf = RandomForestClassifier().fit(x,y)
return clf
def predict(clf, test_data, pos):
x = [[]]
useList = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
for i in range(len(useList)):
x[0].append(test_data[useList[i]][pos])
result = clf.predict(x)
return [test_data['PassengerId'][pos],int(result[0])]
def run():
dataRoot = '../../kaggledata/titanic/'
train_data = readData(dataRoot + 'train.csv')
test_data = readData(dataRoot + 'test.csv')
clf = train(train_data)
dataPredeal(test_data)
result_list = []
result_list.append(['PassengerId', 'Survived'])
for i in range(len(test_data['PassengerId'])):
result_list.append(predict(clf, test_data, i))
print 'cal:' + str(i)
writeData(dataRoot + 'result.csv', result_list)
run()