我的第一个Kaggle竞赛

2017-09-22  本文已影响96人  平平又无奇

本文根据大数据竞赛平台——Kaggle 入门篇的代码修改而来

import csv
from array import array
from numpy import *
import scipy.io
def loadTrainData():
    """Load ``train.csv`` and split it into sample data and labels.

    The first CSV row is discarded as a header. In every remaining row,
    column 0 is taken as the label and the other columns as the sample data.

    Returns:
        A ``(data, label)`` tuple: ``data`` is the non-label columns passed
        through ``toInt`` and then ``nomalizing``; ``label`` is column 0
        passed through ``toInt``.
    """
    with open('train.csv') as file:
        # Original note: the file is 42001 rows x 785 columns.
        rows = list(csv.reader(file))
    del rows[0]  # drop the header row
    table = array(rows)
    features = nomalizing(toInt(table[:, 1:]))
    targets = toInt(table[:, 0])
    return features, targets
# toInt() converts a table of strings into integer values.
def toInt(array):
    """Convert every element of ``array`` to an integer value.

    Args:
        array: Scalar, 1-D or 2-D array-like whose elements can be converted
            to integers (for example the digit strings read from a CSV
            file). Input with fewer than two dimensions is promoted to 2-D,
            so a flat sequence of ``n`` items becomes a ``1 x n`` row.

    Returns:
        A new 2-D float ndarray holding the integer value of each element.

    Raises:
        ValueError: If the input has more than two dimensions, or if an
            element cannot be parsed as an integer.
    """
    # atleast_2d stands in for numpy.mat, an alias that NumPy 2.0 removed.
    values = atleast_2d(asarray(array))
    if values.ndim != 2:
        raise ValueError("toInt expects input with at most 2 dimensions")
    # One vectorised cast replaces a Python-level loop over every element.
    # Casting to int first keeps the truncating integer conversion; the
    # final float cast keeps the result dtype that zeros() used to give.
    return values.astype(int).astype(float)
def nomalizing(array):
    """Binarise ``array`` in place: every non-zero element becomes 1.

    Args:
        array: numpy ndarray to modify. Elements equal to 0 are left as is.

    Returns:
        The same ``array`` object that was passed in, after modification.
    """
    # A single masked assignment replaces the per-element Python loop;
    # putmask writes into the caller's array, so mutation is preserved.
    putmask(array, array != 0, 1)
    return array
def loadTestData():
    """Load ``test.csv`` and return its rows as binarised sample data.

    The first CSV row is discarded as a header; every remaining row is kept
    whole (no column is split off).

    Returns:
        The remaining rows passed through ``toInt`` and then ``nomalizing``.
    """
    with open('test.csv') as file:
        # Original note: the file is 28001 rows x 784 columns.
        rows = list(csv.reader(file))
    del rows[0]  # drop the header row
    samples = toInt(array(rows))
    return nomalizing(samples)
def classify(inX, dataSet, labels, k):
    """Predict the label of ``inX`` by majority vote of its nearest rows.

    Euclidean distance is computed from ``inX`` to every row of ``dataSet``;
    the labels of the ``k`` closest rows are counted and the most frequent
    one is returned. When several labels share the top count, the one that
    was seen first (i.e. belongs to the nearer row) wins.

    Args:
        inX: One sample; flattened to a vector with as many elements as
            ``dataSet`` has columns.
        dataSet: 2-D array-like with one training sample per row (a flat
            sequence is treated as a single row).
        labels: Labels for the rows of ``dataSet``, in row order. Any shape
            is accepted and flattened (e.g. a flat list or a ``1 x N`` array).
        k: Number of neighbours that vote. Values larger than the number of
            training rows are clamped to that number.

    Returns:
        The winning label, as an element taken from ``labels``.

    Raises:
        ValueError: If ``dataSet`` has more than two dimensions or the
            shapes of ``inX`` and ``dataSet`` are incompatible.
        IndexError: If no vote is cast (``k`` < 1 or an empty ``dataSet``),
            or ``labels`` has fewer elements than ``dataSet`` has rows.
    """
    # asarray/atleast_2d replace numpy.mat, an alias NumPy 2.0 removed.
    query = asarray(inX).ravel()
    samples = atleast_2d(asarray(dataSet))
    if samples.ndim != 2:
        raise ValueError("dataSet must be at most 2-dimensional")
    flatLabels = asarray(labels).ravel()
    dataSetSize = samples.shape[0]

    # Broadcasting subtracts the query from every row without first
    # materialising a tiled (dataSetSize x n) copy of it.
    sqDistances = ((query - samples) ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}
    # Never ask for more neighbours than there are training rows.
    for i in range(min(k, dataSetSize)):
        voteIlabel = flatLabels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sorted() is stable, so ties keep the order in which labels were seen.
    sortedClassCount = sorted(classCount.items(), key=lambda d: d[1], reverse=True)
    return sortedClassCount[0][0]

Python 中 csv 模块写文件时总是多出一个空行,解决办法是在 open() 时加上 newline=''(参考:办法1、办法2、办法3、办法4):

def saveResult(result):
    """Write every item of ``result`` to ``result.csv``, one item per row.

    Args:
        result: Iterable of values; each one becomes a single-column row.
    """
    # newline='' is required here: without it the csv module's own line
    # endings get translated again and show up as blank lines.
    with open('result.csv', 'w', newline='') as myFile:
        csv.writer(myFile).writerows([item] for item in result)
def handwritingClassTest(k=5):
    """Classify every test row with k-nearest-neighbours and save the result.

    Loads the training data and labels, loads the test data, runs
    ``classify`` on each test row against the whole training set, and hands
    the list of predictions to ``saveResult``.

    Args:
        k: Number of nearest neighbours that vote on each prediction.
            Defaults to 5, the value that used to be hard-coded.
    """
    trainData, trainLabel = loadTrainData()
    testData = loadTestData()
    # Iterating the 2-D test array yields one row (sample) at a time.
    resultList = [classify(row, trainData, trainLabel, k) for row in testData]
    saveResult(resultList)
# Run the whole pipeline whenever this file is executed or imported: load
# the data, classify every test row and save the predictions.
handwritingClassTest()

去除 result 文件中的空行(如果写文件时没有加 newline='' 而产生了空行,可以用下面的函数清理):

def delblankline(infile, outfile):
    """Copy ``infile`` to ``outfile``, dropping blank lines.

    A line is dropped when it contains nothing but whitespace.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the text file to write. It may be the same path as
            ``infile``; the input is read completely before the output is
            opened, so the file is then cleaned in place.
    """
    # Read everything first: opening the output truncates it, which would
    # wipe the data if both paths name the same file.
    with open(infile, "r") as infp:
        kept = [li for li in infp if li.split()]
    # The with-blocks close both files even if reading or writing fails.
    with open(outfile, "w") as outfp:
        outfp.writelines(kept)
# Usage example: copy result.csv to ok.csv with the blank lines removed.
if __name__ == "__main__":
    delblankline("result.csv","ok.csv")

根据Kaggle上的sample_submission.csv文件的格式修改我们得到的预测值文件,并上传到Kaggle上,最终准确率为96.399%:

kaggle手写数字准确率
上一篇 下一篇

猜你喜欢

热点阅读