大数据,机器学习,人工智能 | 人工智能/模式识别/机器学习精华专题 | 机器学习和人工智能入门

朴素贝叶斯

2019-05-19  本文已影响0人  山雾幻华

朴素贝叶斯:基于概率论的分类方法

import numpy as np
def creatVocabList(dataSet):
    """
    Build the vocabulary of a corpus.

    params  dataSet: iterable of documents, each an iterable of hashable tokens
    returns list: every distinct token found in dataSet, each exactly once;
                  the order is whatever the underlying set yields (arbitrary)
    """
    unique_tokens = {token for document in dataSet for token in document}
    return list(unique_tokens)

def setOfWords2Vec(vocabList, inputSet):
    """
    Convert a document into a word vector using the set-of-words model.

    Each position of the result corresponds to the word at the same position
    in vocabList and is 1 if that word occurs in inputSet (any number of
    times), otherwise 0. Words of inputSet that are missing from vocabList
    are reported on stdout and otherwise ignored.

    params vocabList: list of vocabulary words (hashable)
    params inputSet: one document, as a flat iterable of words
    returns returnVec: list of 0/1 flags, same length as vocabList
    """
    # Map every vocabulary word to its position once, instead of scanning the
    # list twice (``in`` + ``index``) for every token of the document.
    # ``setdefault`` keeps the first position when a word is repeated, which
    # is what ``list.index`` would have returned.
    first_index = {}
    for position, vocab_word in enumerate(vocabList):
        first_index.setdefault(vocab_word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = first_index.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

def bagOfWords2VecMN(vocabList, inputSet):
    """
    Convert a document into a word vector using the bag-of-words model.

    Each position of the result corresponds to the word at the same position
    in vocabList and holds the number of times that word occurs in inputSet.
    Words of inputSet that are missing from vocabList are silently skipped.

    params vocabList: list of vocabulary words (hashable)
    params inputSet: one document, as a flat iterable of words
    returns returnVec: list of occurrence counts, same length as vocabList
    """
    # Map every vocabulary word to its position once, instead of scanning the
    # list twice (``in`` + ``index``) for every token of the document.
    # ``setdefault`` keeps the first position when a word is repeated, which
    # is what ``list.index`` would have returned.
    first_index = {}
    for position, vocab_word in enumerate(vocabList):
        first_index.setdefault(vocab_word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = first_index.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

def trainNB0(trainMat, trainCategory):
    """
    Estimate the naive Bayes parameters from vectorized training documents.

    params trainMat: 2-D sequence of word vectors, one row per document
                     (already converted, e.g. by setOfWords2Vec)
    params trainCategory: label of each row of trainMat; a label equal to 1
                          counts as class 1, anything else as class 0
    returns p0Vect: log of the per-word conditional probabilities for class 0
    returns p1Vect: log of the per-word conditional probabilities for class 1
    returns pAbusive: sum(trainCategory) divided by the number of documents,
                      i.e. the fraction of class-1 documents for 0/1 labels
    """
    doc_count = len(trainMat)
    vocab_size = len(trainMat[0])
    prior_class1 = sum(trainCategory) / float(doc_count)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1.
    # Counts start at 1 and totals at 2.0 so that a word never seen in a
    # class does not end up with probability zero (and log(0)).
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    token_totals = [2.0, 2.0]

    for row_index, doc_vector in enumerate(trainMat):
        label = 1 if trainCategory[row_index] == 1 else 0
        word_counts[label] += doc_vector
        token_totals[label] += sum(doc_vector)

    # Work in log space so that products of many small probabilities become
    # sums and do not underflow.
    log_probs_class0 = np.log(word_counts[0] / token_totals[0])
    log_probs_class1 = np.log(word_counts[1] / token_totals[1])
    return log_probs_class0, log_probs_class1, prior_class1

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    Classify one word vector with a trained naive Bayes model.

    params vec2Classify: word vector of the document to classify
    params p0Vec: log word probabilities for class 0 (p0Vect from trainNB0())
    params p1Vec: log word probabilities for class 1 (p1Vect from trainNB0())
    params pClass1: prior probability of class 1 (pAbusive from trainNB0())
    returns 1/0: 1 when the class-1 score is strictly larger, otherwise 0
    """
    # Log-space scores: element-wise product selects/weights the log
    # probabilities of the words present, then the log prior is added.
    score_class1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0

def testingNB(testEntry, listOPosts, listClasses):
    """
    Train on a labelled data set and classify a single test document.

    params testEntry: the document to classify, as a flat list of words,
                      e.g. ['love', 'my', 'dalmation']
    params listOPosts: training documents, a 2-D list of words
    params listClasses: label of each training document
    returns classified: predicted label of testEntry (1 or 0)
    """
    vocabulary = creatVocabList(listOPosts)

    # Set-of-words vectors are used here; bagOfWords2VecMN could be used
    # instead to count occurrences rather than just flag presence.
    training_vectors = [setOfWords2Vec(vocabulary, post) for post in listOPosts]
    p0_log_probs, p1_log_probs, class1_prior = trainNB0(
        np.array(training_vectors), np.array(listClasses)
    )

    entry_vector = np.array(setOfWords2Vec(vocabulary, testEntry))
    return classifyNB(entry_vector, p0_log_probs, p1_log_probs, class1_prior)

参考

[1]作者:Jack-Cui
来源:CSDN
原文:https://blog.csdn.net/c406495762/article/details/77500679
版权声明:本文为博主原创文章,转载请附上博文链接!
[2]机器学习实战
[3]统计学习方法

上一篇 | 下一篇

猜你喜欢

热点阅读