Machine Learning Notes - Text Classification (4): Code Implementation

2017-05-14  sf705

While learning text classification I found that it mainly comes down to the following steps, illustrated here with code. Most of the code is adapted from the book 《机器学习算法原理与编程实战》; the corpus that comes with it has some encoding problems, and since I am using Python 3.6 on Windows, I made a few modifications.

Main Steps

Document Segmentation

This step descends through the directory tree level by level and writes the segmentation results back out level by level in the same way. The original corpus has the following structure:


[Image: three-level directory structure of the original corpus]

The documents to be segmented sit in this three-level structure, and after segmentation the results end up in a matching three-level structure.
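
As a minimal sketch of what this step does to a single document (the example sentence is made up), jieba.cut in default mode yields the tokens, which are then joined with spaces exactly as in the segText function below:

import jieba

text = "机器学习是人工智能的一个分支"  # made-up example sentence; any raw document text is handled the same way
tokens = jieba.cut(text)  # default (precise) mode returns a generator of tokens
print(" ".join(tokens))   # the space-joined string is what gets written to the result file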

Next, compute the TF-IDF statistics of the texts. This uses classes from sklearn; see the code comments for the details. I found that stepping through the code in PyCharm's debugger makes these sklearn classes very easy to understand at a glance.

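To make the sklearn part concrete, here is a minimal, self-contained sketch (the three short documents are made up): TfidfVectorizer.fit_transform returns the TF-IDF weighted document-term matrix, and vocabulary_ maps each term to its column index, which are exactly the two things the full code below stores as tdm and vocabulary.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["环境 污染 治理", "计算机 网络 编程", "环境 保护 政策"]  # made-up, already segmented documents
vectorizer = TfidfVectorizer()
tdm = vectorizer.fit_transform(docs)  # sparse matrix: one row per document, one column per term
print(tdm.shape)                      # (3, number of distinct terms)
print(vectorizer.vocabulary_)         # dict mapping each term to its column index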

Test Set

The test set in this article takes a dozen or so documents from each category, with the category labels kept. In the figure above, "环境200" and "计算机200" are the category labels (the number is how many documents the folder contains; I simply left it in the name). The test set shown below was put together by hand, and I played a small trick: one document was deliberately placed into the wrong category folder.

[Image: directory structure of the manually built test set]
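
If you do not want to build the test set by hand, a minimal sketch like the following (the paths and sample size are hypothetical) copies a few documents from each category folder into a parallel test directory while keeping the same category sub-folders:

import os
import random
import shutil

def sampleTestSet(corpusPath, testPath, perClass=10):
    for category in os.listdir(corpusPath):  # each category folder, e.g. "环境200"
        srcDir = os.path.join(corpusPath, category)
        dstDir = os.path.join(testPath, category)
        if not os.path.exists(dstDir):
            os.makedirs(dstDir)
        files = os.listdir(srcDir)
        for fileName in random.sample(files, min(perClass, len(files))):
            shutil.copy(os.path.join(srcDir, fileName), os.path.join(dstDir, fileName))

# hypothetical paths, adjust to your own directory layout
# sampleTestSet("E:/Train_Data/文本分类语料库/", "E:/Train_Data/test_data/", perClass=10)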

Code Implementation

All of the code below runs. The environment is Python 3; sklearn needs to be installed (installation was covered earlier), and the file paths need to be changed to match your own machine.

import jieba
import os
import pickle   # for persisting intermediate results
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vector space model
from sklearn.datasets.base import Bunch  # in newer scikit-learn versions: from sklearn.utils import Bunch
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier

def readFile(path):
    # some documents in the corpus have encoding problems, so decoding errors are ignored
    with open(path, 'r', errors='ignore') as file:
        content = file.read()
        return content

def saveFile(path, result):
    with open(path, 'w', errors='ignore') as file:
        file.write(result)

def segText(inputPath, resultPath):
    fatherLists = os.listdir(inputPath)  # category folders in the corpus root
    for eachDir in fatherLists:  # walk every category folder
        eachPath = inputPath + eachDir + "/"  # path of the current category folder
        each_resultPath = resultPath + eachDir + "/"  # where its segmentation results will be written
        if not os.path.exists(each_resultPath):
            os.makedirs(each_resultPath)
        childLists = os.listdir(eachPath)  # documents inside the current category folder
        for eachFile in childLists:  # walk every document in the folder
            eachPathFile = eachPath + eachFile  # full path of the document
            print(eachFile)
            content = readFile(eachPathFile)  # read the raw text
            result = str(content).replace("\r\n", "").strip()  # drop extra blank lines and whitespace
            cutResult = jieba.cut(result)  # default-mode segmentation
            saveFile(each_resultPath + eachFile, " ".join(cutResult))  # write the tokens joined by spaces
            
def bunchSave(inputFile, outputFile):
    catelist = os.listdir(inputFile)
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)  # save the category names in the Bunch object
    for eachDir in catelist:
        eachPath = inputFile + eachDir + "/"
        fileList = os.listdir(eachPath)
        for eachFile in fileList:  # every segmented document in the category folder
            fullName = eachPath + eachFile  # full path of the document
            bunch.label.append(eachDir)  # its category label
            bunch.filenames.append(fullName)  # its path
            bunch.contents.append(readFile(fullName).strip())  # its segmented text
    with open(outputFile, 'wb') as file_obj:  # pickling requires a binary file mode
        pickle.dump(bunch, file_obj)

def readBunch(path):
    with open(path,'rb') as file:
        bunch = pickle.load(file)
    return bunch

def writeBunch(path,bunchFile):
    with open(path,'wb') as file:
        pickle.dump(bunchFile,file)

def getStopWord(inputFile):
    stopWordList = readFile(inputFile).splitlines()
    return stopWordList

def getTFIDFMat(inputPath, stopWordList, outputPath):  # build the TF-IDF vector space of the training set
    bunch = readBunch(inputPath)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames,
                       tdm=[], vocabulary={})
    # initialize the vector space model
    vectorizer = TfidfVectorizer(stop_words=stopWordList, sublinear_tf=True, max_df=0.5)
    # turn the texts into a TF-IDF weighted document-term matrix and keep the vocabulary separately
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    writeBunch(outputPath, tfidfspace)

def getTestSpace(testSetPath, trainSpacePath, stopWordList, testSpacePath):
    bunch = readBunch(testSetPath)
    # build the TF-IDF vector space of the test set
    testSpace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames,
                      tdm=[], vocabulary={})
    # load the training-set word bag
    trainbunch = readBunch(trainSpacePath)
    # initialize the vectorizer with the training-set vocabulary so both matrices share the same columns
    vectorizer = TfidfVectorizer(stop_words=stopWordList, sublinear_tf=True, max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    testSpace.tdm = vectorizer.fit_transform(bunch.contents)
    testSpace.vocabulary = trainbunch.vocabulary
    # persist the test vector space
    writeBunch(testSpacePath, testSpace)
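
# Note on the design above: fixing vocabulary=trainbunch.vocabulary guarantees that the test matrix
# has exactly the same columns as the training matrix, which MultinomialNB requires. One caveat:
# fit_transform here recomputes IDF weights from the test documents themselves; an alternative
# (not used in this article) is to pickle the fitted training vectorizer and call its transform()
# on the test texts so that the training IDF weights are reused.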

def bayesAlgorithm(trainPath, testPath):
    trainSet = readBunch(trainPath)
    testSet = readBunch(testPath)
    clf = MultinomialNB(alpha=0.001).fit(trainSet.tdm, trainSet.label)  # train the multinomial naive Bayes model
    print(np.shape(trainSet.tdm))
    print(np.shape(testSet.tdm))
    predicted = clf.predict(testSet.tdm)
    total = len(predicted)
    errorCount = 0
    for flabel, fileName, expct_cate in zip(testSet.label, testSet.filenames, predicted):
        if flabel != expct_cate:
            errorCount += 1
            print(fileName, ": actual category:", flabel, "--> predicted category:", expct_cate)
    print("error rate:", float(errorCount) * 100 / float(total), "%")


# training set
segText("E:/Train_Data/文本分类语料库/", "E:/Train_Data/segResult/")  # segmentation: raw corpus in, segmented files out
bunchSave("E:/Train_Data/segResult/", "E:/Train_Data/train_set.dat")  # pack the segmented training texts into a Bunch
stopWordList = getStopWord("E:/Train_Data/各种停用词表/哈工大停用词表.txt")  # load the stop-word list
getTFIDFMat("E:/Train_Data/train_set.dat", stopWordList, "E:/Train_Data/tfidfspace.dat")  # build the training TF-IDF space

# test set
segText("E:/Train_Data/test_data/", "E:/Train_Data/test_segResult/")  # segment the test documents
bunchSave("E:/Train_Data/test_segResult/", "E:/Train_Data/test_set.dat")
getTestSpace("E:/Train_Data/test_set.dat", "E:/Train_Data/tfidfspace.dat", stopWordList, "E:/Train_Data/testspace.dat")
bayesAlgorithm("E:/Train_Data/tfidfspace.dat", "E:/Train_Data/testspace.dat")
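
When the script runs, each misclassified test document is printed with its actual and predicted category, followed by the overall error rate, so the document that was deliberately dropped into the wrong folder should show up in this output.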