Machine Learning Notes - Text Classification (4): Code Implementation

2017-05-14  sf705

While learning text classification I found that it mainly comes down to the following steps, illustrated here with code. Most of the code is adapted from the book 《机器学习算法原理与编程实战》; the corpus that comes with it has some encoding problems, and since I am using Python 3.6 on Windows, I made a few modifications.

Main Steps

Document Segmentation

This step descends through the directory tree level by level and writes the segmentation results back out level by level in the same way. The original corpus has the following structure:


[Image: three-level directory structure of the original corpus]

The documents to be segmented sit in this three-level structure, and after segmentation the results end up in a matching three-level structure.
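
As a minimal sketch of what this step does to a single document (the example sentence is made up), jieba.cut in default mode yields the tokens, which are then joined with spaces exactly as in the segText function below:

import jieba

text = "机器学习是人工智能的一个分支"  # made-up example sentence; any raw document text is handled the same way
tokens = jieba.cut(text)  # default (precise) mode returns a generator of tokens
print(" ".join(tokens))   # the space-joined string is what gets written to the result file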

Next, compute the TF-IDF statistics of the texts. This uses classes from sklearn; see the code comments for the details. I found that stepping through the code in PyCharm's debugger makes these sklearn classes very easy to understand at a glance.

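To make the sklearn part concrete, here is a minimal, self-contained sketch (the three short documents are made up): TfidfVectorizer.fit_transform returns the TF-IDF weighted document-term matrix, and vocabulary_ maps each term to its column index, which are exactly the two things the full code below stores as tdm and vocabulary.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["环境 污染 治理", "计算机 网络 编程", "环境 保护 政策"]  # made-up, already segmented documents
vectorizer = TfidfVectorizer()
tdm = vectorizer.fit_transform(docs)  # sparse matrix: one row per document, one column per term
print(tdm.shape)                      # (3, number of distinct terms)
print(vectorizer.vocabulary_)         # dict mapping each term to its column index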

Test Set

The test set in this article takes a dozen or so documents from each category, with the category labels kept. In the figure above, "环境200" and "计算机200" are the category labels (the number is how many documents the folder contains; I simply left it in the name). The test set shown below was put together by hand, and I played a small trick: one document was deliberately placed into the wrong category folder.

[Image: directory structure of the manually built test set]
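
If you do not want to build the test set by hand, a minimal sketch like the following (the paths and sample size are hypothetical) copies a few documents from each category folder into a parallel test directory while keeping the same category sub-folders:

import os
import random
import shutil

def sampleTestSet(corpusPath, testPath, perClass=10):
    for category in os.listdir(corpusPath):  # each category folder, e.g. "环境200"
        srcDir = os.path.join(corpusPath, category)
        dstDir = os.path.join(testPath, category)
        if not os.path.exists(dstDir):
            os.makedirs(dstDir)
        files = os.listdir(srcDir)
        for fileName in random.sample(files, min(perClass, len(files))):
            shutil.copy(os.path.join(srcDir, fileName), os.path.join(dstDir, fileName))

# hypothetical paths, adjust to your own directory layout
# sampleTestSet("E:/Train_Data/文本分类语料库/", "E:/Train_Data/test_data/", perClass=10)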

Code Implementation

All of the code below runs. The environment is Python 3; sklearn needs to be installed (installation was covered earlier), and the file paths need to be changed to match your own machine.

import jieba
import os
import pickle   # for persisting intermediate results
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vector space model
from sklearn.datasets.base import Bunch  # in newer scikit-learn versions: from sklearn.utils import Bunch
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier

def readFile(path):
    # some documents in the corpus have encoding problems, so decoding errors are ignored
    with open(path, 'r', errors='ignore') as file:
        content = file.read()
        return content

def saveFile(path, result):
    with open(path, 'w', errors='ignore') as file:
        file.write(result)

def segText(inputPath, resultPath):
    fatherLists = os.listdir(inputPath)  # category folders in the corpus root
    for eachDir in fatherLists:  # walk every category folder
        eachPath = inputPath + eachDir + "/"  # path of the current category folder
        each_resultPath = resultPath + eachDir + "/"  # where its segmentation results will be written
        if not os.path.exists(each_resultPath):
            os.makedirs(each_resultPath)
        childLists = os.listdir(eachPath)  # documents inside the current category folder
        for eachFile in childLists:  # walk every document in the folder
            eachPathFile = eachPath + eachFile  # full path of the document
            print(eachFile)
            content = readFile(eachPathFile)  # read the raw text
            result = str(content).replace("\r\n", "").strip()  # drop extra blank lines and whitespace
            cutResult = jieba.cut(result)  # default-mode segmentation
            saveFile(each_resultPath + eachFile, " ".join(cutResult))  # write the tokens joined by spaces
            
def bunchSave(inputFile, outputFile):
    catelist = os.listdir(inputFile)
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)  # save the category names in the Bunch object
    for eachDir in catelist:
        eachPath = inputFile + eachDir + "/"
        fileList = os.listdir(eachPath)
        for eachFile in fileList:  # every segmented document in the category folder
            fullName = eachPath + eachFile  # full path of the document
            bunch.label.append(eachDir)  # its category label
            bunch.filenames.append(fullName)  # its path
            bunch.contents.append(readFile(fullName).strip())  # its segmented text
    with open(outputFile, 'wb') as file_obj:  # pickling requires a binary file mode
        pickle.dump(bunch, file_obj)

def readBunch(path):
    with open(path,'rb') as file:
        bunch = pickle.load(file)
    return bunch

def writeBunch(path,bunchFile):
    with open(path,'wb') as file:
        pickle.dump(bunchFile,file)

def getStopWord(inputFile):
    stopWordList = readFile(inputFile).splitlines()
    return stopWordList

def getTFIDFMat(inputPath, stopWordList, outputPath):  # build the TF-IDF vector space of the training set
    bunch = readBunch(inputPath)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames,
                       tdm=[], vocabulary={})
    # initialize the vector space model
    vectorizer = TfidfVectorizer(stop_words=stopWordList, sublinear_tf=True, max_df=0.5)
    # turn the texts into a TF-IDF weighted document-term matrix and keep the vocabulary separately
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    writeBunch(outputPath, tfidfspace)

def getTestSpace(testSetPath, trainSpacePath, stopWordList, testSpacePath):
    bunch = readBunch(testSetPath)
    # build the TF-IDF vector space of the test set
    testSpace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames,
                      tdm=[], vocabulary={})
    # load the training-set word bag
    trainbunch = readBunch(trainSpacePath)
    # initialize the vectorizer with the training-set vocabulary so both matrices share the same columns
    vectorizer = TfidfVectorizer(stop_words=stopWordList, sublinear_tf=True, max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    testSpace.tdm = vectorizer.fit_transform(bunch.contents)
    testSpace.vocabulary = trainbunch.vocabulary
    # persist the test vector space
    writeBunch(testSpacePath, testSpace)
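
# Note on the design above: fixing vocabulary=trainbunch.vocabulary guarantees that the test matrix
# has exactly the same columns as the training matrix, which MultinomialNB requires. One caveat:
# fit_transform here recomputes IDF weights from the test documents themselves; an alternative
# (not used in this article) is to pickle the fitted training vectorizer and call its transform()
# on the test texts so that the training IDF weights are reused.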

def bayesAlgorithm(trainPath, testPath):
    trainSet = readBunch(trainPath)
    testSet = readBunch(testPath)
    clf = MultinomialNB(alpha=0.001).fit(trainSet.tdm, trainSet.label)  # train the multinomial naive Bayes model
    print(np.shape(trainSet.tdm))
    print(np.shape(testSet.tdm))
    predicted = clf.predict(testSet.tdm)
    total = len(predicted)
    errorCount = 0
    for flabel, fileName, expct_cate in zip(testSet.label, testSet.filenames, predicted):
        if flabel != expct_cate:
            errorCount += 1
            print(fileName, ": actual category:", flabel, "--> predicted category:", expct_cate)
    print("error rate:", float(errorCount) * 100 / float(total), "%")


# training set
segText("E:/Train_Data/文本分类语料库/", "E:/Train_Data/segResult/")  # segmentation: raw corpus in, segmented files out
bunchSave("E:/Train_Data/segResult/", "E:/Train_Data/train_set.dat")  # pack the segmented training texts into a Bunch
stopWordList = getStopWord("E:/Train_Data/各种停用词表/哈工大停用词表.txt")  # load the stop-word list
getTFIDFMat("E:/Train_Data/train_set.dat", stopWordList, "E:/Train_Data/tfidfspace.dat")  # build the training TF-IDF space

# test set
segText("E:/Train_Data/test_data/", "E:/Train_Data/test_segResult/")  # segment the test documents
bunchSave("E:/Train_Data/test_segResult/", "E:/Train_Data/test_set.dat")
getTestSpace("E:/Train_Data/test_set.dat", "E:/Train_Data/tfidfspace.dat", stopWordList, "E:/Train_Data/testspace.dat")
bayesAlgorithm("E:/Train_Data/tfidfspace.dat", "E:/Train_Data/testspace.dat")
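
When the script runs, each misclassified test document is printed with its actual and predicted category, followed by the overall error rate, so the document that was deliberately dropped into the wrong folder should show up in this output.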