Python: Word(docx)文档词频统计

2019-07-22  本文已影响0人  autumn1919
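# Reads every .docx file under D:/data_temp and counts how often each target word occurs.
# Prints the per-document counts and renders them as a bar chart.
# Requires the third-party packages python-docx (imported as `docx`) and pyecharts.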
import os
from collections import Counter

import docx
from pyecharts import options as opts
from pyecharts.charts import Bar
# Target words whose occurrences are reported and plotted for each document.
words = [
    'security',
    'as',
    'nation',
    '百度',
    'law',
]

def getFileName(path):
    """Return the names of the ``.docx`` entries found directly in *path*.

    Only bare entry names are returned (not full paths), in the order
    ``os.listdir`` yields them. The extension comparison is case-sensitive,
    so an entry such as ``REPORT.DOCX`` is not included.
    """
    return [
        entry
        for entry in os.listdir(path)
        if os.path.splitext(entry)[1] == '.docx'
    ]
# Directory that holds the Word documents to analyse.
data_dir = 'd:/data_temp'

# Only .docx entries are processed; previously every directory entry was
# handed to docx.Document, so a single stray file aborted the whole run.
# Names starting with "~$" are the owner/lock files Word leaves beside a
# document that is currently open: they end in .docx but are not readable
# documents, so they are skipped as well.
lsdir = [entry for entry in getFileName(data_dir) if not entry.startswith('~$')]

# Line breaks, parentheses, commas and full stops are treated as word
# separators: each is mapped to a space in a single translate() pass.
separators = str.maketrans({char: ' ' for char in '\r\n(),.'})

for file in lsdir:
    document = docx.Document(os.path.join(data_dir, file))

    # Lower-cased token frequencies over everything in document.paragraphs.
    # split() without arguments splits on any whitespace run and never
    # yields empty tokens.
    # NOTE: tokens are whitespace-delimited, so a target such as '百度' is
    # only counted where it stands alone between separators, not when it is
    # embedded in running Chinese text.
    new_dict = Counter()
    for paragraph in document.paragraphs:
        new_dict.update(paragraph.text.translate(separators).lower().split())

    # One bar per target word, in the order given by `words`; a Counter
    # returns 0 for words that never occurred.
    plot_name = list(words)
    plot_value = [new_dict[word] for word in plot_name]
    for word, count in zip(plot_name, plot_value):
        if count:
            print(f"{file} 单词 {word} 的出现频数为 {count} 次")
        else:
            print(f"{file} 单词 {word} 未出现!")

    bar = Bar()
    bar.add_xaxis(plot_name)
    bar.add_yaxis("词语出现次数", plot_value)
    bar.set_global_opts(title_opts=opts.TitleOpts(title="词频统计"))

    # Strip the extension with splitext. The earlier
    # file.replace('\.docx', '') looked for a literal backslash and therefore
    # never matched, leaving ".docx" inside every output file name.
    name = os.path.splitext(file)[0] + "-汇总词频统计"
    bar.render(f'{name}.html')

    # Explicit UTF-8 so the Chinese header and any non-ASCII target words are
    # written correctly regardless of the platform's default encoding.
    with open(f'{name}.txt', 'w', encoding='utf-8') as f:
        f.write('词语,频数\n')
        f.writelines(
            f'{word}, {count}\n' for word, count in zip(plot_name, plot_value)
        )
上一篇下一篇

猜你喜欢

热点阅读