Python: Word(docx)文档词频统计
2019-07-22 本文已影响0人
autumn1919
#该程序读入D:/data_temp下的所有docx文件,并实现词频统计
#输出每个文档的单词频数,并进行绘图
#docx
import os
from collections import Counter

import docx
from pyecharts import options as opts
from pyecharts.charts import Bar
# Target words whose frequency is reported for every document.
# Document text is lower-cased before counting, so entries here are lowercase.
words = [
    'security',
    'as',
    'nation',
    '百度',
    'law',
]
def getFileName(path):
    """Return the names of the Word (.docx) entries directly inside *path*.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list of bare entry names (no directory prefix) whose extension is
        ``.docx``, in the order reported by ``os.listdir``.

    Raises:
        OSError: If *path* does not exist or cannot be listed.
    """
    # Compare the extension case-insensitively so that e.g. "REPORT.DOCX"
    # is not silently skipped.
    return [
        name
        for name in os.listdir(path)
        if os.path.splitext(name)[1].lower() == '.docx'
    ]
# Directory that is scanned for Word documents.
_DATA_DIR = 'd:/data_temp'

# Line breaks and the punctuation marks that are treated as word separators.
_SEPARATORS = str.maketrans({ch: ' ' for ch in '\r\n(),.'})


def _count_tokens(paragraph_texts):
    """Return a Counter of lower-cased, whitespace-separated tokens.

    NOTE: tokens are split on whitespace only, so a target word is counted
    only when it stands alone as a token after separator replacement.
    """
    counts = Counter()
    for text in paragraph_texts:
        # split() without arguments drops empty strings and also splits on tabs.
        counts.update(text.translate(_SEPARATORS).lower().split())
    return counts


# For every .docx file: count the target words, print the result, render a
# bar chart (<stem>-汇总词频统计.html) and write a CSV-like report
# (<stem>-汇总词频统计.txt) into the current working directory.
for file in getFileName(_DATA_DIR):
    # Word creates "~$name.docx" lock files while a document is open;
    # they are not real documents and cannot be parsed.
    if file.startswith('~$'):
        continue

    document = docx.Document(os.path.join(_DATA_DIR, file))
    new_dict = _count_tokens(para.text for para in document.paragraphs)

    # Every target word is plotted, with 0 for words that never occur.
    plot_name = list(words)
    plot_value = []
    for k in words:
        count = new_dict[k]  # Counter yields 0 for missing keys
        plot_value.append(count)
        if count:
            print(f"{file} 单词 {k} 的出现频数为 {count} 次")
        else:
            print(f"{file} 单词 {k} 未出现!")

    bar = Bar()
    bar.add_xaxis(plot_name)
    bar.add_yaxis("词语出现次数", plot_value)
    bar.set_global_opts(title_opts=opts.TitleOpts(title="词频统计"))

    # splitext strips the extension reliably; the former
    # replace('\.docx', '') looked for a literal backslash and never matched.
    file_abb = os.path.splitext(file)[0]
    name = f"{file_abb}-汇总词频统计"
    bar.render(f'{name}.html')

    # Explicit UTF-8 so the Chinese header and words are written correctly
    # regardless of the platform's default encoding.
    with open(f'{name}.txt', 'w', encoding='utf-8') as f:
        f.write('词语,频数' + '\n')
        f.writelines(
            f"{word}, {count}\n" for word, count in zip(plot_name, plot_value)
        )