Python词频统计

2022-01-11  本文已影响0人  云上小白鸽

1.合并数据文件

import os
import os.path   #文件夹遍历函数  

# Step 1: concatenate every regular file found in ./raw_data into a single
# result.txt in the current directory (created if missing, truncated if it
# already exists). Files are read and written with the platform default
# encoding, exactly as the plain open() calls did before.
files = os.listdir('./raw_data')  # names of all entries in the data directory

with open('result.txt', 'w') as f:
    for file in files:
        filepath = os.path.join('./raw_data', file)
        if not os.path.isfile(filepath):
            # A sub-directory cannot be opened as a text file; skip it
            # instead of aborting the whole merge.
            continue
        # The context manager closes each input file as soon as it has been
        # copied, so handles do not pile up when the directory is large.
        with open(filepath) as src:
            f.writelines(src)
        f.write('\n')  # keep the contents of consecutive files on separate lines

2.词频统计

import re
import jieba
from collections import Counter
import csv

# Step 2: tokenize the corpus with jieba, drop stop words, and write the 50
# most frequent tokens to word_excel.csv.

# Read the corpus.
# NOTE(review): step 1 writes 'result.txt' while this step reads
# 'all_data.txt' as GBK -- presumably the merged file was renamed/converted
# by hand; confirm the file name and encoding before running.
with open('all_data.txt', encoding="gbk") as corpus_file:
    content = corpus_file.read()

# Clean-up: delete every run of non-word characters (newlines, punctuation,
# spaces). This single pass yields the same string as the former three-step
# "drop newlines -> replace symbols with a space -> drop spaces" sequence.
content = re.sub(r'\W+', '', content)

# Word segmentation.
seg_list = list(jieba.cut(content))

# Stop-word removal. The file holds one stop word per line.
with open('stopwords.txt', encoding="utf-8") as stopwords_file:
    stopwords = stopwords_file.read().split('\n')
# Membership is tested once per token, so look it up in a set (O(1)) rather
# than scanning the list each time; `stopwords` itself stays a list.
stopword_set = set(stopwords)
final_content = [seg for seg in seg_list if seg not in stopword_set]

# Frequency count. most_common() already returns the pairs ordered from the
# highest count to the lowest, so no additional sort is required.
counting_words = Counter(final_content)
common_words = counting_words.most_common(50)

# Write the (token, count) pairs to CSV. newline='' lets the csv module
# control line endings itself.
with open('word_excel.csv', 'w', encoding = 'utf-8', newline = '') as csvfile:
    write = csv.writer(csvfile)  # writer object used to emit each row
    write.writerow(['词组','词频'])  # header row
    write.writerows(common_words)

上一篇下一篇

猜你喜欢

热点阅读