分词 jieba - python笔记

2020-08-14  本文已影响0人  自走炮
import jieba

# Plain segmentation (no filtering)
def chinese_cut1(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    ``cut_all=False`` selects jieba's precise mode (as opposed to full mode).
    """
    tokens = jieba.cut(text, cut_all=False)  # precise mode
    return ' '.join(tokens)

# Run the plain segmenter over every element of `data`.
# NOTE(review): `data` is not defined in this snippet; presumably a pandas
# Series of strings (it must expose `.apply`) — confirm against the caller.
datacutted = data.apply(chinese_cut1)

词性过滤

import jieba.posseg

# Part-of-speech filtering
def chinese_cut2(text):
    """Segment ``text`` and keep only words whose POS flag is 'a', 'n' or 'v'.

    The flag is compared for exact equality, so longer tags that merely start
    with one of these letters are dropped. The kept words are returned joined
    by single spaces.
    """
    # presumably adjective / noun / verb in jieba's tag set — confirm
    wanted_flags = ('a', 'n', 'v')
    kept_words = []
    for pair in jieba.posseg.cut(text):
        if pair.flag in wanted_flags:
            kept_words.append(pair.word)
    return ' '.join(kept_words)

# Run the part-of-speech-filtering segmenter over every element of `data`.
# NOTE(review): `data` is not defined in this snippet; presumably a pandas
# Series of strings (it must expose `.apply`) — confirm against the caller.
datacutted = data.apply(chinese_cut2)

自定义词典

jieba.load_userdict('dict.txt') # load a custom dictionary file

# Modify the dictionary at runtime
jieba.add_word('newword', freq = 10, tag = 'nz') # add a custom word with an explicit frequency and tag
jieba.del_word('word') # delete a custom word

# Batch-adjust word frequencies: call suggest_freq(..., True) for each entry in dict.txt.
# The original one-liner was a bare generator expression (a SyntaxError) and
# never closed the file; an explicit loop inside `with` fixes both.
# NOTE(review): the whole stripped line is passed as the word. If dict.txt uses
# a multi-field "word freq tag" layout, only the first field is probably
# wanted — confirm against the actual file.
with open('dict.txt', 'r', encoding = 'utf8') as dict_file:
    for line in dict_file:
        word = line.strip()
        if word:  # skip blank lines; an empty string is not a meaningful entry
            jieba.suggest_freq(word, True)
上一篇下一篇

猜你喜欢

热点阅读