分词

2020-02-06  本文已影响0人  月夜星空下
import jieba
import re
# Prompt the user for the raw text that the rest of the script segments.
word = input("请输入:")
# Build the stop-word list.
def stopwordslist(path='/Users/lilong/Desktop/stop_words'):
    """Load stop words from a UTF-8 text file, one entry per line.

    Args:
        path: Location of the stop-word file. Defaults to the path this
            script was originally hard-coded to read, so existing
            no-argument calls behave as before.

    Returns:
        A list holding one whitespace-stripped string per line of the
        file, in file order. Blank lines yield empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if decoding fails part-way;
    # the previous bare open() left the file open until garbage collection.
    with open(path, encoding='UTF-8') as stop_file:
        return [line.strip() for line in stop_file]

# Keep only the runs of CJK characters (U+4E00-U+9FA5) from the input and
# join them into one string; everything else in the input is discarded.
cleaned_data = re.findall(u"[\u4e00-\u9fa5]+", word)
chinese_text = ''.join(cleaned_data)

# Announce the segmentation before it starts rather than after it finishes.
print("正在分词...")
tokens = jieba.lcut(chinese_text)

# A set gives O(1) membership tests; the list is kept under its old name.
stopwords = stopwordslist()
stopword_set = set(stopwords)

# Remove stop words. Each surviving token is followed by a single space, so
# the result keeps the trailing space the original output had. No separate
# tab check is needed: chinese_text contains CJK characters only.
outstr = ''.join(token + " " for token in tokens if token not in stopword_set)
print(outstr)
print(type(outstr))
上一篇下一篇

猜你喜欢

热点阅读