Python LDA 主题模型分析（存档笔记）

2022-05-05 · 作者：VivaVida

使用 gensim + jieba 对中文文本做 LDA 主题建模的完整脚本。


"""LDA topic-model analysis over a Chinese text file.

Pipeline: read documents (one per line) -> jieba POS-filtered tokenization
with stop-word removal -> gensim dictionary / bag-of-words -> TF-IDF ->
LDA with `num_topics` topics -> print per-topic word distributions.
"""
from gensim import corpora, models
import jieba.posseg as jp
import numpy as np

# --- Corpus: one document per line ---------------------------------------
with open(r'文档路径', encoding='utf-8') as f:
    texts = [line.split() for line in f]

# Number of documents.
M = len(texts)

# POS tags to keep (nouns, person/place/org names, English, verbs, adverbs).
flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')

# Stop words: materialize the file into a set once.
# (The original tested membership against the open file object, which
# iterates raw lines with trailing '\n' and is exhausted after the first
# document, silently disabling the filter afterwards.)
with open(r'C:\Users\Hikaru\PycharmProjects\pythonPractice\datamining\stopwords.txt',
          encoding='utf-8') as sf:
    stopwords = {line.strip() for line in sf}

stopword = ['想想', '越来越']  # extra ad-hoc stop words

# --- Tokenization ---------------------------------------------------------
words_ls = []
for doc in texts:
    text = ''.join(doc)
    # Keep tokens whose POS is allowed, that are not stop words, and that
    # are longer than one character.
    words = [w.word for w in jp.cut(text)
             if w.flag in flags
             and w.word not in stopwords
             and w.word not in stopword
             and len(w.word) > 1]
    words_ls.append(words)
# print(words_ls)

# --- Dictionary and sparse bag-of-words corpus ----------------------------
dictionary = corpora.Dictionary(words_ls)
corpus = [dictionary.doc2bow(words) for words in words_ls]
print(dictionary.token2id)
print(corpus)

# --- TF-IDF weighting -----------------------------------------------------
corpus_tfidf = models.TfidfModel(corpus)[corpus]

# --- LDA model ------------------------------------------------------------
# alpha: document-topic smoothing; eta: topic-word smoothing.
# Smaller values make distributions more peaked (fewer topics/words per doc).
num_topics = 3
lda = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary,
                               num_topics=num_topics, alpha=0.01, eta=0.01)

# Top words of every topic.
for topic in lda.print_topics(num_words=10):
    print(topic)

# 主题推断 (topic inference over all documents) — kept for reference:
# for e, values in enumerate(lda.inference(corpus)[0]):
#     print(texts[e])
#     for ee, value in enumerate(values):
#         print('\t主题%d推断值%.2f' % (ee, value))

# --- Per-topic word distributions -----------------------------------------
num_show_term = 10  # words to show per topic

# Bug fix: the original looped range(20) while the model only has 3 topics,
# which fails as soon as topic_id >= num_topics.
for topic_id in range(num_topics):
    print('主题#%d:\t' % topic_id)
    # topn replaces the original redundant manual slice [:10].
    term_distribute = lda.get_topic_terms(topicid=topic_id, topn=num_show_term)
    print(term_distribute)
    term_distribute = np.array(term_distribute)
    # np.int was removed in NumPy 1.24; the builtin int is the replacement.
    term_id = term_distribute[:, 0].astype(int)
    print('词:', end="")
    for t in term_id:
        print(dictionary.id2token[t], end=' ')
    print('概率:', end="")
    print(term_distribute[:, 1])

# # 主题推断 (raw inference output):
# print(lda.inference(corpus))

上一篇下一篇

猜你喜欢

热点阅读