关键词提取之TF-IDF
2019-11-11 本文已影响0人
乘瓠散人
TF-IDF: term frequency - inverse document frequency
in order to measure the importance of a word in the document set
方法1
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Toy corpus: two short "documents" over three distinct terms.
corpus = ['first second third',
          'second third third third']

# smooth_idf=False uses the unsmoothed idf formula ln(N / df) + 1,
# which produces the weights shown in the output below.
vectorizer = TfidfVectorizer(smooth_idf=False)
tfidf_matrix = vectorizer.fit_transform(corpus)
print('tfidf matrix:')
print(tfidf_matrix.toarray())

# get_feature_names() was removed in scikit-learn 1.2; the supported
# replacement is get_feature_names_out(). Compute it once and reuse.
terms = vectorizer.get_feature_names_out()
print('all terms:', list(terms))

# Doc-term matrix: rows = documents, columns = vocabulary terms.
res = pd.DataFrame(tfidf_matrix.toarray(), columns=terms)
print('doc-term matrix:')
print(res)
运行结果:
tfidf matrix:
[[0.76749457 0.45329466 0.45329466]
[0. 0.31622777 0.9486833 ]]
all terms: ['first', 'second', 'third']
doc-term matrix:
first second third
0 0.767495 0.453295 0.453295
1 0.000000 0.316228 0.948683
由此可见,得到的 terms 都是 distinct 的(已去重),因此可构成后续的词频表。
方法2
# -*- coding: utf-8 -*-
import jieba
import os
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import re
def read_data(input_file):
    """Read a JSON-lines corpus, tokenize with jieba, and drop stop words.

    Each line of ``input_file`` must be a JSON object with ``title`` and
    ``content`` fields. Returns a list with one space-joined token string
    per input line — the whitespace-delimited format CountVectorizer
    expects for pre-tokenized (e.g. Chinese) text.
    """
    # Load the stop-word list once. The original left the file handle
    # open; 'with' closes it deterministically. Explicit utf-8 avoids
    # depending on the platform default encoding.
    with open('chinese_stopword.txt', encoding='utf-8') as sw:
        stop_word = [line.rstrip() for line in sw]
    # Membership is tested once per token below — a set makes that O(1)
    # instead of a linear scan of the whole stop-word list.
    stop_set = set(stop_word)
    print('num of stop_words:', len(stop_word))
    print(stop_word[:5])

    train_list = []
    with open(input_file, encoding='utf-8') as f:
        for l in tqdm(f):
            data = json.loads(l.strip())
            # Prepend the title so its words count toward the document.
            content = data['title'] + '。' + data['content']
            # Remove whitespace characters that would otherwise leak
            # into tokens or the space-joined output.
            for ch in ('\n', ' ', '\t', '\r'):
                content = content.replace(ch, '')
            words_after = [w for w in jieba.cut(content) if w not in stop_set]
            result = ' '.join(words_after)
            print(result)
            train_list.append(result)
    return train_list
if __name__ == "__main__":
train_list = read_data("demo_train.txt")
vectorizer = CountVectorizer()
# 将每条文本转成对应的词频向量 a[i][j]表示第i条样本中第j个词的频率
X = vectorizer.fit_transform(train_list) # [-1, key_words_num]
# 词袋模型里的所有词语
key_words = vectorizer.get_feature_names() # 所有文本的关键字
# 训练
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
weight = tfidf.toarray() # a[i][j]表示第i条样本中第j个词的tf-idf权重
for d, wgt in enumerate(weight):
print('Sample ', d)
loc = np.argsort(-wgt)
# 打印权重最大的4个关键词
for i in range(4):
print("Keyword-{}:{}".format(key_words[loc[i]], wgt[loc[i]]))
print("*" * 20)
# 测试
test_list = read_data("demo_test.txt")
X_test = vectorizer.transform(test_list)
test_tfidf = transformer.transform(X_test)
test_weight = test_tfidf.toarray()
with open('test_keywords.txt', 'a+', encoding="utf-8") as f:
for d, wgt in enumerate(test_weight):
print('Test Sample ', d)
loc = np.argsort(-wgt)
tmp = dict()
for i in range(4):
print("Keyword-{}:{}".format(key_words[loc[i]], wgt[loc[i]]))
k = key_words[loc[i]]
v = wgt[loc[i]]
tmp[k] = v
jsonstr = json.dumps(tmp, ensure_ascii=False) # 避免中文为ascii码
f.write(jsonstr + '\n')
https://github.com/wzyonggege/tf-idf
TF-IDF 使用sklearn包实现
tfidf_CountVectorizer 与 TfidfTransformer 保存和测试 - 今夜无风
fit_transform,fit,transform区别和作用详解
python解析字符编码中的Unicode和UTF-8