Keyword Extraction with TF-IDF

2019-11-11  乘瓠散人

TF-IDF (term frequency - inverse document frequency) measures how important a word is to a document within a document set: a term scores high when it occurs often in that document but rarely in the rest of the corpus.
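In scikit-learn's formulation, tf-idf(t, d) = tf(t, d) * idf(t), where tf(t, d) is the count of term t in document d. With smooth_idf=False (as in Method 1 below), idf(t) = ln(n / df(t)) + 1, where n is the number of documents and df(t) is how many of them contain t; each document vector is then L2-normalized by default (norm='l2').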

Method 1

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

corpus = ['first second third',
          'second third third third']

vectorizer = TfidfVectorizer(smooth_idf=False)
tfidf_matrix = vectorizer.fit_transform(corpus)

print('tfidf matrix:')
print(tfidf_matrix.toarray())

# on scikit-learn >= 1.0, use get_feature_names_out() instead
print('all terms:', vectorizer.get_feature_names())

res = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names())
print('doc-term matrix:')
print(res)

Output:

tfidf matrix:
[[0.76749457 0.45329466 0.45329466]
 [0.         0.31622777 0.9486833 ]]
all terms: ['first', 'second', 'third']
doc-term matrix:
      first    second     third
0  0.767495  0.453295  0.453295
1  0.000000  0.316228  0.948683

As the output shows, the extracted terms are all distinct, so they can serve as the vocabulary for the term-frequency table used later.
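As a sanity check, these numbers can be reproduced by hand from the formula above. A minimal sketch, with the term order and document frequencies taken from the toy corpus:

import numpy as np

# term order matches get_feature_names(): ['first', 'second', 'third']
tf = np.array([[1.0, 1.0, 1.0],    # 'first second third'
               [0.0, 1.0, 3.0]])   # 'second third third third'
df = np.array([1, 2, 2])           # document frequency of each term
idf = np.log(2 / df) + 1           # smooth_idf=False, n = 2 documents

scores = tf * idf
scores /= np.linalg.norm(scores, axis=1, keepdims=True)  # norm='l2'
print(scores)
# [[0.76749457 0.45329466 0.45329466]
#  [0.         0.31622777 0.9486833 ]]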

Method 2
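Method 2 splits the pipeline into CountVectorizer (raw term counts) plus TfidfTransformer (TF-IDF weighting), and adds jieba word segmentation with stop-word filtering so that it works on raw Chinese text.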

# -*- coding: utf-8 -*-

import json

import jieba
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tqdm import tqdm


def read_data(input_file):

    # load the stop-word list (one word per line); assumes the file is UTF-8
    with open('chinese_stopword.txt', encoding='utf-8') as sw:
        stop_words = set(line.rstrip() for line in sw)
    print('num of stop_words:', len(stop_words))

    train_list = []
    with open(input_file, encoding='utf-8') as f:
        for l in tqdm(f):
            data = json.loads(l.strip())
            # prepend the title to the body, then strip all whitespace characters
            content = data['title'] + '。' + data['content']
            content = content.replace('\n', '').replace(' ', '').replace('\t', '').replace('\r', '')
            # segment with jieba and drop stop words
            words = [w for w in jieba.cut(content) if w not in stop_words]
            # CountVectorizer expects space-separated tokens
            train_list.append(' '.join(words))

    return train_list
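# Input format note: each line of demo_train.txt / demo_test.txt is assumed
# to be a standalone JSON object with "title" and "content" fields, e.g.
# (hypothetical example):
#   {"title": "新闻标题", "content": "新闻正文……"}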
    

if __name__ == "__main__":

    train_list = read_data("demo_train.txt")

    vectorizer = CountVectorizer()
    # turn each document into a term-count vector:
    # X[i][j] is the count of term j in document i, shape [num_docs, vocab_size]
    X = vectorizer.fit_transform(train_list)
    # the full vocabulary of the bag-of-words model, in column order
    key_words = vectorizer.get_feature_names()

    # fit IDF statistics on the training corpus and compute TF-IDF weights
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    weight = tfidf.toarray()  # weight[i][j]: tf-idf weight of term j in document i

    for d, wgt in enumerate(weight):
        print('Sample ', d)
        loc = np.argsort(-wgt)  # term indices sorted by descending weight
        # print the 4 highest-weighted keywords
        for i in range(4):
            print("Keyword-{}:{}".format(key_words[loc[i]], wgt[loc[i]]))

    print("*" * 20)

    # test: reuse the fitted vocabulary and IDF statistics (transform, not fit_transform)
    test_list = read_data("demo_test.txt")
    X_test = vectorizer.transform(test_list)
    test_tfidf = transformer.transform(X_test)
    test_weight = test_tfidf.toarray()

    with open('test_keywords.txt', 'a+', encoding="utf-8") as f:
        for d, wgt in enumerate(test_weight):
            print('Test Sample ', d)
            loc = np.argsort(-wgt)
            tmp = dict()
            for i in range(4):
                print("Keyword-{}:{}".format(key_words[loc[i]], wgt[loc[i]]))
                # cast to a plain float so json.dumps handles the numpy scalar
                tmp[key_words[loc[i]]] = float(wgt[loc[i]])

            # ensure_ascii=False keeps Chinese characters readable in the output
            jsonstr = json.dumps(tmp, ensure_ascii=False)
            f.write(jsonstr + '\n')
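
Incidentally, the CountVectorizer + TfidfTransformer pipeline computes the same weights as Method 1's TfidfVectorizer with matching parameters; TfidfVectorizer simply fuses the two steps. A quick check on the toy corpus from Method 1 (both sides use the default smooth_idf=True here):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

docs = ['first second third', 'second third third third']
two_step = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(docs))
one_step = TfidfVectorizer().fit_transform(docs)
print(np.allclose(two_step.toarray(), one_step.toarray()))  # True

Running the script appends one JSON object per test document to test_keywords.txt, mapping each of its top-4 keywords to its TF-IDF weight.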

