PythonShowMeTheCode(0006): 统计重要词

2016-08-23  本文已影响0人  tyrone_li

1. 题目

第 0006 题:你有一个目录,放了你一个月的日记,都是 txt,为了避免分词的问题,假设内容都是英文,请统计出你认为每篇日记最重要的词。

2. 扩展

3. 实现堆

class MinHeap:
    def __init__(self):
        self.heap_list = [()]
        self.size = 0

    def compare(self, item1, item2):
        if item1[1] < item2[1]:
            return 1
        elif item1[1] == item2[1]:
            return 0
        else:
            return -1

    def flow_up(self, i):
        while i // 2 > 0:
            if self.compare(self.heap_list[i], self.heap_list[i//2]) < 0:
                tmp = self.heap_list[i//2]
                self.heap_list[i//2] = self.heap_list[i]
                self.heap_list[i] = tmp
            i //= 2

    def insert(self, item):
        self.heap_list.append(item)
        self.size += 1
        self.flow_up(self.size)

    def flow_down(self, i):
        while i*2 <= self.size:
            min_child = self.get_min_child(i)
            if self.compare(self.heap_list[i], self.heap_list[min_child]) > 0:
                tmp = self.heap_list[min_child]
                self.heap_list[min_child] = self.heap_list[i]
                self.heap_list[i] = tmp
            i = min_child

    def get_min_child(self, i):
        if i*2+1 > self.size:
            return i*2
        else:
            if self.compare(self.heap_list[i*2], self.heap_list[i*2+1]) < 0:
                return i*2
            else:
                return i*2+1

    def pop_min(self):
        min_item = self.heap_list[1]
        self.heap_list[1] = self.heap_list[self.size]
        self.size -= 1
        self.heap_list.pop()
        self.flow_down(1)
        return min_item

    def build_heap(self, word_dict):
        i = len(word_dict) // 2
        self.size = len(word_dict)
        for item in word_dict.items():
            self.heap_list.append(item)
        while i > 0:
            self.flow_down(i)
            i -= 1

注意:当需要使用堆时,只需要继承这个类,重写compare()build_heap()方法即可。

4. 实现选词

# -*- coding:utf-8 -*-
import re
import os
import os.path
from min_heap import MinHeap


def get_word_dic(file_path=None):
    if file_path is None:
        print("Error")
        return
    word_dict = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file.readlines():
            words = re.findall(r"[a-z\'_-]+\b", line.lower())
            for word in words:
                if word not in word_dict:
                    word_dict[word] = 1
                else:
                    word_dict[word] += 1
    return word_dict


def get_top_k_words(word_dict, k):
    result = []
    dont_count = get_not_important_word_list("not-important-words.txt")
    min_heap = MinHeap()
    min_heap.build_heap(word_dict)
    while k > 0:
        item = min_heap.pop_min()
        if item[0] not in dont_count:
            result.append(item)
            k -= 1
    return result


def get_not_important_word_list(path):
    with open(path, "r", encoding="utf-8") as file:
        words = re.findall(r"[a-z\'_-]+", file.read().lower())
    return words


def get_words_important_dict(dir_path):
    if not os.path.isdir(dir_path):
        print("plz input path")
        return

    files = [os.path.join(dir_path, x) for x in os.listdir(dir_path) if os.path.splitext(x)[1] == ".txt" and x != "not-important-words.txt"]

    for file in files:
        print("File: " + file)
        word_dict = get_word_dic(file)
        print(get_top_k_words(word_dict, 10))


if __name__ == "__main__":
    get_words_important_dict(".")
上一篇 下一篇

猜你喜欢

热点阅读