人民日报语料库的汉字词频统计 with Java

2020-04-19  本文已影响0人  Lairai

实在用不惯 Perl……就用 Java 写了（我先用 Notepad 把语料文件转码成了 UTF-8）

package homework3;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Counts how often each Chinese word occurs in a POS-tagged corpus.
 *
 * <p>Each corpus line is split on whitespace. A token is counted when it fully matches
 * {@link #markPattern} (an optional {@code [}, a run of CJK characters, a slash and a tag that
 * starts with a lowercase letter). The word that is counted is the first run of CJK characters
 * found in that token.
 *
 * <p>Typical use: call {@link #getFrequency(String)} and then {@link #printResult()}.
 * Instances are not designed for concurrent use.
 */
public class ChineseWordFrequency {
    /** Output file used by the no-argument {@link #printResult()}. */
    private static final String DEFAULT_RESULT_PATH =
            "C:\\Coding\\javaCoding\\NLP\\src\\homework3\\1.txt";

    /** Charset used for both the corpus and the result file. */
    private static final String CHARSET_NAME = "UTF-8";

    /** Word to occurrence count; iterates in descending count order after {@link #getFrequency}. */
    Map<String, Integer> freqDic = new HashMap<String, Integer>();

    /** Regex a whole token must match to be treated as a tagged Chinese word. */
    String markPattern = "\\[?[\\u4e00-\\u9fa5]+/[a-z].*";

    /** Regex that extracts the Chinese word from a matching token. */
    String wordPattern = "[\\u4e00-\\u9fa5]+";

    /**
     * Reads the corpus, adds the word counts to {@link #freqDic}, and then re-orders the
     * dictionary by descending frequency.
     *
     * @param corpus path of the corpus file, which is decoded as UTF-8
     * @throws IOException if the file cannot be opened or read
     */
    public void getFrequency(String corpus) throws IOException {
        // Compile once per call instead of once per token; the pattern fields are still read
        // here so that a caller who reassigned them keeps getting the behaviour they asked for.
        Pattern tokenMatcher = Pattern.compile(markPattern);
        Pattern wordMatcher = Pattern.compile(wordPattern);

        // Decode explicitly as UTF-8: FileReader would use the platform default charset, which
        // garbles the text on systems whose default is not UTF-8. try-with-resources closes
        // the reader even when reading fails.
        try (BufferedReader corpusReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(corpus), CHARSET_NAME))) {
            String line;
            while ((line = corpusReader.readLine()) != null) {
                for (String token : line.split("\\s+")) {
                    if (!tokenMatcher.matcher(token).matches()) {
                        continue;
                    }
                    Matcher matcher = wordMatcher.matcher(token);
                    if (matcher.find()) {
                        String word = matcher.group();
                        Integer seen = freqDic.get(word);
                        freqDic.put(word, seen == null ? 1 : seen + 1);
                    }
                }
            }
        }

        // Most frequent words first.
        freqDic = sortByValueDescending(freqDic);
    }

    /**
     * Writes the dictionary to the default result file.
     *
     * @throws Exception if the file cannot be written
     * @see #printResult(String)
     */
    public void printResult() throws Exception {
        printResult(DEFAULT_RESULT_PATH);
    }

    /**
     * Writes the dictionary to {@code outputPath} as UTF-8: a {@code Size: N} header line
     * followed by one {@code word  count} line per entry, in the dictionary's iteration order.
     * An existing file is overwritten.
     *
     * @param outputPath path of the file to create or overwrite
     * @throws IOException if the file cannot be opened or written
     */
    public void printResult(String outputPath) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputPath), CHARSET_NAME))) {
            writer.append("Size: " + freqDic.size() + '\n');
            for (Map.Entry<String, Integer> entry : freqDic.entrySet()) {
                writer.append(entry.getKey() + "  " + entry.getValue() + '\n');
            }
        }
    }

    /**
     * Returns a copy of {@code map} whose iteration order is by descending value. Entries with
     * equal values keep the relative order they had in {@code map}, because the sort is stable.
     *
     * @param map the map to sort; it is not modified
     * @return a new insertion-ordered map holding the same entries, largest value first
     */
    private <K, V extends Comparable<? super V>> Map<K, V> sortByValueDescending(Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<Map.Entry<K, V>>(map.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // Swapped operands give descending order.
                return o2.getValue().compareTo(o1.getValue());
            }
        });

        Map<K, V> result = new LinkedHashMap<K, V>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}
上一篇下一篇

猜你喜欢

热点阅读