人民日报语料库 - 百家姓 with JAVA

2020-04-20  本文已影响0人  Lairai
package homework3;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Builds a per-family-name frequency table of person names from a part-of-speech tagged corpus.
 *
 * <p>The corpus is read line by line and split on whitespace into {@code word/tag} tokens.
 * A token that matches {@link #markPattern} (a run of CJK characters tagged {@code /nr}) is a
 * name part. Two <em>directly adjacent</em> name parts on the same line are taken as
 * "family name, personal name" and counted as one occurrence of that full name.
 *
 * <p>Typical use: call {@link #getFrequency(String)} and then {@link #printResult()}.
 * Counts accumulate when several corpora are fed to the same instance.
 */
public class Task3 {
    /** Scanner state: the next name token is interpreted as a family name. */
    static final int FAMILY_NAME = 0;
    /** Scanner state: the next name token is interpreted as a personal name. */
    static final int PERSONAL_NAME = 1;

    /** Output file used by {@link #printResult()}. */
    private static final String DEFAULT_RESULT_PATH =
            "C:\\Coding\\javaCoding\\NLP\\src\\homework3\\3.txt";
    /** Maximum number of full names listed after each family name in the report. */
    private static final int TOP_NAMES_PER_FAMILY = 5;

    /** Family name mapped to (full name mapped to occurrence count). */
    Map<String, Map<Name, Integer>> freqDicFamilyName = new HashMap<>();
    /** One row per family name, ordered by total count descending; rebuilt after every scan. */
    List<Element> resList = new ArrayList<>();

    /** Regex a whole token must match to be treated as a family-name or personal-name token. */
    String markPattern = "\\[?[\\u4e00-\\u9fa5]+/nr.*";
    /** Regex whose first match inside a name token is the name text itself. */
    String namePattern = "[\\u4e00-\\u9fa5]+";

    /**
     * Scans the corpus file at the given path and refreshes the result list.
     *
     * <p>The file is decoded with the platform default charset (the behavior of
     * {@link FileReader}) and is always closed, even when reading fails.
     *
     * @param corpus path of the tagged corpus file
     * @throws IOException if the file cannot be opened or read
     */
    public void getFrequency(String corpus) throws IOException {
        try (BufferedReader corpusReader = new BufferedReader(new FileReader(corpus))) {
            getFrequencyFrom(corpusReader);
        }
    }

    /**
     * Scans a tagged corpus from an already opened reader and refreshes the result list.
     *
     * <p>The reader is consumed to its end but not closed; closing is the caller's job.
     *
     * @param corpusReader source of corpus lines
     * @throws IOException if reading fails
     */
    public void getFrequencyFrom(BufferedReader corpusReader) throws IOException {
        // Compile once per scan instead of once per token.
        Pattern markRegex = Pattern.compile(markPattern);
        Pattern nameRegex = Pattern.compile(namePattern);

        String line;
        while ((line = corpusReader.readLine()) != null) {
            countNamesInLine(line, markRegex, nameRegex);
        }
        rebuildResultList();
    }

    /**
     * Writes the report to the default result file, encoded as UTF-8.
     *
     * @throws Exception if the file cannot be created or written (in practice an
     *         {@link IOException}; the broad declaration is kept for existing callers)
     */
    public void printResult() throws Exception {
        File resFile = new File(DEFAULT_RESULT_PATH);
        // FileOutputStream creates the file when it is missing, so no createNewFile() is needed.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(resFile), "UTF-8"))) {
            printResult(writer);
        }
    }

    /**
     * Writes the report to the given writer.
     *
     * <p>Format: a {@code Size: N} line, then one line per family name holding the family
     * name, its total count, three spaces, and up to five {@code fullName count } pairs in
     * descending count order. The writer is flushed but not closed.
     *
     * @param out destination of the report
     * @throws IOException if writing fails
     */
    public void printResult(Writer out) throws IOException {
        out.append("Size: " + resList.size() + '\n');
        for (Element e : resList) {
            out.append(e.commonName + " " + e.frequencySum + "   ");
            int written = 0;
            for (Map.Entry<Name, Integer> entry : e.nameMap.entrySet()) {
                if (written >= TOP_NAMES_PER_FAMILY) {
                    break;
                }
                out.append(entry.getKey().toString() + " " + entry.getValue() + " ");
                ++written;
            }
            out.append('\n');
        }
        out.flush();
    }

    /** Counts every adjacent "family name, personal name" token pair found in one line. */
    private void countNamesInLine(String line, Pattern markRegex, Pattern nameRegex) {
        int expected = FAMILY_NAME;
        String familyName = null;
        for (String token : line.split("\\s+")) {
            String name = extractName(token, markRegex, nameRegex);
            if (name == null) {
                // A non-name token breaks adjacency: drop any family name still waiting for
                // its personal name, otherwise one lone name token would pair with an
                // unrelated later token and shift every following pair.
                expected = FAMILY_NAME;
                continue;
            }
            if (expected == FAMILY_NAME) {
                familyName = name;
                expected = PERSONAL_NAME;
            } else {
                Name fullName = new Name(familyName, name);
                freqDicFamilyName
                        .computeIfAbsent(familyName, key -> new HashMap<>())
                        .merge(fullName, 1, Integer::sum);
                expected = FAMILY_NAME;
            }
        }
    }

    /** Returns the name text of a token, or {@code null} when the token is not a name token. */
    private static String extractName(String token, Pattern markRegex, Pattern nameRegex) {
        if (!markRegex.matcher(token).matches()) {
            return null;
        }
        Matcher nameMatcher = nameRegex.matcher(token);
        return nameMatcher.find() ? nameMatcher.group() : null;
    }

    /** Rebuilds {@link #resList} from the dictionary, most frequent family name first. */
    private void rebuildResultList() {
        // Start from scratch so that repeated scans do not append duplicate rows.
        resList.clear();
        for (Map.Entry<String, Map<Name, Integer>> entry : freqDicFamilyName.entrySet()) {
            int sumFreq = 0;
            for (int count : entry.getValue().values()) {
                sumFreq += count;
            }
            resList.add(new Element(entry.getKey(), sumFreq,
                    sortByValueDescending(entry.getValue())));
        }
        resList.sort((o1, o2) -> Integer.compare(o2.frequencySum, o1.frequencySum));
    }

    /**
     * Returns a copy of the map whose iteration order is by value, largest first.
     * Entries with equal values keep the relative order they had in the input map.
     */
    private static <K, V extends Comparable<? super V>> Map<K, V> sortByValueDescending(
            Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<>(map.entrySet());
        entries.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue()));

        Map<K, V> result = new LinkedHashMap<>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }

    /** A full name split into family name and personal name; usable as a hash key. */
    private static final class Name {
        final String familyName;
        final String personalName;

        Name(String familyName, String personalName) {
            this.familyName = familyName;
            this.personalName = personalName;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            Name other = (Name) o;
            return Objects.equals(familyName, other.familyName)
                    && Objects.equals(personalName, other.personalName);
        }

        @Override
        public int hashCode() {
            // Same formula as before, so hash-based iteration order is unaffected.
            return 31 * Objects.hashCode(familyName) + Objects.hashCode(personalName);
        }

        @Override
        public String toString() {
            return familyName + personalName;
        }
    }

    /** One report row: a family name, its total count, and its full names by count. */
    private static final class Element {
        /** The family name shared by every entry of {@link #nameMap}. */
        final String commonName;
        /** Sum of the counts of all full names with this family name. */
        final int frequencySum;
        /** Full name mapped to count, iterating in descending count order. */
        final Map<Name, Integer> nameMap;

        Element(String commonName, int frequencySum, Map<Name, Integer> nameMap) {
            this.commonName = commonName;
            this.frequencySum = frequencySum;
            this.nameMap = nameMap;
        }
    }
}
上一篇 下一篇

猜你喜欢

热点阅读