人民日报语料库的汉字词频统计 with JAVA
2020-04-19 本文已影响0人
Lairai
实在用不惯perl...就用Java写了(我先用notepad转码成了UTF-8)
package homework3;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Counts how often each Chinese-character word occurs in a corpus whose lines
 * consist of whitespace-separated tokens of the form {@code word/tag}
 * (optionally prefixed with {@code '['}), and writes the counts to a text file.
 *
 * <p>Typical use: call {@link #getFrequency(String)} one or more times, then
 * {@link #printResult()} or {@link #printResult(String)}.
 */
public class ChineseWordFrequency {
    /**
     * Word to occurrence count. After {@link #getFrequency(String)} returns, this is an
     * insertion-ordered map whose iteration order is highest count first.
     */
    Map<String, Integer> freqDic = new HashMap<String, Integer>();

    /**
     * Regex a token must match in full to be counted: optional '[', one or more characters
     * in U+4E00..U+9FA5, '/', a lowercase ASCII letter, then anything.
     */
    String markPattern = "\\[?[\\u4e00-\\u9fa5]+/[a-z].*";

    /** Regex for the run of U+4E00..U+9FA5 characters extracted from a matching token. */
    String wordPattern = "[\\u4e00-\\u9fa5]+";

    /** Output location used by the no-argument {@link #printResult()}. */
    private static final String DEFAULT_RESULT_PATH =
            "C:\\Coding\\javaCoding\\NLP\\src\\homework3\\1.txt";

    /**
     * Reads the corpus file as UTF-8, adds every matching word to {@code freqDic}, and then
     * re-orders {@code freqDic} by descending count.
     *
     * <p>Counts accumulate across calls; calling this for several files yields combined totals.
     *
     * @param corpus path of the corpus file to read
     * @throws IOException if the file cannot be opened or read
     */
    public void getFrequency(String corpus) throws IOException {
        // Compile once per call instead of once per token; the pattern strings are still
        // read from the fields so a caller that changed them is honoured.
        Pattern tokenShape = Pattern.compile(markPattern);
        Pattern wordShape = Pattern.compile(wordPattern);

        // Decode explicitly as UTF-8 (matching the encoding used for the output file) rather
        // than relying on the platform default, and close the reader even on failure.
        try (BufferedReader corpusReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(corpus), "UTF-8"))) {
            String line;
            while ((line = corpusReader.readLine()) != null) {
                for (String mark : line.split("\\s+")) {
                    if (!tokenShape.matcher(mark).matches()) {
                        continue;
                    }
                    // The first run of Chinese characters in the token is the word itself.
                    Matcher matcher = wordShape.matcher(mark);
                    if (matcher.find()) {
                        String word = matcher.group();
                        Integer seen = freqDic.get(word);
                        freqDic.put(word, seen == null ? 1 : seen + 1);
                    }
                }
            }
        }

        freqDic = sortByValueDescending(freqDic);
    }

    /**
     * Writes the current counts to the fixed default result path.
     *
     * @throws Exception if the file cannot be created or written
     * @see #printResult(String)
     */
    public void printResult() throws Exception {
        printResult(DEFAULT_RESULT_PATH);
    }

    /**
     * Writes the current counts as UTF-8 text: a first line {@code "Size: <entries>"} followed
     * by one {@code "<word> <count>"} line per entry, in the map's iteration order.
     *
     * @param resultPath path of the file to create or overwrite
     * @throws IOException if the file cannot be created or written
     */
    public void printResult(String resultPath) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(resultPath), "UTF-8"))) {
            writer.append("Size: " + freqDic.size() + '\n');
            for (Map.Entry<String, Integer> entry : freqDic.entrySet()) {
                writer.append(entry.getKey() + " " + entry.getValue() + '\n');
            }
        }
    }

    /**
     * Returns a new insertion-ordered map holding the entries of {@code map} sorted by value,
     * largest first. The sort is stable, so entries with equal values keep the relative order
     * in which {@code map} iterates them.
     */
    private static <K, V extends Comparable<? super V>> Map<K, V> sortByValueDescending(
            Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<Map.Entry<K, V>>(map.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // Reversed operands give descending order without negating the result.
                return o2.getValue().compareTo(o1.getValue());
            }
        });

        Map<K, V> result = new LinkedHashMap<K, V>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}