采用Nagao算法统计各个子字符串的频次,然后基于这些频次统计每个字符串的词频、左右邻个数、左右熵、交互信息(内部凝聚度)。
名词解释:
Nagao算法:一种快速的统计文本里所有子字符串频次的算法。详细算法可见http://www.doc88.com/p-664123446503.html
词频:该字符串在文档中出现的次数。出现次数越多越重要。
左右邻个数:文档中该字符串的左边和右边出现的不同的字的个数。左右邻越多,说明字符串成词概率越高。
左右熵:文档中该字符串的左边和右边出现的不同的字的数量分布的熵。类似上面的指标,有一定区别。
交互信息:每次将某字符串分成两部分,左半部分字符串和右半部分字符串,计算其同时出现的概率除于其各自独立出现的概率,最后取所有的划分里面概率最小值。这个值越大,说明字符串内部凝聚度越高,越可能成词。
算法具体流程:
1. 将输入文件逐行读入,按照非汉字([^\u4E00-\u9FA5]+)以及停词“的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说”,
分成一个个字符串,代码如下:
String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
停用词可以修改。
2. 获取所有切分后的字符串的左子串和右子串,分别加入左、右PTable
3. 对PTable排序,并计算LTable。LTable记录的是,排序后的PTable中,下一个子串同上一个子串具有相同字符的数量
4. 遍历PTable和LTable,即可得到所有子字符串的词频、左右邻
5. 根据所有子字符串的词频、左右邻结果,输出字符串的词频、左右邻个数、左右熵、交互信息
1. NagaoAlgorithm.java
package com.algo.word; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; public class NagaoAlgorithm { private int N; private List<String> leftPTable; private int[] leftLTable; private List<String> rightPTable; private int[] rightLTable; private double wordNumber; private Map<String, TFNeighbor> wordTFNeighbor; private final static String stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说"; private NagaoAlgorithm(){ //default N = 5 N = 5; leftPTable = new ArrayList<String>(); rightPTable = new ArrayList<String>(); wordTFNeighbor = new HashMap<String, TFNeighbor>(); } //reverse phrase private String reverse(String phrase) { StringBuilder reversePhrase = new StringBuilder(); for (int i = phrase.length() - 1; i >= 0; i--) reversePhrase.append(phrase.charAt(i)); return reversePhrase.toString(); } //co-prefix length of s1 and s2 private int coPrefixLength(String s1, String s2){ int coPrefixLength = 0; for(int i = 0; i < Math.min(s1.length(), s2.length()); i++){ if(s1.charAt(i) == s2.charAt(i)) coPrefixLength++; else break; } return coPrefixLength; } //add substring of line to pTable private void addToPTable(String line){ //split line according to consecutive none Chinese character String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]"); for(String phrase : phrases){ for(int i = 0; i < phrase.length(); i++) rightPTable.add(phrase.substring(i)); String reversePhrase = reverse(phrase); for(int i = 0; i < reversePhrase.length(); i++) leftPTable.add(reversePhrase.substring(i)); wordNumber += phrase.length(); } } //count lTable private void countLTable(){ Collections.sort(rightPTable); rightLTable = new int[rightPTable.size()]; for(int i = 1; i < rightPTable.size(); i++) rightLTable[i] = coPrefixLength(rightPTable.get(i-1), rightPTable.get(i)); Collections.sort(leftPTable); leftLTable = new int[leftPTable.size()]; for(int i = 1; i < leftPTable.size(); i++) leftLTable[i] = coPrefixLength(leftPTable.get(i-1), leftPTable.get(i)); System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable"); } //according to pTable and lTable, count statistical result: TF, neighbor distribution private void countTFNeighbor(){ //get TF and right neighbor for(int pIndex = 0; pIndex < rightPTable.size(); pIndex++){ String phrase = rightPTable.get(pIndex); for(int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++){ String word = phrase.substring(0, length); TFNeighbor tfNeighbor = new TFNeighbor(); tfNeighbor.incrementTF(); if(phrase.length() > length) tfNeighbor.addToRightNeighbor(phrase.charAt(length)); for(int lIndex = pIndex+1; lIndex < rightLTable.length; lIndex++){ if(rightLTable[lIndex] >= length){ tfNeighbor.incrementTF(); String coPhrase = rightPTable.get(lIndex); if(coPhrase.length() > length) tfNeighbor.addToRightNeighbor(coPhrase.charAt(length)); } else break; } wordTFNeighbor.put(word, tfNeighbor); } } //get left neighbor for(int pIndex = 0; pIndex < leftPTable.size(); pIndex++){ String phrase = leftPTable.get(pIndex); for(int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++){ String word = reverse(phrase.substring(0, length)); TFNeighbor tfNeighbor = wordTFNeighbor.get(word); if(phrase.length() > length) tfNeighbor.addToLeftNeighbor(phrase.charAt(length)); for(int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++){ if(leftLTable[lIndex] >= length){ String coPhrase = leftPTable.get(lIndex); if(coPhrase.length() > length) tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length)); } else break; } } } System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor"); } //according to wordTFNeighbor, count MI of word private double countMI(String word){ if(word.length() <= 1) return 0; double coProbability = wordTFNeighbor.get(word).getTF()/wordNumber; List<Double> mi = new ArrayList<Double>(word.length()); for(int pos = 1; pos < word.length(); pos++){ String leftPart = word.substring(0, pos); String rightPart = word.substring(pos); double leftProbability = wordTFNeighbor.get(leftPart).getTF()/wordNumber; double rightProbability = wordTFNeighbor.get(rightPart).getTF()/wordNumber; mi.add(coProbability/(leftProbability*rightProbability)); } return Collections.min(mi); } //save TF, (left and right) neighbor number, neighbor entropy, mutual information private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){ try { //read stop words file Set<String> stopWords = new HashSet<String>(); BufferedReader br = new BufferedReader(new FileReader(stopList)); String line; while((line = br.readLine()) != null){ if(line.length() > 1) stopWords.add(line); } br.close(); //output words TF, neighbor info, MI BufferedWriter bw = new BufferedWriter(new FileWriter(out)); for(Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()){ if( entry.getKey().length() <= 1 || stopWords.contains(entry.getKey()) ) continue; TFNeighbor tfNeighbor = entry.getValue(); int tf, leftNeighborNumber, rightNeighborNumber; double mi; tf = tfNeighbor.getTF(); leftNeighborNumber = tfNeighbor.getLeftNeighborNumber(); rightNeighborNumber = tfNeighbor.getRightNeighborNumber(); mi = countMI(entry.getKey()); if(tf > Integer.parseInt(threshold[0]) && leftNeighborNumber > Integer.parseInt(threshold[1]) && rightNeighborNumber > Integer.parseInt(threshold[2]) && mi > Integer.parseInt(threshold[3]) ){ StringBuilder sb = new StringBuilder(); sb.append(entry.getKey()); sb.append(",").append(tf); sb.append(",").append(leftNeighborNumber); sb.append(",").append(rightNeighborNumber); sb.append(",").append(tfNeighbor.getLeftNeighborEntropy()); sb.append(",").append(tfNeighbor.getRightNeighborEntropy()); sb.append(",").append(mi).append("\n"); bw.write(sb.toString()); } } bw.close(); } catch (IOException e) { throw new RuntimeException(e); } System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file"); } //apply nagao algorithm to input file public static void applyNagao(String[] inputs, String out, String stopList){ NagaoAlgorithm nagao = new NagaoAlgorithm(); //step 1: add phrases to PTable String line; for(String in : inputs){ try { BufferedReader br = new BufferedReader(new FileReader(in)); while((line = br.readLine()) != null){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable"); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5".split(",")); } public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter){ NagaoAlgorithm nagao = new NagaoAlgorithm(); nagao.setN(n); String[] threshold = filter.split(","); if(threshold.length != 4){ System.out.println("ERROR: filter must have 4 numbers, seperated with ',' "); return; } //step 1: add phrases to PTable String line; for(String in : inputs){ try { BufferedReader br = new BufferedReader(new FileReader(in)); while((line = br.readLine()) != null){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable"); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, threshold); } private void setN(int n){ N = n; } public static void main(String[] args) { String[] ins = {"E://test//ganfen.txt"}; applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt"); } }
2. TFNeighbor.java
package com.algo.word; import java.util.HashMap; import java.util.Map; public class TFNeighbor { private int tf; private Map<Character, Integer> leftNeighbor; private Map<Character, Integer> rightNeighbor; TFNeighbor(){ leftNeighbor = new HashMap<Character, Integer>(); rightNeighbor = new HashMap<Character, Integer>(); } //add word to leftNeighbor public void addToLeftNeighbor(char word){ //leftNeighbor.put(word, 1 + leftNeighbor.getOrDefault(word, 0)); Integer number = leftNeighbor.get(word); leftNeighbor.put(word, number == null? 1: 1+number); } //add word to rightNeighbor public void addToRightNeighbor(char word){ //rightNeighbor.put(word, 1 + rightNeighbor.getOrDefault(word, 0)); Integer number = rightNeighbor.get(word); rightNeighbor.put(word, number == null? 1: 1+number); } //increment tf public void incrementTF(){ tf++; } public int getLeftNeighborNumber(){ return leftNeighbor.size(); } public int getRightNeighborNumber(){ return rightNeighbor.size(); } public double getLeftNeighborEntropy(){ double entropy = 0; int sum = 0; for(int number : leftNeighbor.values()){ entropy += number*Math.log(number); sum += number; } if(sum == 0) return 0; return Math.log(sum) - entropy/sum; } public double getRightNeighborEntropy(){ double entropy = 0; int sum = 0; for(int number : rightNeighbor.values()){ entropy += number*Math.log(number); sum += number; } if(sum == 0) return 0; return Math.log(sum) - entropy/sum; } public int getTF(){ return tf; } }
3. Main.java
package com.algo.word; public class Main { public static void main(String[] args) { //if 3 arguments, first argument is input files splitting with ',' //second argument is output file //output 7 columns split with ',' , like below: //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information //third argument is stop words list if(args.length == 3) NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]); //if 4 arguments, forth argument is the NGram parameter N //5th argument is threshold of output words, default is "20,3,3,5" //output TF > 20 && (left | right) neighbor number > 3 && MI > 5 else if(args.length == 5) NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]); } }
以上所述就是本文的全部内容了,希望大家能够喜欢。