You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/12 15:28:23 UTC
svn commit: r1230564 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji: dict/UserDictionary.java viterbi/Viterbi.java

Author: rmuir
Date: Thu Jan 12 14:28:23 2012
New Revision: 1230564

URL: http://svn.apache.org/viewvc?rev=1230564&view=rev
Log:
LUCENE-3305: cutover UserDictionary to FST

Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1230564&r1=1230563&r2=1230564&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Thu Jan 12 14:28:23 2012
@@ -21,17 +21,26 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 
 import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
 
 public final class UserDictionary implements Dictionary {
   
-  private final TreeMap<String, int[]> entries = new TreeMap<String, int[]>();
+  private final TokenInfoFST fst;
   
-  private final String featureEntries[];
+  // holds wordid offset, length
+  private final int segmentations[][];
+  // holds readings and POS
+  private final String data[];
   
   private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
   
@@ -45,7 +54,9 @@ public final class UserDictionary implem
     BufferedReader br = new BufferedReader(reader);
     String line = null;
     int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
-    List<String> featureEntries = new ArrayList<String>();
+    List<String[]> featureEntries = new ArrayList<String[]>();
+ 
+    // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
       // Remove comments
       line = line.replaceAll("#.*$", "");
@@ -55,6 +66,28 @@ public final class UserDictionary implem
         continue;
       }
       String[] values = CSVUtil.parse(line);
+      featureEntries.add(values);
+    }
+    
+    // TODO: should we allow multiple segmentations per input 'phrase'?
+    // the old treemap didn't support this either, and i'm not sure if its needed/useful?
+
+    Collections.sort(featureEntries, new Comparator<String[]>() {
+      @Override
+      public int compare(String[] left, String[] right) {
+        return left[0].compareTo(right[0]);
+     }
+    });
+    
+    List<String> data = new ArrayList<String>(featureEntries.size());
+    List<int[]> segmentations = new ArrayList<int[]>(featureEntries.size());
+    
+    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
+    Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput);
+    IntsRef scratch = new IntsRef();
+    long ord = 0;
+    
+    for (String[] values : featureEntries) {
       String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
       String[] readings = values[2].replaceAll("  *", " ").split(" ");
       String pos = values[3];
@@ -68,12 +101,23 @@ public final class UserDictionary implem
       wordIdAndLength[0] = wordId;
       for (int i = 0; i < segmentation.length; i++) {
         wordIdAndLength[i + 1] = segmentation[i].length();
-        featureEntries.add(readings[i] + INTERNAL_SEPARATOR + pos);
+        data.add(readings[i] + INTERNAL_SEPARATOR + pos);
         wordId++;
       }
-      entries.put(values[0], wordIdAndLength);
-    }
-    this.featureEntries = featureEntries.toArray(new String[featureEntries.size()]);
+      // add mapping to FST
+      String token = values[0];
+      scratch.grow(token.length());
+      scratch.length = token.length();
+      for (int i = 0; i < token.length(); i++) {
+        scratch.ints[i] = (int) token.charAt(i);
+      }
+      fstBuilder.add(scratch, fstOutput.get(ord));
+      segmentations.add(wordIdAndLength);
+      ord++;
+    }
+    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
+    this.data = data.toArray(new String[data.size()]);
+    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
   }
   
   /**
@@ -83,20 +127,26 @@ public final class UserDictionary implem
    * @param len length of text
    * @return array of {wordId, position, length}
    */
-  public int[][] lookup(char[] chars, int off, int len) {
-    // TODO: this method should be more efficient.
-    String text = new String(chars, off, len);
+  public int[][] lookup(char[] chars, int off, int len) throws IOException {
+    // TODO: can we avoid this treemap/toIndexArray?
     TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
     
-    for (String keyword : entries.descendingKeySet()) {
-      int offset = 0;
-      int position = text.indexOf(keyword, offset);
-      while (offset < text.length() && position >= 0) {
-        if(!result.containsKey(position)){
-          result.put(position, entries.get(keyword));
+    FST.Arc<Long> arc = new FST.Arc<Long>();
+    int end = off + len;
+    for (int startOffset = off; startOffset < end; startOffset++) {
+      arc = fst.getFirstArc(arc);
+      int output = 0;
+      int remaining = end - startOffset;
+      for (int i = 0; i < remaining; i++) {
+        int ch = chars[startOffset+i];
+        if (fst.findTargetArc(ch, arc, arc, i == 0) == null) {
+          break; // continue to next position
+        }
+        output += arc.output.intValue();
+        if (arc.isFinal()) {
+          output += arc.nextFinalOutput.intValue();
+          result.put(startOffset-off, segmentations[output]);
         }
-        offset += position + keyword.length();
-        position = text.indexOf(keyword, offset);
       }
     }
     
@@ -170,7 +220,7 @@ public final class UserDictionary implem
   }
   
   private String[] getAllFeaturesArray(int wordId) {
-    String allFeatures = featureEntries[wordId-CUSTOM_DICTIONARY_WORD_ID_OFFSET];
+    String allFeatures = data[wordId-CUSTOM_DICTIONARY_WORD_ID_OFFSET];
     if(allFeatures == null) {
       return null;
     }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1230564&r1=1230563&r2=1230564&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Thu Jan 12 14:28:23 2012
@@ -293,7 +293,7 @@ public class Viterbi {
    * @param startSizeArr
    * @param endSizeArr
    */
-  private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
+  private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) throws IOException {
     int[][] result = userDictionary.lookup(text, offset, len);
     for(int[] segmentation : result) {
       int wordId = segmentation[0];