You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/09 00:47:59 UTC

svn commit: r1228994 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/dict/ resources/org/apache/lucene/analysis/kuromoji/dict/ tools/java/org/apache/lucene/analysis/kuromoji/util/ tools/test/...

Author: rmuir
Date: Sun Jan  8 23:47:58 2012
New Revision: 1228994

URL: http://svn.apache.org/viewvc?rev=1228994&view=rev
Log:
LUCENE-3305: sort all input to make the targetmap always an increasing list of deltas (~1MB smaller jar)

Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Sun Jan  8 23:47:58 2012
@@ -60,21 +60,22 @@ public abstract class BinaryDictionary i
       DataInput in = new InputStreamDataInput(mapIS);
       CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
       targetMap = new int[in.readVInt()][];
-      for (int j = 0; j < targetMap.length;) {
+      int accum = 0;
+      for (int j = 0; j < targetMap.length; j++) {
         final int len = in.readVInt();
-        if (len == 0) {
-          // decode RLE: number of nulls
-          j += in.readVInt();
+        final int a[];
+        if ((len & 1) == 1) {
+          a = new int[1];
+          accum += len >>> 1;
+          a[0] = accum;
         } else {
-          final int[] a = new int[len];
-          int accum = 0;
-          for (int i = 0; i < len; i++) {
+          a = new int[len >>> 1];
+          for (int i = 0; i < a.length; i++) {
             accum += in.readVInt();
             a[i] = accum;
           }
-          targetMap[j] = a;
-          j++;
         }
+        targetMap[j] = a;
       }
       
       posIS = getClass().getResourceAsStream(getClass().getSimpleName() + POSDICT_FILENAME_SUFFIX);

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Sun Jan  8 23:47:58 2012
@@ -158,7 +158,16 @@ public abstract class BinaryDictionaryWr
     }
   }
   
+  // only for assert
+  int lastWordId = -1;
+  int lastSourceId = 0;
+
   public void addMapping(int sourceId, int wordId) {
+    assert sourceId >= lastSourceId;
+    assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
+    lastSourceId = sourceId;
+    lastWordId = wordId;
+    
     if(targetMap.length <= sourceId) {
       final int newSize = ArrayUtil.oversize(sourceId + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
       int[][] newArray = new int[newSize][];
@@ -211,35 +220,27 @@ public abstract class BinaryDictionaryWr
       final DataOutput out = new OutputStreamDataOutput(os);
       CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
       out.writeVInt(targetMapSize);
-      int nulls = 0;
+      int prev = 0;
       for (int j = 0; j < targetMapSize; j++) {
         final int size = targetMapComponentSizes[j];
-        if (size == 0) {
-          // run-length encoding for all nulls:
-          if (nulls == 0) {
-            out.writeVInt(0);
-          }
-          nulls++;
+        // note: size is 0 for ONLY wordID 0 of TokenInfoDictionary
+        // this is because the FST uses 0 for NO_OUTPUT... 
+        if (size == 1) {
+          int delta = targetMap[j][0] - prev;
+          assert delta >= 0;
+          out.writeVInt(delta << 1 | 1);
+          prev += delta;
         } else {
-          if (nulls > 0) {
-            out.writeVInt(nulls);
-            nulls = 0;
-          }
+          out.writeVInt(size << 1);
           final int[] a = targetMap[j];
-          Arrays.sort(a, 0, size);
-          assert size > 0 && size <= a.length;
-          out.writeVInt(size);
-          int prev = 0;
           for (int i = 0; i < size; i++) {
-            out.writeVInt(a[i] - prev);
-            prev = a[i];
+            int delta = a[i] - prev;
+            assert delta >= 0;
+            out.writeVInt(delta);
+            prev += delta;
           }
         }
       }
-      // write the pending RLE count:
-      if (nulls > 0) {
-        out.writeVInt(nulls);
-      }
     } finally {
       os.close();
     }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Sun Jan  8 23:47:58 2012
@@ -27,8 +27,10 @@ import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 import java.util.Map.Entry;
 import java.util.Set;
@@ -87,6 +89,8 @@ public class TokenInfoDictionaryBuilder 
   public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
     TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
     
+    // all lines in the file
+    List<String[]> lines = new ArrayList<String[]>();
     for (File file : csvFiles){
       FileInputStream inputStream = new FileInputStream(file);
       Charset cs = Charset.forName(encoding);
@@ -103,15 +107,7 @@ public class TokenInfoDictionaryBuilder 
           System.out.println("Entry in CSV is not valid: " + line);
           continue;
         }
-        int next = dictionary.put(formatEntry(entry));
-        
-        if(next == offset){
-          System.out.println("Failed to process line: " + line);
-          continue;
-        }
-        
-        dictionaryEntries.put(offset, entry[0]);
-        offset = next;
+        lines.add(formatEntry(entry));
         
         // NFKC normalize dictionary entry
         if (normalizeEntries) {
@@ -122,14 +118,32 @@ public class TokenInfoDictionaryBuilder 
           for (int i = 0; i < entry.length; i++) {
             normalizedEntry[i] = normalizer.normalize(entry[i]);
           }
-          
-          next = dictionary.put(formatEntry(normalizedEntry));
-          dictionaryEntries.put(offset, normalizedEntry[0]);
-          offset = next;
+            
+          lines.add(formatEntry(normalizedEntry));
         }
       }
     }
     
+    // sort by term
+    Collections.sort(lines, new Comparator<String[]>() {
+      public int compare(String[] left, String[] right) {
+        return left[0].compareTo(right[0]);
+      }
+    });
+    
+    for (String[] entry : lines) {
+      int next = dictionary.put(entry);
+        
+      if(next == offset){
+        System.out.println("Failed to process line: " + Arrays.toString(entry));
+        continue;
+      }
+        
+      dictionaryEntries.put(offset, entry[0]);
+      offset = next;
+    }
+    
+    // TODO: we can do this in parallel
     System.out.print("  building FST...");
     FST<Long> fst = buildFST();
     dictionary.setFST(fst);
@@ -236,6 +250,7 @@ public class TokenInfoDictionaryBuilder 
   private FST<Long> buildFST() throws IOException {    
     FST<Long> words;
     Collection<String> values = dictionaryEntries.values();
+    // TODO: we don't need to sort again, we could just check != last
     TreeSet<String> unique = new TreeSet<String>(values);
     PositiveIntOutputs o = PositiveIntOutputs.getSingleton(true);
     Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE2, o);

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Sun Jan  8 23:47:58 2012
@@ -25,6 +25,12 @@ import java.io.LineNumberReader;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
 
 public class UnknownDictionaryBuilder {
   private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*,*,*";
@@ -60,11 +66,24 @@ public class UnknownDictionaryBuilder {
     
     dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
     
+    List<String[]> lines = new ArrayList<String[]>();
     String line = null;
     while ((line = lineReader.readLine()) != null) {
       // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
       // even though the unknown dictionary returns hardcoded null here.
-      dictionary.put(CSVUtil.parse(line + ",*,*")); // Probably we don't need to validate entry
+      lines.add(CSVUtil.parse(line + ",*,*")); // Probably we don't need to validate entry
+    }
+    
+    Collections.sort(lines, new Comparator<String[]>() {
+      public int compare(String[] left, String[] right) {
+        int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
+        int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
+        return leftId - rightId;
+      }
+    });
+    
+    for (String[] entry : lines) {
+      dictionary.put(entry);
     }
     
     return dictionary;

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java Sun Jan  8 23:47:58 2012
@@ -59,14 +59,14 @@ public class UnknownDictionaryTest exten
     } catch(Exception e){
       
     }
-    
-    String entry1 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";
-    String entry2 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
-    String entry3 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
-    
-    unkDic.putCharacterCategory(0, "KANJI");
-    unkDic.putCharacterCategory(1, "ALPHA");
-    unkDic.putCharacterCategory(2, "HIRAGANA");
+
+    String entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
+    String entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
+    String entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";
+
+    unkDic.putCharacterCategory(0, "ALPHA");
+    unkDic.putCharacterCategory(1, "HIRAGANA");
+    unkDic.putCharacterCategory(2, "KANJI");
     
     unkDic.put(CSVUtil.parse(entry1));
     unkDic.put(CSVUtil.parse(entry2));