You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/16 20:22:51 UTC
svn commit: r1232122 - in /lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji: ./ src/java/org/apache/lucene/analysis/kuromoji/dict/ src/resources/org/apache/lucene/analysis/kuromoji/dict/ src/tools/java/org/apache/lucene/analysis/kuromoji...

Author: rmuir
Date: Mon Jan 16 19:22:50 2012
New Revision: 1232122

URL: http://svn.apache.org/viewvc?rev=1232122&view=rev
Log:
LUCENE-3699: simplify dictionary access and reduce tokeninfodictionary 1.5MB

Removed:
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat
Modified:
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Mon Jan 16 19:22:50 2012
@@ -37,12 +37,10 @@ public abstract class BinaryDictionary i
   public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
   public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
   public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
-  public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
   
   public static final String DICT_HEADER = "kuromoji_dict";
   public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
   public static final String POSDICT_HEADER = "kuromoji_dict_pos";
-  public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
   public static final int VERSION = 1;
   
   private final ByteBuffer buffer;
@@ -52,7 +50,7 @@ public abstract class BinaryDictionary i
   private final String[] inflFormDict;
   
   protected BinaryDictionary() throws IOException {
-    InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
+    InputStream mapIS = null, dictIS = null, posIS = null;
     IOException priorE = null;
     int[] targetMapOffsets = null, targetMap = null;
     String[] posDict = null;
@@ -85,25 +83,24 @@ public abstract class BinaryDictionary i
       posIS = new BufferedInputStream(posIS);
       in = new InputStreamDataInput(posIS);
       CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
-      posDict = new String[in.readVInt()];
-      for (int j = 0; j < posDict.length; j++) {
+      int posSize = in.readVInt();
+      posDict = new String[posSize];
+      inflTypeDict = new String[posSize];
+      inflFormDict = new String[posSize];
+      for (int j = 0; j < posSize; j++) {
         posDict[j] = in.readString();
-      }
-      posIS.close(); posIS = null;
-      
-      inflIS = getResource(INFLDICT_FILENAME_SUFFIX);
-      inflIS = new BufferedInputStream(inflIS);
-      in = new InputStreamDataInput(inflIS);
-      CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
-      int length = in.readVInt();
-      inflTypeDict = new String[length];
-      inflFormDict = new String[length];
-      for (int j = 0; j < length; j++) {
         inflTypeDict[j] = in.readString();
         inflFormDict[j] = in.readString();
+        // this is how we encode null inflections
+        if (inflTypeDict[j].length() == 0) {
+          inflTypeDict[j] = null;
+        }
+        if (inflFormDict[j].length() == 0) {
+          inflFormDict[j] = null;
+        }
       }
-      inflIS.close(); inflIS = null;
-
+      posIS.close(); posIS = null;
+      
       dictIS = getResource(DICT_FILENAME_SUFFIX);
       // no buffering here, as we load in one large buffer
       in = new InputStreamDataInput(dictIS);
@@ -120,7 +117,7 @@ public abstract class BinaryDictionary i
     } catch (IOException ioe) {
       priorE = ioe;
     } finally {
-      IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
+      IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, dictIS);
     }
     
     this.targetMap = targetMap;
@@ -152,27 +149,27 @@ public abstract class BinaryDictionary i
   
   //@Override	
   public int getLeftId(int wordId) {
-    return buffer.getShort(wordId);
+    return buffer.getShort(wordId) >>> 2;
   }
   
   //@Override
   public int getRightId(int wordId) {
-    return buffer.getShort(wordId + 2);	// Skip left id
+    return buffer.getShort(wordId) >>> 2;
   }
   
   //@Override
   public int getWordCost(int wordId) {
-    return buffer.getShort(wordId + 4);	// Skip left id and right id
+    return buffer.getShort(wordId + 2);	// Skip id
   }
 
   //@Override
   public String getBaseForm(int wordId) {
-    int offset = baseFormOffset(wordId);
-    int length = (buffer.get(offset++) & 0xff) >>> 1;
-    if (length == 0) {
-      return null; // same as surface form
-    } else {
+    if (hasBaseFormData(wordId)) {
+      int offset = baseFormOffset(wordId);
+      int length = buffer.get(offset++) & 0xff;
       return readString(offset, length, false);
+    } else {
+      return null;
     }
   }
   
@@ -185,8 +182,7 @@ public abstract class BinaryDictionary i
   
   //@Override
   public String getPartOfSpeech(int wordId) {
-    int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
-    return posDict[posIndex >>> 1];
+    return posDict[getLeftId(wordId)];
   }
   
   //@Override
@@ -202,28 +198,26 @@ public abstract class BinaryDictionary i
   
   //@Override
   public String getInflectionType(int wordId) {
-    int index = getInflectionIndex(wordId);
-    return index < 0 ? null : inflTypeDict[index];
+    return inflTypeDict[getLeftId(wordId)];
   }
 
   //@Override
   public String getInflectionForm(int wordId) {
-    int index = getInflectionIndex(wordId);
-    return index < 0 ? null : inflFormDict[index];
-  }
-  
-  private static int posOffset(int wordId) {
-    return wordId + 6;
+    return inflFormDict[getLeftId(wordId)];
   }
   
   private static int baseFormOffset(int wordId) {
-    return wordId + 7;
+    return wordId + 4;
   }
   
   private int readingOffset(int wordId) {
     int offset = baseFormOffset(wordId);
-    int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
-    return offset + baseFormLength;
+    if (hasBaseFormData(wordId)) {
+      int baseFormLength = buffer.get(offset++) & 0xff;
+      return offset + (baseFormLength << 1);
+    } else {
+      return offset;
+    }
   }
   
   private int pronunciationOffset(int wordId) {
@@ -238,41 +232,12 @@ public abstract class BinaryDictionary i
     return offset + readingLength;
   }
   
-  private boolean hasPronunciationData(int wordId) {
-    int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
-    return (baseFormData & 1) == 0;
-  }
-  
-  private boolean hasInflectionData(int wordId) {
-    int posData = buffer.get(posOffset(wordId)) & 0xff;
-    return (posData & 1) == 1;
+  private boolean hasBaseFormData(int wordId) {
+    return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
   }
   
-  private int getInflectionIndex(int wordId) {
-    if (!hasInflectionData(wordId)) {
-      return -1; // common case: no inflection data
-    }
-    
-    // skip past reading/pronunciation at the end
-    int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
-    int endData = buffer.get(offset++) & 0xff;
-    
-    final int endLength;
-    if ((endData & 1) == 0) {
-      endLength = endData & 0xfe; // UTF-16: mask off kana bit
-    } else {
-      endLength = endData >>> 1;
-    }
-    
-    offset += endLength;
-    
-    byte b = buffer.get(offset++);
-    int i = b & 0x7F;
-    if ((b & 0x80) == 0) return i;
-    b = buffer.get(offset++);
-    i |= (b & 0x7F) << 7;
-    assert ((b & 0x80) == 0);
-    return i;
+  private boolean hasPronunciationData(int wordId) {
+    return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
   }
   
   private String readString(int offset, int length, boolean kana) {
@@ -288,4 +253,9 @@ public abstract class BinaryDictionary i
     }
     return new String(text);
   }
+  
+  /** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
+  public static final int HAS_BASEFORM = 1;
+  /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
+  public static final int HAS_PRONUNCIATION = 2;
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Mon Jan 16 19:22:50 2012
@@ -26,11 +26,6 @@ import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.WritableByteChannel;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.HashMap;
-import java.util.Map;
 
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.OutputStreamDataOutput;
@@ -45,11 +40,7 @@ public abstract class BinaryDictionaryWr
   private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
   private int[] targetMap = new int[8192];
   private int[] targetMapOffsets = new int[8192];
-  private final List<String> posDict = new ArrayList<String>();
-  private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>();
-  
-  private final List<String> inflDict = new ArrayList<String>();
-  private final Map<String,Integer> inflDictLookup = new HashMap<String,Integer>();
+  private final ArrayList<String> posDict = new ArrayList<String>();
 
   public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
     this.implClazz = implClazz;
@@ -78,35 +69,20 @@ public abstract class BinaryDictionaryWr
         sb.append(part);
       }
     }
-    String pos = sb.toString();
-    Integer posIndex = posDictLookup.get(pos);
-    if (posIndex == null) {
-      posIndex = posDict.size();
-      posDict.add(pos);
-      posDictLookup.put(pos, posIndex);
-      assert posDict.size() == posDictLookup.size();
-    }
+    
+    String posData = sb.toString();
     
     sb.setLength(0);
-    sb.append(CSVUtil.quoteEscape(entry[8]));
+    sb.append(CSVUtil.quoteEscape(posData));
     sb.append(',');
-    sb.append(CSVUtil.quoteEscape(entry[9]));
-    String inflData = sb.toString();
-    
-    Integer inflIndex = Integer.MAX_VALUE;
-    int hasInflData;
-    if ("*,*".equals(inflData)) {
-      hasInflData = 0; // no inflection data
-    } else {
-      hasInflData = 1;
-      inflIndex = inflDictLookup.get(inflData);
-      if (inflIndex == null) {
-        inflIndex = inflDict.size();
-        inflDict.add(inflData);
-        inflDictLookup.put(inflData, inflIndex);
-        assert inflDict.size() == inflDictLookup.size();
-      }
+    if (!"*".equals(entry[8])) {
+      sb.append(CSVUtil.quoteEscape(entry[8]));
     }
+    sb.append(',');
+    if (!"*".equals(entry[9])) {
+      sb.append(CSVUtil.quoteEscape(entry[9]));
+    }
+    String fullPOSData = sb.toString();
     
     String baseForm = entry[10];
     String reading = entry[11];
@@ -114,28 +90,40 @@ public abstract class BinaryDictionaryWr
     
     // extend buffer if necessary
     int left = buffer.remaining();
-    // worst case: three short, 4 bytes, one vint and features (all as utf-16)
-    int worstCase = 6 + 4 + 2 + 2*(baseForm.length() + reading.length() + pronunciation.length());
+    // worst case: two short, 3 bytes, and features (all as utf-16)
+    int worstCase = 4 + 3 + 2*(baseForm.length() + reading.length() + pronunciation.length());
     if (worstCase > left) {
       ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1));
       buffer.flip();
       newBuffer.put(buffer);
       buffer = newBuffer;
     }
+
+    int flags = 0;
+    if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
+      flags |= BinaryDictionary.HAS_BASEFORM;
+    }
+    if (!pronunciation.equals(reading)) {
+      flags |= BinaryDictionary.HAS_PRONUNCIATION;
+    }
+
+    assert leftId == rightId;
+    assert leftId < 8192; // there are still unused bits
+    // add pos mapping
+    int toFill = 1+leftId - posDict.size();
+    for (int i = 0; i < toFill; i++) {
+      posDict.add(null);
+    }
     
-    buffer.putShort(leftId);
-    buffer.putShort(rightId);
-    buffer.putShort(wordCost);
-    assert posIndex.intValue() < 128;
-    buffer.put((byte) (posIndex.intValue() << 1 | hasInflData));
-    
-    int pronunciationIsReading = pronunciation.equals(reading) ? 1 : 0;
+    String existing = posDict.get(leftId);
+    assert existing == null || existing.equals(fullPOSData);
+    posDict.set(leftId, fullPOSData);
     
-    if ("*".equals(baseForm) || baseForm.equals(entry[0])) {
-      buffer.put((byte)pronunciationIsReading); // base form is the same as surface form
-    } else {
-      assert baseForm.length() < 128;
-      buffer.put((byte)(baseForm.length() << 1 | pronunciationIsReading));
+    buffer.putShort((short)(leftId << 2 | flags));
+    buffer.putShort(wordCost);
+
+    if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
+      buffer.put((byte) baseForm.length());
       for (int i = 0; i < baseForm.length(); i++) {
         buffer.putChar(baseForm.charAt(i));
       }
@@ -151,7 +139,7 @@ public abstract class BinaryDictionaryWr
       }
     }
     
-    if (pronunciationIsReading == 0) {
+    if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
       if (isKatakana(pronunciation)) {
         buffer.put((byte) (pronunciation.length() << 1 | 1));
         writeKatakana(pronunciation);
@@ -163,17 +151,6 @@ public abstract class BinaryDictionaryWr
       }
     }
     
-    if (hasInflData > 0) {
-      int key = inflIndex.intValue();
-      assert key < 32768; // note there are really like 300 of these...
-      if (key < 128) {
-        buffer.put((byte) key);
-      } else {
-        buffer.put((byte) ((key & 0x7f) | 0x80));
-        buffer.put((byte) (key >>> 7));
-      }
-    }
-    
     return buffer.position();
   }
   
@@ -229,7 +206,6 @@ public abstract class BinaryDictionaryWr
     writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
     writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
     writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
-    writeInflDict(baseName + BinaryDictionary.INFLDICT_FILENAME_SUFFIX);
   }
   
   // TODO: maybe this int[] should instead be the output to the FST...
@@ -271,26 +247,17 @@ public abstract class BinaryDictionaryWr
       CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
       out.writeVInt(posDict.size());
       for (String s : posDict) {
-        out.writeString(s);
-      }
-    } finally {
-      os.close();
-    }
-  }
-  
-  protected void writeInflDict(String filename) throws IOException {
-    new File(filename).getParentFile().mkdirs();
-    OutputStream os = new FileOutputStream(filename);
-    try {
-      os = new BufferedOutputStream(os);
-      final DataOutput out = new OutputStreamDataOutput(os);
-      CodecUtil.writeHeader(out, BinaryDictionary.INFLDICT_HEADER, BinaryDictionary.VERSION);
-      out.writeVInt(inflDict.size());
-      for (String s : inflDict) {
-        String data[] = CSVUtil.parse(s);
-        assert data.length == 2 : "malformed inflection: " + s;
-        out.writeString(data[0]);
-        out.writeString(data[1]);
+        if (s == null) {
+          out.writeByte((byte)0);
+          out.writeByte((byte)0);
+          out.writeByte((byte)0);
+        } else {
+          String data[] = CSVUtil.parse(s);
+          assert data.length == 3 : "malformed pos/inflection: " + s;
+          out.writeString(data[0]);
+          out.writeString(data[1]);
+          out.writeString(data[2]);
+        }
       }
     } finally {
       os.close();
@@ -313,59 +280,4 @@ public abstract class BinaryDictionaryWr
       os.close();
     }
   }
-  
-  // TODO: the below is messy, but makes the dictionary smaller.
-  // we track frequencies of inflections so the highest-freq ones have smaller indexes.
-
-  /** optional: notes inflection seen in the data up front */
-  public void noteInflection(String entry[]) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(CSVUtil.quoteEscape(entry[8]));
-    sb.append(',');
-    sb.append(CSVUtil.quoteEscape(entry[9]));
-    String s = sb.toString();
-    if ("*,*".equals(s)) {
-      return; // no inflection data
-    }
-    Integer freq = notedInflections.get(s);
-    if (freq == null) {
-      freq = 0;
-    }
-    notedInflections.put(s, freq+1);
-  }
-  
-  /** prepopulates inflection mapping by frequency */
-  public void finalizeInflections() {
-    InflectionAndFreq freqs[] = new InflectionAndFreq[notedInflections.size()];
-    int upto = 0;
-    for (Map.Entry<String,Integer> e : notedInflections.entrySet()) {
-      freqs[upto++] = new InflectionAndFreq(e.getKey(), e.getValue());
-    }
-    Arrays.sort(freqs, Collections.reverseOrder());
-    for (int i = 0; i < upto; i++) {
-      inflDict.add(freqs[i].inflection);
-      inflDictLookup.put(freqs[i].inflection, i);
-    }
-  }
-  
-  static class InflectionAndFreq implements Comparable<InflectionAndFreq> {
-    String inflection;
-    int freq;
-    
-    InflectionAndFreq(String s, int i) {
-      this.inflection = s;
-      this.freq = i;
-    }
-    
-    public int compareTo(InflectionAndFreq other) {
-      int cmp = freq - other.freq;
-      if (cmp == 0) {
-        return inflection.compareTo(other.inflection);
-      } else {
-        return cmp;
-      }
-    }
-  }
-  
-  private HashMap<String,Integer> notedInflections = new HashMap<String,Integer>();
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Mon Jan 16 19:22:50 2012
@@ -101,7 +101,6 @@ public class TokenInfoDictionaryBuilder 
         }
         
         String[] formatted = formatEntry(entry);
-        dictionary.noteInflection(formatted);
         lines.add(formatted);
         
         // NFKC normalize dictionary entry
@@ -115,14 +114,11 @@ public class TokenInfoDictionaryBuilder 
           }
           
           formatted = formatEntry(normalizedEntry);
-          dictionary.noteInflection(formatted);
           lines.add(formatted);
         }
       }
     }
     
-    dictionary.finalizeInflections();
-    
     System.out.println("  sort...");
 
     // sort by term: we sorted the files already and use a stable sort.

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Mon Jan 16 19:22:50 2012
@@ -33,7 +33,7 @@ import java.util.List;
 import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
 
 public class UnknownDictionaryBuilder {
-  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*,*,*";
+  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
   
   private String encoding = "euc-jp";
   
@@ -73,9 +73,7 @@ public class UnknownDictionaryBuilder {
       // even though the unknown dictionary returns hardcoded null here.
       final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
       lines.add(parsed);
-      dictionary.noteInflection(parsed); // for completeness; I think unk.def has no inflections...
     }
-    dictionary.finalizeInflections(); // should also be no-op
     
     Collections.sort(lines, new Comparator<String[]>() {
       public int compare(String[] left, String[] right) {