You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/16 20:22:51 UTC
svn commit: r1232122 - in
/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji: ./
src/java/org/apache/lucene/analysis/kuromoji/dict/
src/resources/org/apache/lucene/analysis/kuromoji/dict/
src/tools/java/org/apache/lucene/analysis/kuromoji...
Author: rmuir
Date: Mon Jan 16 19:22:50 2012
New Revision: 1232122
URL: http://svn.apache.org/viewvc?rev=1232122&view=rev
Log:
LUCENE-3699: simplify dictionary access and reduce tokeninfodictionary 1.5MB
Removed:
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat
Modified:
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Mon Jan 16 19:22:50 2012
@@ -37,12 +37,10 @@ public abstract class BinaryDictionary i
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
- public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
public static final String DICT_HEADER = "kuromoji_dict";
public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
public static final String POSDICT_HEADER = "kuromoji_dict_pos";
- public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
public static final int VERSION = 1;
private final ByteBuffer buffer;
@@ -52,7 +50,7 @@ public abstract class BinaryDictionary i
private final String[] inflFormDict;
protected BinaryDictionary() throws IOException {
- InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
+ InputStream mapIS = null, dictIS = null, posIS = null;
IOException priorE = null;
int[] targetMapOffsets = null, targetMap = null;
String[] posDict = null;
@@ -85,25 +83,24 @@ public abstract class BinaryDictionary i
posIS = new BufferedInputStream(posIS);
in = new InputStreamDataInput(posIS);
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
- posDict = new String[in.readVInt()];
- for (int j = 0; j < posDict.length; j++) {
+ int posSize = in.readVInt();
+ posDict = new String[posSize];
+ inflTypeDict = new String[posSize];
+ inflFormDict = new String[posSize];
+ for (int j = 0; j < posSize; j++) {
posDict[j] = in.readString();
- }
- posIS.close(); posIS = null;
-
- inflIS = getResource(INFLDICT_FILENAME_SUFFIX);
- inflIS = new BufferedInputStream(inflIS);
- in = new InputStreamDataInput(inflIS);
- CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
- int length = in.readVInt();
- inflTypeDict = new String[length];
- inflFormDict = new String[length];
- for (int j = 0; j < length; j++) {
inflTypeDict[j] = in.readString();
inflFormDict[j] = in.readString();
+ // this is how we encode null inflections
+ if (inflTypeDict[j].length() == 0) {
+ inflTypeDict[j] = null;
+ }
+ if (inflFormDict[j].length() == 0) {
+ inflFormDict[j] = null;
+ }
}
- inflIS.close(); inflIS = null;
-
+ posIS.close(); posIS = null;
+
dictIS = getResource(DICT_FILENAME_SUFFIX);
// no buffering here, as we load in one large buffer
in = new InputStreamDataInput(dictIS);
@@ -120,7 +117,7 @@ public abstract class BinaryDictionary i
} catch (IOException ioe) {
priorE = ioe;
} finally {
- IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
+ IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, dictIS);
}
this.targetMap = targetMap;
@@ -152,27 +149,27 @@ public abstract class BinaryDictionary i
//@Override
public int getLeftId(int wordId) {
- return buffer.getShort(wordId);
+ return buffer.getShort(wordId) >>> 2;
}
//@Override
public int getRightId(int wordId) {
- return buffer.getShort(wordId + 2); // Skip left id
+ return buffer.getShort(wordId) >>> 2;
}
//@Override
public int getWordCost(int wordId) {
- return buffer.getShort(wordId + 4); // Skip left id and right id
+ return buffer.getShort(wordId + 2); // Skip id
}
//@Override
public String getBaseForm(int wordId) {
- int offset = baseFormOffset(wordId);
- int length = (buffer.get(offset++) & 0xff) >>> 1;
- if (length == 0) {
- return null; // same as surface form
- } else {
+ if (hasBaseFormData(wordId)) {
+ int offset = baseFormOffset(wordId);
+ int length = buffer.get(offset++) & 0xff;
return readString(offset, length, false);
+ } else {
+ return null;
}
}
@@ -185,8 +182,7 @@ public abstract class BinaryDictionary i
//@Override
public String getPartOfSpeech(int wordId) {
- int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
- return posDict[posIndex >>> 1];
+ return posDict[getLeftId(wordId)];
}
//@Override
@@ -202,28 +198,26 @@ public abstract class BinaryDictionary i
//@Override
public String getInflectionType(int wordId) {
- int index = getInflectionIndex(wordId);
- return index < 0 ? null : inflTypeDict[index];
+ return inflTypeDict[getLeftId(wordId)];
}
//@Override
public String getInflectionForm(int wordId) {
- int index = getInflectionIndex(wordId);
- return index < 0 ? null : inflFormDict[index];
- }
-
- private static int posOffset(int wordId) {
- return wordId + 6;
+ return inflFormDict[getLeftId(wordId)];
}
private static int baseFormOffset(int wordId) {
- return wordId + 7;
+ return wordId + 4;
}
private int readingOffset(int wordId) {
int offset = baseFormOffset(wordId);
- int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
- return offset + baseFormLength;
+ if (hasBaseFormData(wordId)) {
+ int baseFormLength = buffer.get(offset++) & 0xff;
+ return offset + (baseFormLength << 1);
+ } else {
+ return offset;
+ }
}
private int pronunciationOffset(int wordId) {
@@ -238,41 +232,12 @@ public abstract class BinaryDictionary i
return offset + readingLength;
}
- private boolean hasPronunciationData(int wordId) {
- int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
- return (baseFormData & 1) == 0;
- }
-
- private boolean hasInflectionData(int wordId) {
- int posData = buffer.get(posOffset(wordId)) & 0xff;
- return (posData & 1) == 1;
+ private boolean hasBaseFormData(int wordId) {
+ return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
}
- private int getInflectionIndex(int wordId) {
- if (!hasInflectionData(wordId)) {
- return -1; // common case: no inflection data
- }
-
- // skip past reading/pronunciation at the end
- int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
- int endData = buffer.get(offset++) & 0xff;
-
- final int endLength;
- if ((endData & 1) == 0) {
- endLength = endData & 0xfe; // UTF-16: mask off kana bit
- } else {
- endLength = endData >>> 1;
- }
-
- offset += endLength;
-
- byte b = buffer.get(offset++);
- int i = b & 0x7F;
- if ((b & 0x80) == 0) return i;
- b = buffer.get(offset++);
- i |= (b & 0x7F) << 7;
- assert ((b & 0x80) == 0);
- return i;
+ private boolean hasPronunciationData(int wordId) {
+ return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
}
private String readString(int offset, int length, boolean kana) {
@@ -288,4 +253,9 @@ public abstract class BinaryDictionary i
}
return new String(text);
}
+
+ /** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
+ public static final int HAS_BASEFORM = 1;
+ /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
+ public static final int HAS_PRONUNCIATION = 2;
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Mon Jan 16 19:22:50 2012
@@ -26,11 +26,6 @@ import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.HashMap;
-import java.util.Map;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
@@ -45,11 +40,7 @@ public abstract class BinaryDictionaryWr
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
private int[] targetMap = new int[8192];
private int[] targetMapOffsets = new int[8192];
- private final List<String> posDict = new ArrayList<String>();
- private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>();
-
- private final List<String> inflDict = new ArrayList<String>();
- private final Map<String,Integer> inflDictLookup = new HashMap<String,Integer>();
+ private final ArrayList<String> posDict = new ArrayList<String>();
public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
this.implClazz = implClazz;
@@ -78,35 +69,20 @@ public abstract class BinaryDictionaryWr
sb.append(part);
}
}
- String pos = sb.toString();
- Integer posIndex = posDictLookup.get(pos);
- if (posIndex == null) {
- posIndex = posDict.size();
- posDict.add(pos);
- posDictLookup.put(pos, posIndex);
- assert posDict.size() == posDictLookup.size();
- }
+
+ String posData = sb.toString();
sb.setLength(0);
- sb.append(CSVUtil.quoteEscape(entry[8]));
+ sb.append(CSVUtil.quoteEscape(posData));
sb.append(',');
- sb.append(CSVUtil.quoteEscape(entry[9]));
- String inflData = sb.toString();
-
- Integer inflIndex = Integer.MAX_VALUE;
- int hasInflData;
- if ("*,*".equals(inflData)) {
- hasInflData = 0; // no inflection data
- } else {
- hasInflData = 1;
- inflIndex = inflDictLookup.get(inflData);
- if (inflIndex == null) {
- inflIndex = inflDict.size();
- inflDict.add(inflData);
- inflDictLookup.put(inflData, inflIndex);
- assert inflDict.size() == inflDictLookup.size();
- }
+ if (!"*".equals(entry[8])) {
+ sb.append(CSVUtil.quoteEscape(entry[8]));
}
+ sb.append(',');
+ if (!"*".equals(entry[9])) {
+ sb.append(CSVUtil.quoteEscape(entry[9]));
+ }
+ String fullPOSData = sb.toString();
String baseForm = entry[10];
String reading = entry[11];
@@ -114,28 +90,40 @@ public abstract class BinaryDictionaryWr
// extend buffer if necessary
int left = buffer.remaining();
- // worst case: three short, 4 bytes, one vint and features (all as utf-16)
- int worstCase = 6 + 4 + 2 + 2*(baseForm.length() + reading.length() + pronunciation.length());
+ // worst case: two short, 3 bytes, and features (all as utf-16)
+ int worstCase = 4 + 3 + 2*(baseForm.length() + reading.length() + pronunciation.length());
if (worstCase > left) {
ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1));
buffer.flip();
newBuffer.put(buffer);
buffer = newBuffer;
}
+
+ int flags = 0;
+ if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
+ flags |= BinaryDictionary.HAS_BASEFORM;
+ }
+ if (!pronunciation.equals(reading)) {
+ flags |= BinaryDictionary.HAS_PRONUNCIATION;
+ }
+
+ assert leftId == rightId;
+ assert leftId < 8192; // there are still unused bits
+ // add pos mapping
+ int toFill = 1+leftId - posDict.size();
+ for (int i = 0; i < toFill; i++) {
+ posDict.add(null);
+ }
- buffer.putShort(leftId);
- buffer.putShort(rightId);
- buffer.putShort(wordCost);
- assert posIndex.intValue() < 128;
- buffer.put((byte) (posIndex.intValue() << 1 | hasInflData));
-
- int pronunciationIsReading = pronunciation.equals(reading) ? 1 : 0;
+ String existing = posDict.get(leftId);
+ assert existing == null || existing.equals(fullPOSData);
+ posDict.set(leftId, fullPOSData);
- if ("*".equals(baseForm) || baseForm.equals(entry[0])) {
- buffer.put((byte)pronunciationIsReading); // base form is the same as surface form
- } else {
- assert baseForm.length() < 128;
- buffer.put((byte)(baseForm.length() << 1 | pronunciationIsReading));
+ buffer.putShort((short)(leftId << 2 | flags));
+ buffer.putShort(wordCost);
+
+ if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
+ buffer.put((byte) baseForm.length());
for (int i = 0; i < baseForm.length(); i++) {
buffer.putChar(baseForm.charAt(i));
}
@@ -151,7 +139,7 @@ public abstract class BinaryDictionaryWr
}
}
- if (pronunciationIsReading == 0) {
+ if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
if (isKatakana(pronunciation)) {
buffer.put((byte) (pronunciation.length() << 1 | 1));
writeKatakana(pronunciation);
@@ -163,17 +151,6 @@ public abstract class BinaryDictionaryWr
}
}
- if (hasInflData > 0) {
- int key = inflIndex.intValue();
- assert key < 32768; // note there are really like 300 of these...
- if (key < 128) {
- buffer.put((byte) key);
- } else {
- buffer.put((byte) ((key & 0x7f) | 0x80));
- buffer.put((byte) (key >>> 7));
- }
- }
-
return buffer.position();
}
@@ -229,7 +206,6 @@ public abstract class BinaryDictionaryWr
writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
- writeInflDict(baseName + BinaryDictionary.INFLDICT_FILENAME_SUFFIX);
}
// TODO: maybe this int[] should instead be the output to the FST...
@@ -271,26 +247,17 @@ public abstract class BinaryDictionaryWr
CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(posDict.size());
for (String s : posDict) {
- out.writeString(s);
- }
- } finally {
- os.close();
- }
- }
-
- protected void writeInflDict(String filename) throws IOException {
- new File(filename).getParentFile().mkdirs();
- OutputStream os = new FileOutputStream(filename);
- try {
- os = new BufferedOutputStream(os);
- final DataOutput out = new OutputStreamDataOutput(os);
- CodecUtil.writeHeader(out, BinaryDictionary.INFLDICT_HEADER, BinaryDictionary.VERSION);
- out.writeVInt(inflDict.size());
- for (String s : inflDict) {
- String data[] = CSVUtil.parse(s);
- assert data.length == 2 : "malformed inflection: " + s;
- out.writeString(data[0]);
- out.writeString(data[1]);
+ if (s == null) {
+ out.writeByte((byte)0);
+ out.writeByte((byte)0);
+ out.writeByte((byte)0);
+ } else {
+ String data[] = CSVUtil.parse(s);
+ assert data.length == 3 : "malformed pos/inflection: " + s;
+ out.writeString(data[0]);
+ out.writeString(data[1]);
+ out.writeString(data[2]);
+ }
}
} finally {
os.close();
@@ -313,59 +280,4 @@ public abstract class BinaryDictionaryWr
os.close();
}
}
-
- // TODO: the below is messy, but makes the dictionary smaller.
- // we track frequencies of inflections so the highest-freq ones have smaller indexes.
-
- /** optional: notes inflection seen in the data up front */
- public void noteInflection(String entry[]) {
- StringBuilder sb = new StringBuilder();
- sb.append(CSVUtil.quoteEscape(entry[8]));
- sb.append(',');
- sb.append(CSVUtil.quoteEscape(entry[9]));
- String s = sb.toString();
- if ("*,*".equals(s)) {
- return; // no inflection data
- }
- Integer freq = notedInflections.get(s);
- if (freq == null) {
- freq = 0;
- }
- notedInflections.put(s, freq+1);
- }
-
- /** prepopulates inflection mapping by frequency */
- public void finalizeInflections() {
- InflectionAndFreq freqs[] = new InflectionAndFreq[notedInflections.size()];
- int upto = 0;
- for (Map.Entry<String,Integer> e : notedInflections.entrySet()) {
- freqs[upto++] = new InflectionAndFreq(e.getKey(), e.getValue());
- }
- Arrays.sort(freqs, Collections.reverseOrder());
- for (int i = 0; i < upto; i++) {
- inflDict.add(freqs[i].inflection);
- inflDictLookup.put(freqs[i].inflection, i);
- }
- }
-
- static class InflectionAndFreq implements Comparable<InflectionAndFreq> {
- String inflection;
- int freq;
-
- InflectionAndFreq(String s, int i) {
- this.inflection = s;
- this.freq = i;
- }
-
- public int compareTo(InflectionAndFreq other) {
- int cmp = freq - other.freq;
- if (cmp == 0) {
- return inflection.compareTo(other.inflection);
- } else {
- return cmp;
- }
- }
- }
-
- private HashMap<String,Integer> notedInflections = new HashMap<String,Integer>();
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Mon Jan 16 19:22:50 2012
@@ -101,7 +101,6 @@ public class TokenInfoDictionaryBuilder
}
String[] formatted = formatEntry(entry);
- dictionary.noteInflection(formatted);
lines.add(formatted);
// NFKC normalize dictionary entry
@@ -115,14 +114,11 @@ public class TokenInfoDictionaryBuilder
}
formatted = formatEntry(normalizedEntry);
- dictionary.noteInflection(formatted);
lines.add(formatted);
}
}
}
- dictionary.finalizeInflections();
-
System.out.println(" sort...");
// sort by term: we sorted the files already and use a stable sort.
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1232122&r1=1232121&r2=1232122&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Mon Jan 16 19:22:50 2012
@@ -33,7 +33,7 @@ import java.util.List;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
public class UnknownDictionaryBuilder {
- private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*,*,*";
+ private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
private String encoding = "euc-jp";
@@ -73,9 +73,7 @@ public class UnknownDictionaryBuilder {
// even though the unknown dictionary returns hardcoded null here.
final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
lines.add(parsed);
- dictionary.noteInflection(parsed); // for completeness; I think unk.def has no inflections...
}
- dictionary.finalizeInflections(); // should also be no-op
Collections.sort(lines, new Comparator<String[]>() {
public int compare(String[] left, String[] right) {