You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/09 00:47:59 UTC
svn commit: r1228994 - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/dict/
resources/org/apache/lucene/analysis/kuromoji/dict/
tools/java/org/apache/lucene/analysis/kuromoji/util/ tools/test/...
Author: rmuir
Date: Sun Jan 8 23:47:58 2012
New Revision: 1228994
URL: http://svn.apache.org/viewvc?rev=1228994&view=rev
Log:
LUCENE-3305: sort all input to make the targetmap always increasing list of deltas (~1MB smaller jar)
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Sun Jan 8 23:47:58 2012
@@ -60,21 +60,22 @@ public abstract class BinaryDictionary i
DataInput in = new InputStreamDataInput(mapIS);
CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
targetMap = new int[in.readVInt()][];
- for (int j = 0; j < targetMap.length;) {
+ int accum = 0;
+ for (int j = 0; j < targetMap.length; j++) {
final int len = in.readVInt();
- if (len == 0) {
- // decode RLE: number of nulls
- j += in.readVInt();
+ final int a[];
+ if ((len & 1) == 1) {
+ a = new int[1];
+ accum += len >>> 1;
+ a[0] = accum;
} else {
- final int[] a = new int[len];
- int accum = 0;
- for (int i = 0; i < len; i++) {
+ a = new int[len >>> 1];
+ for (int i = 0; i < a.length; i++) {
accum += in.readVInt();
a[i] = accum;
}
- targetMap[j] = a;
- j++;
}
+ targetMap[j] = a;
}
posIS = getClass().getResourceAsStream(getClass().getSimpleName() + POSDICT_FILENAME_SUFFIX);
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Sun Jan 8 23:47:58 2012
@@ -158,7 +158,16 @@ public abstract class BinaryDictionaryWr
}
}
+ // only for assert
+ int lastWordId = -1;
+ int lastSourceId = 0;
+
public void addMapping(int sourceId, int wordId) {
+ assert sourceId >= lastSourceId;
+ assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
+ lastSourceId = sourceId;
+ lastWordId = wordId;
+
if(targetMap.length <= sourceId) {
final int newSize = ArrayUtil.oversize(sourceId + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
int[][] newArray = new int[newSize][];
@@ -211,35 +220,27 @@ public abstract class BinaryDictionaryWr
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
out.writeVInt(targetMapSize);
- int nulls = 0;
+ int prev = 0;
for (int j = 0; j < targetMapSize; j++) {
final int size = targetMapComponentSizes[j];
- if (size == 0) {
- // run-length encoding for all nulls:
- if (nulls == 0) {
- out.writeVInt(0);
- }
- nulls++;
+ // note: size is 0 for ONLY wordID 0 of TokenInfoDictionary
+ // this is because the FST uses 0 for NO_OUTPUT...
+ if (size == 1) {
+ int delta = targetMap[j][0] - prev;
+ assert delta >= 0;
+ out.writeVInt(delta << 1 | 1);
+ prev += delta;
} else {
- if (nulls > 0) {
- out.writeVInt(nulls);
- nulls = 0;
- }
+ out.writeVInt(size << 1);
final int[] a = targetMap[j];
- Arrays.sort(a, 0, size);
- assert size > 0 && size <= a.length;
- out.writeVInt(size);
- int prev = 0;
for (int i = 0; i < size; i++) {
- out.writeVInt(a[i] - prev);
- prev = a[i];
+ int delta = a[i] - prev;
+ assert delta >= 0;
+ out.writeVInt(delta);
+ prev += delta;
}
}
}
- // write the pending RLE count:
- if (nulls > 0) {
- out.writeVInt(nulls);
- }
} finally {
os.close();
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Sun Jan 8 23:47:58 2012
@@ -27,8 +27,10 @@ import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.Comparator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
@@ -87,6 +89,8 @@ public class TokenInfoDictionaryBuilder
public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
+ // all lines in the file
+ List<String[]> lines = new ArrayList<String[]>();
for (File file : csvFiles){
FileInputStream inputStream = new FileInputStream(file);
Charset cs = Charset.forName(encoding);
@@ -103,15 +107,7 @@ public class TokenInfoDictionaryBuilder
System.out.println("Entry in CSV is not valid: " + line);
continue;
}
- int next = dictionary.put(formatEntry(entry));
-
- if(next == offset){
- System.out.println("Failed to process line: " + line);
- continue;
- }
-
- dictionaryEntries.put(offset, entry[0]);
- offset = next;
+ lines.add(formatEntry(entry));
// NFKC normalize dictionary entry
if (normalizeEntries) {
@@ -122,14 +118,32 @@ public class TokenInfoDictionaryBuilder
for (int i = 0; i < entry.length; i++) {
normalizedEntry[i] = normalizer.normalize(entry[i]);
}
-
- next = dictionary.put(formatEntry(normalizedEntry));
- dictionaryEntries.put(offset, normalizedEntry[0]);
- offset = next;
+
+ lines.add(formatEntry(normalizedEntry));
}
}
}
+ // sort by term
+ Collections.sort(lines, new Comparator<String[]>() {
+ public int compare(String[] left, String[] right) {
+ return left[0].compareTo(right[0]);
+ }
+ });
+
+ for (String[] entry : lines) {
+ int next = dictionary.put(entry);
+
+ if(next == offset){
+ System.out.println("Failed to process line: " + Arrays.toString(entry));
+ continue;
+ }
+
+ dictionaryEntries.put(offset, entry[0]);
+ offset = next;
+ }
+
+ // TODO: we can do this in parallel
System.out.print(" building FST...");
FST<Long> fst = buildFST();
dictionary.setFST(fst);
@@ -236,6 +250,7 @@ public class TokenInfoDictionaryBuilder
private FST<Long> buildFST() throws IOException {
FST<Long> words;
Collection<String> values = dictionaryEntries.values();
+ // TODO: we don't need to sort again, we could just check != last
TreeSet<String> unique = new TreeSet<String>(values);
PositiveIntOutputs o = PositiveIntOutputs.getSingleton(true);
Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE2, o);
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Sun Jan 8 23:47:58 2012
@@ -25,6 +25,12 @@ import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
public class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*,*,*";
@@ -60,11 +66,24 @@ public class UnknownDictionaryBuilder {
dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
+ List<String[]> lines = new ArrayList<String[]>();
String line = null;
while ((line = lineReader.readLine()) != null) {
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
// even though the unknown dictionary returns hardcoded null here.
- dictionary.put(CSVUtil.parse(line + ",*,*")); // Probably we don't need to validate entry
+ lines.add(CSVUtil.parse(line + ",*,*")); // Probably we don't need to validate entry
+ }
+
+ Collections.sort(lines, new Comparator<String[]>() {
+ public int compare(String[] left, String[] right) {
+ int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
+ int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
+ return leftId - rightId;
+ }
+ });
+
+ for (String[] entry : lines) {
+ dictionary.put(entry);
}
return dictionary;
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java?rev=1228994&r1=1228993&r2=1228994&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java Sun Jan 8 23:47:58 2012
@@ -59,14 +59,14 @@ public class UnknownDictionaryTest exten
} catch(Exception e){
}
-
- String entry1 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";
- String entry2 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
- String entry3 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
-
- unkDic.putCharacterCategory(0, "KANJI");
- unkDic.putCharacterCategory(1, "ALPHA");
- unkDic.putCharacterCategory(2, "HIRAGANA");
+
+ String entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
+ String entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
+ String entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";
+
+ unkDic.putCharacterCategory(0, "ALPHA");
+ unkDic.putCharacterCategory(1, "HIRAGANA");
+ unkDic.putCharacterCategory(2, "KANJI");
unkDic.put(CSVUtil.parse(entry1));
unkDic.put(CSVUtil.parse(entry2));