You are viewing a plain-text version of this content; the canonical (HTML) version was linked here in the original message, but the link was lost in extraction.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/09 04:33:04 UTC
svn commit: r1229020 -
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
Author: rmuir
Date: Mon Jan 9 03:33:04 2012
New Revision: 1229020
URL: http://svn.apache.org/viewvc?rev=1229020&view=rev
Log:
LUCENE-3305: clean up construction code a bit
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1229020&r1=1229019&r2=1229020&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Mon Jan 9 03:33:04 2012
@@ -28,16 +28,10 @@ import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
import org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
@@ -46,7 +40,6 @@ import org.apache.lucene.util.fst.Positi
import com.ibm.icu.text.Normalizer2;
-
/**
*/
public class TokenInfoDictionaryBuilder {
@@ -54,8 +47,6 @@ public class TokenInfoDictionaryBuilder
/** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
- private TreeMap<Integer, String> dictionaryEntries; // wordId, surface form
-
private String encoding = "euc-jp";
private boolean normalizeEntries = false;
@@ -66,7 +57,6 @@ public class TokenInfoDictionaryBuilder
public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
this.format = format;
this.encoding = encoding;
- this.dictionaryEntries = new TreeMap<Integer, String>();
this.normalizeEntries = normalizeEntries;
this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
}
@@ -90,7 +80,8 @@ public class TokenInfoDictionaryBuilder
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
// all lines in the file
- List<String[]> lines = new ArrayList<String[]>();
+ System.out.println(" parse...");
+ List<String[]> lines = new ArrayList<String[]>(400000);
for (File file : csvFiles){
FileInputStream inputStream = new FileInputStream(file);
Charset cs = Charset.forName(encoding);
@@ -124,6 +115,8 @@ public class TokenInfoDictionaryBuilder
}
}
+ System.out.println(" sort...");
+
// sort by term
Collections.sort(lines, new Comparator<String[]>() {
public int compare(String[] left, String[] right) {
@@ -131,6 +124,15 @@ public class TokenInfoDictionaryBuilder
}
});
+ System.out.println(" encode...");
+
+ PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
+ Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput);
+ IntsRef scratch = new IntsRef();
+ long ord = -1; // first ord will be 0
+ String lastValue = null;
+
+ // build tokeninfo dictionary
for (String[] entry : lines) {
int next = dictionary.put(entry);
@@ -138,55 +140,30 @@ public class TokenInfoDictionaryBuilder
System.out.println("Failed to process line: " + Arrays.toString(entry));
continue;
}
-
- dictionaryEntries.put(offset, entry[0]);
+
+ String token = entry[0];
+ if (!token.equals(lastValue)) {
+ // new word to add to fst
+ ord++;
+ lastValue = token;
+ scratch.grow(token.length());
+ scratch.length = token.length();
+ for (int i = 0; i < token.length(); i++) {
+ scratch.ints[i] = (int) token.charAt(i);
+ }
+ fstBuilder.add(scratch, fstOutput.get(ord));
+ }
+ dictionary.addMapping((int)ord, offset);
offset = next;
}
- // TODO: we can do this in parallel
- System.out.print(" building FST...");
- FST<Long> fst = buildFST();
+ FST<Long> fst = fstBuilder.finish();
+ System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
dictionary.setFST(fst);
System.out.println(" done");
- System.out.print(" processing target map...");
- TokenInfoFST lookup = new TokenInfoFST(fst, false);
- assert fst != null;
- for (Entry<Integer, String> entry : entrySet()) {
- int tokenInfoId = entry.getKey();
- String surfaceform = entry.getValue();
- int fstId = lookupOrd(lookup, surfaceform);
- dictionary.addMapping(fstId, tokenInfoId);
- }
-
- System.out.println(" done");
-
return dictionary;
}
-
- public int lookupOrd(TokenInfoFST fst, String word) throws IOException {
- final FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
- // Accumulate output as we go
- final Long NO_OUTPUT = fst.NO_OUTPUT;
- Long output = NO_OUTPUT;
- for (int i = 0; i < word.length(); i++) {
- int ch = word.charAt(i);
- if (fst.findTargetArc(ch, arc, arc, i == 0) == null) {
- assert false;
- return -1;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.addOutput(output, arc.output);
- }
- }
- if (fst.findTargetArc(FST.END_LABEL, arc, arc, false) == null) {
- assert false;
- return -1;
- } else if (arc.output != NO_OUTPUT) {
- return fst.addOutput(output, arc.output).intValue();
- } else {
- return output.intValue();
- }
- }
/*
* IPADIC features
@@ -242,31 +219,4 @@ public class TokenInfoDictionaryBuilder
return features2;
}
}
-
- private Set<Entry<Integer, String>> entrySet() {
- return dictionaryEntries.entrySet();
- }
-
- private FST<Long> buildFST() throws IOException {
- FST<Long> words;
- Collection<String> values = dictionaryEntries.values();
- // TODO: we don't need to sort again, we could just check != last
- TreeSet<String> unique = new TreeSet<String>(values);
- PositiveIntOutputs o = PositiveIntOutputs.getSingleton(true);
- Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE2, o);
- IntsRef scratch = new IntsRef();
- long ord = 0;
- for (String entry : unique) {
- scratch.grow(entry.length());
- scratch.length = entry.length();
- for (int i = 0; i < entry.length(); i++) {
- scratch.ints[i] = (int) entry.charAt(i);
- }
- b.add(scratch, o.get(ord));
- ord++;
- }
- words = b.finish();
- System.out.print(" " + words.getNodeCount() + " nodes, " + words.getArcCount() + " arcs, " + words.sizeInBytes() + " bytes... ");
- return words;
- }
}