You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/06 19:37:10 UTC
svn commit: r1228334 - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/
java/org/apache/lucene/analysis/kuromoji/dict/
java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucene/anal...
Author: rmuir
Date: Fri Jan 6 18:37:10 2012
New Revision: 1228334
URL: http://svn.apache.org/viewvc?rev=1228334&view=rev
Log:
LUCENE-3305: speed up tokenization by not creating strings
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java Fri Jan 6 18:37:10 2012
@@ -49,7 +49,7 @@ public class DebugTokenizer {
}
public String debugTokenize(String text) {
- ViterbiNode[][][] lattice = this.viterbi.build(text);
+ ViterbiNode[][][] lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
List<ViterbiNode> bestPath = this.viterbi.search(lattice);
return this.formatter.format(lattice[0], lattice[1], bestPath);
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Fri Jan 6 18:37:10 2012
@@ -48,8 +48,8 @@ public final class KuromojiTokenizer ext
@Override
protected void setNextSentence(int sentenceStart, int sentenceEnd) {
this.sentenceStart = sentenceStart;
- // TODO: allow the tokenizer, at least maybe doTokenize to take char[] or charsequence or characteriterator?
- tokens = tokenizer.tokenize(new String(buffer, sentenceStart, sentenceEnd-sentenceStart));
+ // TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
+ tokens = tokenizer.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart);
tokenIndex = 0;
}
@@ -59,10 +59,8 @@ public final class KuromojiTokenizer ext
return false;
}
Token token = tokens.get(tokenIndex);
- // TODO: we don't really need the surface form except for its length? (its in the buffer already)
- String surfaceForm = token.getSurfaceForm();
int position = token.getPosition();
- int length = surfaceForm.length();
+ int length = token.getLength();
clearAttributes();
termAtt.copyBuffer(buffer, sentenceStart + position, length);
int startOffset = offset + sentenceStart + position;
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Fri Jan 6 18:37:10 2012
@@ -20,22 +20,24 @@ package org.apache.lucene.analysis.kurom
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
-// TODO: somehow this thing needs to keep state, so that once it decodes metadata
-// it never does it again.
public class Token {
private final Dictionary dictionary;
private final int wordId;
- private final String surfaceForm;
+ private final char[] surfaceForm;
+ private final int offset;
+ private final int length;
private final int position;
private final Type type;
- public Token(int wordId, String surfaceForm, Type type, int position, Dictionary dictionary) {
+ public Token(int wordId, char[] surfaceForm, int offset, int length, Type type, int position, Dictionary dictionary) {
this.wordId = wordId;
this.surfaceForm = surfaceForm;
+ this.offset = offset;
+ this.length = length;
this.type = type;
this.position = position;
this.dictionary = dictionary;
@@ -44,11 +46,32 @@ public class Token {
/**
* @return surfaceForm
*/
- public String getSurfaceForm() {
+ public char[] getSurfaceForm() {
return surfaceForm;
}
/**
+ * @return offset into surfaceForm
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * @return length of surfaceForm
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * @return surfaceForm as a String
+ */
+ public String getSurfaceFormString() {
+ return new String(surfaceForm, offset, length);
+ }
+
+ /**
* @return reading. null if token doesn't have reading.
*/
public String getReading() {
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java Fri Jan 6 18:37:10 2012
@@ -129,23 +129,28 @@ public class Tokenizer {
return splitPositions;
}
+ private List<Token> doTokenize(int offset, String sentence) {
+ char text[] = sentence.toCharArray();
+ return doTokenize(offset, text, 0, text.length);
+ }
+
/**
* Tokenize input sentence.
* @param offset offset of sentence in original input text
* @param sentence sentence to tokenize
* @return list of Token
*/
- private List<Token> doTokenize(int offset, String sentence) {
+ public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength) {
ArrayList<Token> result = new ArrayList<Token>();
- ViterbiNode[][][] lattice = viterbi.build(sentence);
+ ViterbiNode[][][] lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
List<ViterbiNode> bestPath = viterbi.search(lattice);
for (ViterbiNode node : bestPath) {
int wordId = node.getWordId();
if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS
continue;
}
- Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
+ Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
result.add(token);
}
@@ -167,7 +172,9 @@ public class Tokenizer {
private Mode mode = Mode.NORMAL;
- private boolean split = false;
+ // this is true, for other use.
+ // lucene's tokenizer uses a breakiterator and doTokenize directly.
+ private boolean split = true;
private UserDictionary userDictionary = null;
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Fri Jan 6 18:37:10 2012
@@ -27,16 +27,16 @@ public final class UnknownDictionary ext
super();
}
- public int lookup(String text) {
- if(!characterDefinition.isGroup(text.charAt(0))) {
+ public int lookup(char[] text, int offset, int len) {
+ if(!characterDefinition.isGroup(text[offset])) {
return 1;
}
// Extract unknown word. Characters with the same character class are considered to be part of unknown word
- byte characterIdOfFirstCharacter = characterDefinition.getCharacterClass(text.charAt(0));
+ byte characterIdOfFirstCharacter = characterDefinition.getCharacterClass(text[offset]);
int length = 1;
- for (int i = 1; i < text.length(); i++) {
- if (characterIdOfFirstCharacter == characterDefinition.getCharacterClass(text.charAt(i))){
+ for (int i = 1; i < len; i++) {
+ if (characterIdOfFirstCharacter == characterDefinition.getCharacterClass(text[offset+i])){
length++;
} else {
break;
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Fri Jan 6 18:37:10 2012
@@ -49,10 +49,14 @@ public class UserDictionary implements D
/**
* Lookup words in text
- * @param text
+ * @param chars text
+ * @param off offset into text
+ * @param len length of text
* @return array of {wordId, position, length}
*/
- public int[][] lookup(String text) {
+ public int[][] lookup(char[] chars, int off, int len) {
+ // TODO: this method should be more efficient.
+ String text = new String(chars, off, len);
TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
for (String keyword : entries.descendingKeySet()) {
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java Fri Jan 6 18:37:10 2012
@@ -180,14 +180,14 @@ public final class DoubleArrayTrie {
* @param key key to match
* @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
*/
- public int lookup(String key) {
+ public int lookup(char key[], int offset, int length) {
int index = 0;
int base = 1; // base at index 0 should be 1
- int keyLength = key.length();
- for(int i = 0; i < keyLength; i++) {
+ int end = offset + length;
+ for(int i = offset; i < end; i++) {
int previous = index;
- index = index + base + key.charAt(i);
+ index = index + base + key[i];
if(index > baseBuffer.limit()) { // Too long
return -1;
@@ -204,7 +204,8 @@ public final class DoubleArrayTrie {
}
if(base >= TAIL_OFFSET) { // If base is bigger than TAIL_OFFSET, start processing "tail"
- return matchTail(base, index, key.substring(i + 1));
+ int newOffset = i + 1;
+ return matchTail(base, index, key, newOffset, end - newOffset);
}
}
@@ -222,16 +223,15 @@ public final class DoubleArrayTrie {
* @param key
* @return index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
*/
- private int matchTail(int base, int index, String key) {
+ private int matchTail(int base, int index, char key[], int offset, int length) {
int positionInTailArr = base - TAIL_OFFSET;
- int keyLength = key.length();
- for(int i = 0; i < keyLength; i++) {
- if(key.charAt(i) != tailBuffer.get(positionInTailArr + i)){
+ for(int i = 0; i < length; i++) {
+ if(key[offset + i] != tailBuffer.get(positionInTailArr + i)){
return -1;
}
}
- return tailBuffer.get(positionInTailArr + keyLength) == TERMINATING_CHARACTER ? index : 0;
+ return tailBuffer.get(positionInTailArr + length) == TERMINATING_CHARACTER ? index : 0;
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java Fri Jan 6 18:37:10 2012
@@ -216,7 +216,7 @@ public class GraphvizFormatter {
return BOS_LABEL;
}
} else {
- return node.getSurfaceForm();
+ return node.getSurfaceFormString();
}
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Fri Jan 6 18:37:10 2012
@@ -57,9 +57,9 @@ public class Viterbi {
private static final int SEARCH_MODE_PENALTY = 10000;
- private static final String BOS = "BOS";
+ private static final char[] BOS = "BOS".toCharArray();
- private static final String EOS = "EOS";
+ private static final char[] EOS = "EOS".toCharArray();
/**
* Constructor
@@ -131,13 +131,14 @@ public class Viterbi {
// "Search mode". Add extra costs if it is long node.
if (searchMode) {
// System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
- String surfaceForm = node.getSurfaceForm();
- int length = surfaceForm.length();
+ char[] surfaceForm = node.getSurfaceForm();
+ int offset = node.getOffset();
+ int length = node.getLength();
if (length > SEARCH_MODE_LENGTH_KANJI) {
boolean allKanji = true;
// check if node consists of only kanji
for (int pos = 0; pos < length; pos++) {
- if (!characterDefinition.isKanji(surfaceForm.charAt(pos))){
+ if (!characterDefinition.isKanji(surfaceForm[offset+pos])){
allKanji = false;
break;
}
@@ -176,9 +177,11 @@ public class Viterbi {
int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
- String surfaceForm = leftNode.getSurfaceForm();
- for (int i = surfaceForm.length(); i > 0; i--) {
- ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm.substring(i - 1, i), unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
+ char[] surfaceForm = leftNode.getSurfaceForm();
+ int offset = leftNode.getOffset();
+ int length = leftNode.getLength();
+ for (int i = length; i > 0; i--) {
+ ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i - 1, 1, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
result.addFirst(uniGramNode);
}
} else {
@@ -195,41 +198,40 @@ public class Viterbi {
* Build lattice from input text
* @param text
*/
- public ViterbiNode[][][] build(String text) {
- int textLength = text.length();
- ViterbiNode[][] startIndexArr = new ViterbiNode[textLength + 2][]; // text length + BOS and EOS
- ViterbiNode[][] endIndexArr = new ViterbiNode[textLength + 2][]; // text length + BOS and EOS
- int[] startSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in startIndexArr
- int[] endSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in endIndexArr
+ public ViterbiNode[][][] build(char text[], int offset, int length) {
+ ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
+ ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
+ int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
+ int[] endSizeArr = new int[length + 2]; // array to keep ViterbiNode count in endIndexArr
- ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, 0, 0, -1, Type.KNOWN);
+ ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
// Process user dictionary;
if (useUserDictionary) {
- processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
int unknownWordEndIndex = -1; // index of the last character of unknown word
- for (int startIndex = 0; startIndex < textLength; startIndex++) {
+ for (int startIndex = 0; startIndex < length; startIndex++) {
// If no token ends where current token starts, skip this index
if (endSizeArr[startIndex + 1] == 0) {
continue;
}
- String suffix = text.substring(startIndex);
+ int suffixStart = offset + startIndex;
+ int suffixLength = length - startIndex;
boolean found = false;
- for (int endIndex = 1; endIndex < suffix.length() + 1; endIndex++) {
- String prefix = suffix.substring(0, endIndex);
+ for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
- int result = trie.lookup(prefix);
+ int result = trie.lookup(text, suffixStart, endIndex);
if (result > 0) { // Found match in double array trie
found = true; // Don't produce unknown word starting from this index
for (int wordId : dictionary.lookupWordIds(result)) {
- ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
+ ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
} else if(result < 0) { // If result is less than zero, continue to next position
@@ -242,31 +244,30 @@ public class Viterbi {
continue;
}
- // Process Unknown Word
+ // Process Unknown Word: hmm what is this isInvoke logic (same no matter what)
int unknownWordLength = 0;
- char firstCharacter = suffix.charAt(0);
+ char firstCharacter = text[suffixStart];
boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
if (isInvoke){ // Process "invoke"
- unknownWordLength = unkDictionary.lookup(suffix);
+ unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
} else if (found == false){ // Process not "invoke"
- unknownWordLength = unkDictionary.lookup(suffix);
+ unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
}
if (unknownWordLength > 0) { // found unknown word
- String unkWord = suffix.substring(0, unknownWordLength);
int characterId = characterDefinition.getCharacterClass(firstCharacter);
int[] wordIds = unkDictionary.lookupWordIds(characterId); // characters in input text are supposed to be the same
for (int wordId : wordIds) {
- ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
+ ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
unknownWordEndIndex = startIndex + unknownWordLength;
}
}
- ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
- addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
+ ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
+ addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
@@ -281,13 +282,13 @@ public class Viterbi {
* @param startSizeArr
* @param endSizeArr
*/
- private void processUserDictionary(String text, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
- int[][] result = userDictionary.lookup(text);
+ private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
+ int[][] result = userDictionary.lookup(text, offset, len);
for(int[] segmentation : result) {
int wordId = segmentation[0];
int index = segmentation[1];
int length = segmentation[2];
- ViterbiNode node = new ViterbiNode(wordId, text.substring(index, index + length), userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
+ ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java Fri Jan 6 18:37:10 2012
@@ -26,7 +26,9 @@ public final class ViterbiNode {
private final int wordId;
- private final String surfaceForm;
+ private final char[] surfaceForm;
+ private final int offset;
+ private final int length;
private final int leftId;
@@ -44,9 +46,11 @@ public final class ViterbiNode {
private final int startIndex;
- public ViterbiNode(int wordId, String surfaceForm, int leftId, int rightId, int wordCost, int startIndex, Type type) {
+ public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) {
this.wordId = wordId;
this.surfaceForm = surfaceForm;
+ this.offset = offset;
+ this.length = length;
this.leftId = leftId;
this.rightId = rightId;
this.wordCost = wordCost;
@@ -65,11 +69,32 @@ public final class ViterbiNode {
/**
* @return the surfaceForm
*/
- public String getSurfaceForm() {
+ public char[] getSurfaceForm() {
return surfaceForm;
}
/**
+ * @return start offset into surfaceForm
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * @return length of surfaceForm
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * @return the surfaceForm as a String
+ */
+ public String getSurfaceFormString() {
+ return new String(surfaceForm, offset, length);
+ }
+
+ /**
* @return the leftId
*/
public int getLeftId() {
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java Fri Jan 6 18:37:10 2012
@@ -59,7 +59,7 @@ public class TokenizerTest extends Lucen
List<Token> tokens = tokenizer.tokenize(input);
assertTrue(tokens.size() == surfaceForms.length);
for (int i = 0; i < tokens.size(); i++) {
- assertEquals(surfaceForms[i], tokens.get(i).getSurfaceForm());
+ assertEquals(surfaceForms[i], tokens.get(i).getSurfaceFormString());
}
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Fri Jan 6 18:37:10 2012
@@ -30,7 +30,8 @@ public class UserDictionaryTest extends
public void testLookup() throws IOException {
UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
- int[][] dictionaryEntryResult = dictionary.lookup("関西国際空港に行った");
+ String s = "関西国際空港に行った";
+ int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
// Length should be three 関西, 国際, 空港
assertEquals(3, dictionaryEntryResult.length);
@@ -44,7 +45,8 @@ public class UserDictionaryTest extends
assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
- int[][] dictionaryEntryResult2 = dictionary.lookup("関西国際空港と関西国際空港に行った");
+ s = "関西国際空港と関西国際空港に行った";
+ int[][] dictionaryEntryResult2 = dictionary.lookup(s.toCharArray(), 0, s.length());
// Length should be six
assertEquals(6, dictionaryEntryResult2.length);
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java Fri Jan 6 18:37:10 2012
@@ -32,10 +32,10 @@ public class DoubleArrayTrieTest extends
public void test() {
Trie trie = getTrie();
DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie(trie);
- assertEquals(0, doubleArrayTrie.lookup("a"));
- assertTrue(doubleArrayTrie.lookup("abc") > 0);
- assertTrue(doubleArrayTrie.lookup("あいう") > 0);
- assertTrue(doubleArrayTrie.lookup("xyz") < 0);
+ assertEquals(0, doubleArrayTrie.lookup("a".toCharArray(), 0, 1));
+ assertTrue(doubleArrayTrie.lookup("abc".toCharArray(), 0, 3) > 0);
+ assertTrue(doubleArrayTrie.lookup("あいう".toCharArray(), 0, 3) > 0);
+ assertTrue(doubleArrayTrie.lookup("xyz".toCharArray(), 0, 3) < 0);
}
private Trie getTrie() {
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java Fri Jan 6 18:37:10 2012
@@ -48,7 +48,7 @@ public class DictionaryBuilder {
for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
int tokenInfoId = entry.getKey();
String surfaceform = entry.getValue();
- int doubleArrayId = trie.lookup(surfaceform);
+ int doubleArrayId = trie.lookup(surfaceform.toCharArray(), 0, surfaceform.length());
assert doubleArrayId > 0;
tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
}