You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/09 09:10:38 UTC
svn commit: r1229052 - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/
java/org/apache/lucene/analysis/kuromoji/dict/
resources/org/apache/lucene/analysis/kuromoji/dict/ test/org/apache/lucene...
Author: rmuir
Date: Mon Jan 9 08:10:37 2012
New Revision: 1229052
URL: http://svn.apache.org/viewvc?rev=1229052&view=rev
Log:
LUCENE-3305: add inflection data
Added:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat (with props)
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Mon Jan 9 08:10:37 2012
@@ -93,6 +93,20 @@ public class Token {
}
/**
+ * @return inflection type or null
+ */
+ public String getInflectionType() {
+ return dictionary.getInflectionType(wordId);
+ }
+
+ /**
+ * @return inflection form or null
+ */
+ public String getInflectionForm() {
+ return dictionary.getInflectionForm(wordId);
+ }
+
+ /**
* @return base form or null if token is not inflected
*/
public String getBaseForm() {
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Mon Jan 9 08:10:37 2012
@@ -37,21 +37,27 @@ public abstract class BinaryDictionary i
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
+ public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
public static final String DICT_HEADER = "kuromoji_dict";
public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
public static final String POSDICT_HEADER = "kuromoji_dict_pos";
+ public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
public static final int VERSION = 1;
private final ByteBuffer buffer;
private final int[] targetMapOffsets, targetMap;
private final String[] posDict;
+ private final String[] inflTypeDict;
+ private final String[] inflFormDict;
protected BinaryDictionary() throws IOException {
- InputStream mapIS = null, dictIS = null, posIS = null;
+ InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
IOException priorE = null;
int[] targetMapOffsets = null, targetMap = null;
String[] posDict = null;
+ String[] inflFormDict = null;
+ String[] inflTypeDict = null;
ByteBuffer buffer = null;
try {
mapIS = getClass().getResourceAsStream(getClass().getSimpleName() + TARGETMAP_FILENAME_SUFFIX);
@@ -86,6 +92,20 @@ public abstract class BinaryDictionary i
for (int j = 0; j < posDict.length; j++) {
posDict[j] = in.readString();
}
+
+ inflIS = getClass().getResourceAsStream(getClass().getSimpleName() + INFLDICT_FILENAME_SUFFIX);
+ if (inflIS == null)
+ throw new FileNotFoundException("Not in classpath: " + getClass().getName().replace('.','/') + INFLDICT_FILENAME_SUFFIX);
+ inflIS = new BufferedInputStream(inflIS);
+ in = new InputStreamDataInput(inflIS);
+ CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
+ int length = in.readVInt();
+ inflTypeDict = new String[length];
+ inflFormDict = new String[length];
+ for (int j = 0; j < length; j++) {
+ inflTypeDict[j] = in.readString();
+ inflFormDict[j] = in.readString();
+ }
dictIS = getClass().getResourceAsStream(getClass().getSimpleName() + DICT_FILENAME_SUFFIX);
if (dictIS == null)
@@ -103,12 +123,14 @@ public abstract class BinaryDictionary i
} catch (IOException ioe) {
priorE = ioe;
} finally {
- IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, dictIS);
+ IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
}
this.targetMap = targetMap;
this.targetMapOffsets = targetMapOffsets;
this.posDict = posDict;
+ this.inflTypeDict = inflTypeDict;
+ this.inflFormDict = inflFormDict;
this.buffer = buffer;
}
@@ -134,64 +156,127 @@ public abstract class BinaryDictionary i
return buffer.getShort(wordId + 4); // Skip left id and right id
}
- private String readString(int offset, int length, boolean kana) {
- char text[] = new char[length];
- if (kana) {
- for (int i = 0; i < length; i++) {
- text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
- }
+ @Override
+ public String getBaseForm(int wordId) {
+ int offset = baseFormOffset(wordId);
+ int length = (buffer.get(offset++) & 0xff) >>> 1;
+ if (length == 0) {
+ return null; // same as surface form
} else {
- for (int i = 0; i < length; i++) {
- text[i] = buffer.getChar(offset + (i << 1));
- }
+ return readString(offset, length, false);
}
- return new String(text);
}
@Override
public String getReading(int wordId) {
- int offset = wordId + 7;
- int baseFormLength = buffer.get(offset++) & 0xff;
- offset += baseFormLength << 1;
+ int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
return readString(offset, readingData >>> 1, (readingData & 1) == 1);
}
@Override
+ public String getPartOfSpeech(int wordId) {
+ int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
+ return posDict[posIndex >>> 1];
+ }
+
+ @Override
public String getPronunciation(int wordId) {
- int offset = wordId + 7;
- int baseFormLength = buffer.get(offset++) & 0xff;
- offset += baseFormLength << 1;
- int readingData = buffer.get(offset++) & 0xff;
- int readingLength = readingData >>> 1;
- int readingOffset = offset;
- if ((readingData & 1) == 0) {
- offset += readingLength << 1;
+ if (hasPronunciationData(wordId)) {
+ int offset = pronunciationOffset(wordId);
+ int pronunciationData = buffer.get(offset++) & 0xff;
+ return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
} else {
- offset += readingLength;
+ return getReading(wordId); // same as the reading
}
- int pronunciationData = buffer.get(offset++) & 0xff;
- if (pronunciationData == 0) {
- return readString(readingOffset, readingLength, (readingData & 1) == 1);
+ }
+
+ @Override
+ public String getInflectionType(int wordId) {
+ int index = getInflectionIndex(wordId);
+ return index < 0 ? null : inflTypeDict[index];
+ }
+
+ @Override
+ public String getInflectionForm(int wordId) {
+ int index = getInflectionIndex(wordId);
+ return index < 0 ? null : inflFormDict[index];
+ }
+
+ private static int posOffset(int wordId) {
+ return wordId + 6;
+ }
+
+ private static int baseFormOffset(int wordId) {
+ return wordId + 7;
+ }
+
+ private int readingOffset(int wordId) {
+ int offset = baseFormOffset(wordId);
+ int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
+ return offset + baseFormLength;
+ }
+
+ private int pronunciationOffset(int wordId) {
+ int offset = readingOffset(wordId);
+ int readingData = buffer.get(offset++) & 0xff;
+ final int readingLength;
+ if ((readingData & 1) == 0) {
+ readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
} else {
- return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
+ readingLength = readingData >>> 1;
}
+ return offset + readingLength;
}
- @Override
- public String getPartOfSpeech(int wordId) {
- int posIndex = buffer.get(wordId + 6) & 0xff; // read index into posDict
- return posDict[posIndex];
+ private boolean hasPronunciationData(int wordId) {
+ int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
+ return (baseFormData & 1) == 0;
}
- @Override
- public String getBaseForm(int wordId) {
- int offset = wordId + 7;
- int length = buffer.get(offset++) & 0xff;
- if (length == 0) {
- return null; // same as surface form
+ private boolean hasInflectionData(int wordId) {
+ int posData = buffer.get(posOffset(wordId)) & 0xff;
+ return (posData & 1) == 1;
+ }
+
+ private int getInflectionIndex(int wordId) {
+ if (!hasInflectionData(wordId)) {
+ return -1; // common case: no inflection data
+ }
+
+ // skip past reading/pronunciation at the end
+ int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
+ int endData = buffer.get(offset++) & 0xff;
+
+ final int endLength;
+ if ((endData & 1) == 0) {
+ endLength = endData & 0xfe; // UTF-16: mask off kana bit
} else {
- return readString(offset, length, false);
+ endLength = endData >>> 1;
}
+
+ offset += endLength;
+
+ byte b = buffer.get(offset++);
+ int i = b & 0x7F;
+ if ((b & 0x80) == 0) return i;
+ b = buffer.get(offset++);
+ i |= (b & 0x7F) << 7;
+ assert ((b & 0x80) == 0);
+ return i;
+ }
+
+ private String readString(int offset, int length, boolean kana) {
+ char text[] = new char[length];
+ if (kana) {
+ for (int i = 0; i < length; i++) {
+ text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
+ }
+ } else {
+ for (int i = 0; i < length; i++) {
+ text[i] = buffer.getChar(offset + (i << 1));
+ }
+ }
+ return new String(text);
}
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Mon Jan 9 08:10:37 2012
@@ -47,7 +47,6 @@ public interface Dictionary {
* @param wordId word ID of token
* @return Part-Of-Speech of the token
*/
- // TODO: split into the type-safe components
public String getPartOfSpeech(int wordId);
/**
@@ -71,6 +70,19 @@ public interface Dictionary {
*/
public String getPronunciation(int wordId);
+ /**
+ * Get inflection type of tokens
+ * @param wordId word ID of token
+ * @return inflection type, or null
+ */
+ public String getInflectionType(int wordId);
+
+ /**
+ * Get inflection form of tokens
+ * @param wordId word ID of token
+ * @return inflection form, or null
+ */
+ public String getInflectionForm(int wordId);
// TODO: maybe we should have a optimal method, a non-typesafe
// 'getAdditionalData' if other dictionaries like unidic have additional data
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Mon Jan 9 08:10:37 2012
@@ -55,6 +55,16 @@ public final class UnknownDictionary ext
return null;
}
+ @Override
+ public String getInflectionType(int wordId) {
+ return null;
+ }
+
+ @Override
+ public String getInflectionForm(int wordId) {
+ return null;
+ }
+
public static UnknownDictionary getInstance() {
return SingletonHolder.INSTANCE;
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Mon Jan 9 08:10:37 2012
@@ -178,6 +178,16 @@ public final class UserDictionary implem
return null; // TODO: add support?
}
+ @Override
+ public String getInflectionType(int wordId) {
+ return null; // TODO: add support?
+ }
+
+ @Override
+ public String getInflectionForm(int wordId) {
+ return null; // TODO: add support?
+ }
+
private String[] getAllFeaturesArray(int wordId) {
String allFeatures = featureEntries.get(wordId);
if(allFeatures == null) {
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
Binary files - no diff available.
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24inflDict.dat?rev=1229052&view=auto
==============================================================================
Binary file - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
Binary files - no diff available.
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24inflDict.dat?rev=1229052&view=auto
==============================================================================
Binary file - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java Mon Jan 9 08:10:37 2012
@@ -132,6 +132,36 @@ public class SegmenterTest extends Lucen
}
@Test
+ public void testInflectionTypes() {
+ List<Token> tokens = segmenter.tokenize("これはまだ実験段階にあります。");
+ assertEquals(9, tokens.size());
+ assertNull(tokens.get(0).getInflectionType());
+ assertNull(tokens.get(1).getInflectionType());
+ assertNull(tokens.get(2).getInflectionType());
+ assertNull(tokens.get(3).getInflectionType());
+ assertNull(tokens.get(4).getInflectionType());
+ assertNull(tokens.get(5).getInflectionType());
+ assertEquals(tokens.get(6).getInflectionType(), "五段・ラ行");
+ assertEquals(tokens.get(7).getInflectionType(), "特殊・マス");
+ assertNull(tokens.get(8).getInflectionType());
+ }
+
+ @Test
+ public void testInflectionForms() {
+ List<Token> tokens = segmenter.tokenize("これはまだ実験段階にあります。");
+ assertEquals(9, tokens.size());
+ assertNull(tokens.get(0).getInflectionForm());
+ assertNull(tokens.get(1).getInflectionForm());
+ assertNull(tokens.get(2).getInflectionForm());
+ assertNull(tokens.get(3).getInflectionForm());
+ assertNull(tokens.get(4).getInflectionForm());
+ assertNull(tokens.get(5).getInflectionForm());
+ assertEquals(tokens.get(6).getInflectionForm(), "連用形");
+ assertEquals(tokens.get(7).getInflectionForm(), "基本形");
+ assertNull(tokens.get(8).getInflectionForm());
+ }
+
+ @Test
public void testPartOfSpeech() {
List<Token> tokens = segmenter.tokenize("これはまだ実験段階にあります。");
assertEquals(9, tokens.size());
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Mon Jan 9 08:10:37 2012
@@ -27,6 +27,7 @@ import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
@@ -35,11 +36,8 @@ import org.apache.lucene.store.DataOutpu
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.dict.BinaryDictionary;
-import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
public abstract class BinaryDictionaryWriter {
protected final Class<? extends BinaryDictionary> implClazz;
@@ -49,6 +47,9 @@ public abstract class BinaryDictionaryWr
private int[] targetMapOffsets = new int[8192];
private final List<String> posDict = new ArrayList<String>();
private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>();
+
+ private final List<String> inflDict = new ArrayList<String>();
+ private final Map<String,Integer> inflDictLookup = new HashMap<String,Integer>();
public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
this.implClazz = implClazz;
@@ -82,8 +83,26 @@ public abstract class BinaryDictionaryWr
assert posDict.size() == posDictLookup.size();
}
- // TODO: what are the parts 9 and 10 that kuromoji does not expose via Token?
- // we need to break all these out (we can structure them inside posdict)
+ sb.setLength(0);
+ sb.append(CSVUtil.quoteEscape(entry[8]));
+ sb.append(',');
+ sb.append(CSVUtil.quoteEscape(entry[9]));
+ String inflData = sb.toString();
+
+ Integer inflIndex = Integer.MAX_VALUE;
+ int hasInflData;
+ if ("*,*".equals(inflData)) {
+ hasInflData = 0; // no inflection data
+ } else {
+ hasInflData = 1;
+ inflIndex = inflDictLookup.get(inflData);
+ if (inflIndex == null) {
+ inflIndex = inflDict.size();
+ inflDict.add(inflData);
+ inflDictLookup.put(inflData, inflIndex);
+ assert inflDict.size() == inflDictLookup.size();
+ }
+ }
String baseForm = entry[10];
String reading = entry[11];
@@ -91,8 +110,8 @@ public abstract class BinaryDictionaryWr
// extend buffer if necessary
int left = buffer.remaining();
- // worst case: three short, 4 bytes and features (all as utf-16)
- int worstCase = 6 + 4 + 2*(baseForm.length() + reading.length() + pronunciation.length());
+ // worst case: three short, 4 bytes, one vint and features (all as utf-16)
+ int worstCase = 6 + 4 + 2 + 2*(baseForm.length() + reading.length() + pronunciation.length());
if (worstCase > left) {
ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1));
buffer.flip();
@@ -103,13 +122,16 @@ public abstract class BinaryDictionaryWr
buffer.putShort(leftId);
buffer.putShort(rightId);
buffer.putShort(wordCost);
- assert posIndex.intValue() < 256;
- buffer.put(posIndex.byteValue());
+ assert posIndex.intValue() < 128;
+ buffer.put((byte) (posIndex.intValue() << 1 | hasInflData));
+
+ int pronunciationIsReading = pronunciation.equals(reading) ? 1 : 0;
if (baseForm.equals(entry[0])) {
- buffer.put((byte)0); // base form is the same as surface form
+ buffer.put((byte)pronunciationIsReading); // base form is the same as surface form
} else {
- buffer.put((byte)baseForm.length());
+ assert baseForm.length() < 128;
+ buffer.put((byte)(baseForm.length() << 1 | pronunciationIsReading));
for (int i = 0; i < baseForm.length(); i++) {
buffer.putChar(baseForm.charAt(i));
}
@@ -125,9 +147,7 @@ public abstract class BinaryDictionaryWr
}
}
- if (pronunciation.equals(reading)) {
- buffer.put((byte)0); // pronunciation is the same as reading
- } else {
+ if (pronunciationIsReading == 0) {
if (isKatakana(pronunciation)) {
buffer.put((byte) (pronunciation.length() << 1 | 1));
writeKatakana(pronunciation);
@@ -139,6 +159,17 @@ public abstract class BinaryDictionaryWr
}
}
+ if (hasInflData > 0) {
+ int key = inflIndex.intValue();
+ assert key < 32768; // note there are really like 300 of these...
+ if (key < 128) {
+ buffer.put((byte) key);
+ } else {
+ buffer.put((byte) ((key & 0x7f) | 0x80));
+ buffer.put((byte) (key >>> 7));
+ }
+ }
+
return buffer.position();
}
@@ -194,6 +225,7 @@ public abstract class BinaryDictionaryWr
writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
+ writeInflDict(baseName + BinaryDictionary.INFLDICT_FILENAME_SUFFIX);
}
// TODO: maybe this int[] should instead be the output to the FST...
@@ -242,6 +274,25 @@ public abstract class BinaryDictionaryWr
}
}
+ protected void writeInflDict(String filename) throws IOException {
+ new File(filename).getParentFile().mkdirs();
+ OutputStream os = new FileOutputStream(filename);
+ try {
+ os = new BufferedOutputStream(os);
+ final DataOutput out = new OutputStreamDataOutput(os);
+ CodecUtil.writeHeader(out, BinaryDictionary.INFLDICT_HEADER, BinaryDictionary.VERSION);
+ out.writeVInt(inflDict.size());
+ for (String s : inflDict) {
+ String data[] = CSVUtil.parse(s);
+ assert data.length == 2 : "malformed inflection: " + s;
+ out.writeString(data[0]);
+ out.writeString(data[1]);
+ }
+ } finally {
+ os.close();
+ }
+ }
+
protected void writeDictionary(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
final FileOutputStream os = new FileOutputStream(filename);
@@ -259,4 +310,58 @@ public abstract class BinaryDictionaryWr
}
}
+ // TODO: the below is messy, but makes the dictionary smaller.
+ // we track frequencies of inflections so the highest-freq ones have smaller indexes.
+
+ /** optional: notes inflection seen in the data up front */
+ public void noteInflection(String entry[]) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(CSVUtil.quoteEscape(entry[8]));
+ sb.append(',');
+ sb.append(CSVUtil.quoteEscape(entry[9]));
+ String s = sb.toString();
+ if ("*,*".equals(s)) {
+ return; // no inflection data
+ }
+ Integer freq = notedInflections.get(s);
+ if (freq == null) {
+ freq = 0;
+ }
+ notedInflections.put(s, freq+1);
+ }
+
+ /** prepopulates inflection mapping by frequency */
+ public void finalizeInflections() {
+ InflectionAndFreq freqs[] = new InflectionAndFreq[notedInflections.size()];
+ int upto = 0;
+ for (Map.Entry<String,Integer> e : notedInflections.entrySet()) {
+ freqs[upto++] = new InflectionAndFreq(e.getKey(), e.getValue());
+ }
+ Arrays.sort(freqs, Collections.reverseOrder());
+ for (int i = 0; i < upto; i++) {
+ inflDict.add(freqs[i].inflection);
+ inflDictLookup.put(freqs[i].inflection, i);
+ }
+ }
+
+ static class InflectionAndFreq implements Comparable<InflectionAndFreq> {
+ String inflection;
+ int freq;
+
+ InflectionAndFreq(String s, int i) {
+ this.inflection = s;
+ this.freq = i;
+ }
+
+ public int compareTo(InflectionAndFreq other) {
+ int cmp = freq - other.freq;
+ if (cmp == 0) {
+ return inflection.compareTo(other.inflection);
+ } else {
+ return cmp;
+ }
+ }
+ }
+
+ private HashMap<String,Integer> notedInflections = new HashMap<String,Integer>();
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java Mon Jan 9 08:10:37 2012
@@ -63,7 +63,6 @@ public final class ConnectionCostsWriter
for (short[] a : costs) {
assert a.length == forwardSize;
for (int i = 0; i < a.length; i++) {
- // TODO: when delta is 0, maybe we should RLE
int delta = (int)a[i] - last;
out.writeVInt((delta >> 31) ^ (delta << 1));
last = a[i];
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Mon Jan 9 08:10:37 2012
@@ -94,11 +94,15 @@ public class TokenInfoDictionaryBuilder
String line = null;
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);
+
if(entry.length < 13) {
System.out.println("Entry in CSV is not valid: " + line);
continue;
}
- lines.add(formatEntry(entry));
+
+ String[] formatted = formatEntry(entry);
+ dictionary.noteInflection(formatted);
+ lines.add(formatted);
// NFKC normalize dictionary entry
if (normalizeEntries) {
@@ -109,15 +113,19 @@ public class TokenInfoDictionaryBuilder
for (int i = 0; i < entry.length; i++) {
normalizedEntry[i] = normalizer.normalize(entry[i]);
}
-
- lines.add(formatEntry(normalizedEntry));
+
+ formatted = formatEntry(normalizedEntry);
+ dictionary.noteInflection(formatted);
+ lines.add(formatted);
}
}
}
+ dictionary.finalizeInflections();
+
System.out.println(" sort...");
- // sort by term
+ // sort by term, then cost, then all other features
Collections.sort(lines, new Comparator<String[]>() {
public int compare(String[] left, String[] right) {
return left[0].compareTo(right[0]);