You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/06 05:17:54 UTC
svn commit: r1227952 - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/dict/
resources/org/apache/lucene/analysis/kuromoji/dict/
test/org/apache/lucene/analysis/kuromoji/ tools/java/org/apache/...
Author: rmuir
Date: Fri Jan 6 04:17:52 2012
New Revision: 1227952
URL: http://svn.apache.org/viewvc?rev=1227952&view=rev
Log:
LUCENE-3305: deduplicate POS, reduces tokeninfodict 14MB
Added:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat (with props)
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Fri Jan 6 04:17:52 2012
@@ -37,18 +37,22 @@ public abstract class BinaryDictionary i
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
+ public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
- public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
public static final String DICT_HEADER = "kuromoji_dict";
+ public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
+ public static final String POSDICT_HEADER = "kuromoji_dict_pos";
public static final int VERSION = 1;
private final ByteBuffer buffer;
private final int[][] targetMap;
+ private final String[] posDict;
protected BinaryDictionary() throws IOException {
- InputStream mapIS = null, dictIS = null;
+ InputStream mapIS = null, dictIS = null, posIS = null;
IOException priorE = null;
int[][] targetMap = null;
+ String[] posDict = null;
ByteBuffer buffer = null;
try {
mapIS = getClass().getResourceAsStream(getClass().getSimpleName() + TARGETMAP_FILENAME_SUFFIX);
@@ -72,6 +76,17 @@ public abstract class BinaryDictionary i
j++;
}
}
+
+ posIS = getClass().getResourceAsStream(getClass().getSimpleName() + POSDICT_FILENAME_SUFFIX);
+ if (posIS == null)
+ throw new FileNotFoundException("Not in classpath: " + getClass().getName().replace('.','/') + POSDICT_FILENAME_SUFFIX);
+ posIS = new BufferedInputStream(posIS);
+ in = new InputStreamDataInput(posIS);
+ CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
+ posDict = new String[in.readVInt()];
+ for (int j = 0; j < posDict.length; j++) {
+ posDict[j] = in.readString();
+ }
dictIS = getClass().getResourceAsStream(getClass().getSimpleName() + DICT_FILENAME_SUFFIX);
if (dictIS == null)
@@ -89,10 +104,11 @@ public abstract class BinaryDictionary i
} catch (IOException ioe) {
priorE = ioe;
} finally {
- IOUtils.closeWhileHandlingException(priorE, mapIS, dictIS);
+ IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, dictIS);
}
this.targetMap = targetMap;
+ this.posDict = posDict;
this.buffer = buffer;
}
@@ -115,13 +131,22 @@ public abstract class BinaryDictionary i
return buffer.getShort(wordId + 4); // Skip left id and right id
}
+ // TODO: this method will likely never be efficient, do we need it?
@Override
public String[] getAllFeaturesArray(int wordId) {
- int size = buffer.getShort(wordId + 6) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
- char[] targetArr = new char[size];
- int offset = wordId + 6 + 2; // offset is position where features string starts
+ char posIndex = buffer.getChar(wordId + 6); // read index into posDict
+ String pos = posDict[posIndex];
+ int posLen = pos.length();
+ int size = buffer.getShort(wordId + 8) / 2; // Read length of feature String. Skip 8 bytes, see data structure.
+ char[] targetArr = new char[posLen + 1 + size]; // pos + separator + the rest
+ int offset = wordId + 8 + 2; // offset is position where features string starts
+ for (int i = 0; i < pos.length(); i++) {
+ targetArr[i] = pos.charAt(i);
+ }
+ int upto = posLen;
+ targetArr[upto++] = INTERNAL_SEPARATOR.charAt(0);
for(int i = 0; i < size; i++){
- targetArr[i] = buffer.getChar(offset + i * 2);
+ targetArr[upto++] = buffer.getChar(offset + i * 2);
}
String allFeatures = new String(targetArr);
return allFeatures.split(INTERNAL_SEPARATOR);
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
Binary files - no diff available.
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1227952&view=auto
==============================================================================
Binary file - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
Binary files - no diff available.
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1227952&view=auto
==============================================================================
Binary file - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java Fri Jan 6 04:17:52 2012
@@ -91,6 +91,21 @@ public class TokenizerTest extends Lucen
assertNull(tokens.get(8).getBaseForm());
}
+ @Test
+ public void testPartOfSpeech() {
+ List<Token> tokens = tokenizer.tokenize("これはまだ実験段階にあります。");
+ assertEquals(9, tokens.size());
+ assertEquals("名詞,代名詞,一般,*", tokens.get(0).getPartOfSpeech());
+ assertEquals("助詞,係助詞,*,*", tokens.get(1).getPartOfSpeech());
+ assertEquals("副詞,助詞類接続,*,*", tokens.get(2).getPartOfSpeech());
+ assertEquals("名詞,サ変接続,*,*", tokens.get(3).getPartOfSpeech());
+ assertEquals("名詞,一般,*,*", tokens.get(4).getPartOfSpeech());
+ assertEquals("助詞,格助詞,一般,*", tokens.get(5).getPartOfSpeech());
+ assertEquals("動詞,自立,*,*", tokens.get(6).getPartOfSpeech());
+ assertEquals("助動詞,*,*,*", tokens.get(7).getPartOfSpeech());
+ assertEquals("記号,句点,*,*", tokens.get(8).getPartOfSpeech());
+ }
+
public void testBocchan() throws Exception {
doTestBocchan(1);
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java Fri Jan 6 04:17:52 2012
@@ -25,6 +25,8 @@ import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
+import java.util.ArrayList;
+import java.util.List;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
@@ -38,7 +40,8 @@ import org.apache.lucene.analysis.kuromo
public class TokenInfoDictionaryWriter {
protected ByteBuffer buffer;
protected int[][] targetMap = new int[1][];
-
+ protected List<String> posDict = new ArrayList<String>();
+
public TokenInfoDictionaryWriter(int size) {
targetMap = new int[1][];
buffer = ByteBuffer.allocate(size);
@@ -54,7 +57,23 @@ public class TokenInfoDictionaryWriter {
short wordCost = Short.parseShort(entry[3]);
StringBuilder sb = new StringBuilder();
- for (int i = 4; i < entry.length; i++){
+
+ // build up the POS string
+ for (int i = 4; i < 10; i++) {
+ sb.append(entry[i]);
+ if (i < 9) {
+ sb.append(Dictionary.INTERNAL_SEPARATOR);
+ }
+ }
+ String pos = sb.toString();
+ int posIndex = posDict.indexOf(pos);
+ if (posIndex < 0) {
+ posIndex = posDict.size();
+ posDict.add(pos);
+ }
+
+ sb.setLength(0);
+ for (int i = 10; i < entry.length; i++){
sb.append(entry[i]).append(Dictionary.INTERNAL_SEPARATOR);
}
String features = sb.deleteCharAt(sb.length() - 1).toString();
@@ -62,7 +81,7 @@ public class TokenInfoDictionaryWriter {
// extend buffer if necessary
int left = buffer.remaining();
- if (8 + featuresSize > left) { // four short and features
+ if (10 + featuresSize > left) { // five short and features
ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + 1, 1));
buffer.flip();
newBuffer.put(buffer);
@@ -72,6 +91,8 @@ public class TokenInfoDictionaryWriter {
buffer.putShort(leftId);
buffer.putShort(rightId);
buffer.putShort(wordCost);
+ assert posIndex < Character.MAX_VALUE;
+ buffer.putChar((char)posIndex);
buffer.putShort((short)featuresSize);
for (char c : features.toCharArray()){
buffer.putChar(c);
@@ -111,6 +132,7 @@ public class TokenInfoDictionaryWriter {
public void write(String baseDir) throws IOException {
writeDictionary(baseDir + File.separator + TokenInfoDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.DICT_FILENAME_SUFFIX);
writeTargetMap(baseDir + File.separator + TokenInfoDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
+ writePosDict(baseDir + File.separator + TokenInfoDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
}
protected void writeTargetMap(String filename) throws IOException {
@@ -150,6 +172,22 @@ public class TokenInfoDictionaryWriter {
}
}
+ protected void writePosDict(String filename) throws IOException {
+ new File(filename).getParentFile().mkdirs();
+ OutputStream os = new FileOutputStream(filename);
+ try {
+ os = new BufferedOutputStream(os);
+ final DataOutput out = new OutputStreamDataOutput(os);
+ CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
+ out.writeVInt(posDict.size());
+ for (String s : posDict) {
+ out.writeString(s);
+ }
+ } finally {
+ os.close();
+ }
+ }
+
protected void writeDictionary(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
final FileOutputStream os = new FileOutputStream(filename);
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java Fri Jan 6 04:17:52 2012
@@ -51,6 +51,7 @@ public class UnknownDictionaryWriter ext
public void write(String baseDir) throws IOException {
writeDictionary(baseDir + File.separator + UnknownDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.DICT_FILENAME_SUFFIX);
writeTargetMap(baseDir + File.separator + UnknownDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
+ writePosDict(baseDir + File.separator + UnknownDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
characterDefinition.write(baseDir);
}
}