You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/17 03:12:28 UTC
svn commit: r1232265 - in /lucene/dev/trunk/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/
java/org/apache/lucene/analysis/kuromoji/dict/
resources/org/apache/lucene/analysis/kuromoji/dict/
test/org/apache/lucene/analysis/kuro...
Author: rmuir
Date: Tue Jan 17 02:12:27 2012
New Revision: 1232265
URL: http://svn.apache.org/viewvc?rev=1232265&view=rev
Log:
LUCENE-3699: share baseform with surface and flag if the reading can be computed from surface
Modified:
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Tue Jan 17 02:12:27 2012
@@ -75,14 +75,14 @@ public class Token {
* @return reading. null if token doesn't have reading.
*/
public String getReading() {
- return dictionary.getReading(wordId);
+ return dictionary.getReading(wordId, surfaceForm, offset, length);
}
/**
* @return pronunciation. null if token doesn't have pronunciation.
*/
public String getPronunciation() {
- return dictionary.getPronunciation(wordId);
+ return dictionary.getPronunciation(wordId, surfaceForm, offset, length);
}
/**
@@ -110,7 +110,7 @@ public class Token {
* @return base form or null if token is not inflected
*/
public String getBaseForm() {
- return dictionary.getBaseForm(wordId);
+ return dictionary.getBaseForm(wordId, surfaceForm, offset, length);
}
/**
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Tue Jan 17 02:12:27 2012
@@ -149,12 +149,12 @@ public abstract class BinaryDictionary i
@Override
public int getLeftId(int wordId) {
- return buffer.getShort(wordId) >>> 2;
+ return buffer.getShort(wordId) >>> 3;
}
@Override
public int getRightId(int wordId) {
- return buffer.getShort(wordId) >>> 2;
+ return buffer.getShort(wordId) >>> 3;
}
@Override
@@ -163,21 +163,42 @@ public abstract class BinaryDictionary i
}
@Override
- public String getBaseForm(int wordId) {
+ public String getBaseForm(int wordId, char surfaceForm[], int off, int len) {
if (hasBaseFormData(wordId)) {
int offset = baseFormOffset(wordId);
- int length = buffer.get(offset++) & 0xff;
- return readString(offset, length, false);
+ int data = buffer.get(offset++) & 0xff;
+ int prefix = data >>> 4;
+ int suffix = data & 0xF;
+ char text[] = new char[prefix+suffix];
+ System.arraycopy(surfaceForm, off, text, 0, prefix);
+ for (int i = 0; i < suffix; i++) {
+ text[prefix+i] = buffer.getChar(offset + (i << 1));
+ }
+ return new String(text);
} else {
return null;
}
}
@Override
- public String getReading(int wordId) {
- int offset = readingOffset(wordId);
- int readingData = buffer.get(offset++) & 0xff;
- return readString(offset, readingData >>> 1, (readingData & 1) == 1);
+ public String getReading(int wordId, char surface[], int off, int len) {
+ if (hasReadingData(wordId)) {
+ int offset = readingOffset(wordId);
+ int readingData = buffer.get(offset++) & 0xff;
+ return readString(offset, readingData >>> 1, (readingData & 1) == 1);
+ } else {
+ // the reading is the surface form, with hiragana shifted to katakana
+ char text[] = new char[len];
+ for (int i = 0; i < len; i++) {
+ char ch = surface[off+i];
+ if (ch > 0x3040 && ch < 0x3097) {
+ text[i] = (char)(ch + 0x60);
+ } else {
+ text[i] = ch;
+ }
+ }
+ return new String(text);
+ }
}
@Override
@@ -186,13 +207,13 @@ public abstract class BinaryDictionary i
}
@Override
- public String getPronunciation(int wordId) {
+ public String getPronunciation(int wordId, char surface[], int off, int len) {
if (hasPronunciationData(wordId)) {
int offset = pronunciationOffset(wordId);
int pronunciationData = buffer.get(offset++) & 0xff;
return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
} else {
- return getReading(wordId); // same as the reading
+ return getReading(wordId, surface, off, len); // same as the reading
}
}
@@ -213,7 +234,7 @@ public abstract class BinaryDictionary i
private int readingOffset(int wordId) {
int offset = baseFormOffset(wordId);
if (hasBaseFormData(wordId)) {
- int baseFormLength = buffer.get(offset++) & 0xff;
+ int baseFormLength = buffer.get(offset++) & 0xf;
return offset + (baseFormLength << 1);
} else {
return offset;
@@ -221,21 +242,29 @@ public abstract class BinaryDictionary i
}
private int pronunciationOffset(int wordId) {
- int offset = readingOffset(wordId);
- int readingData = buffer.get(offset++) & 0xff;
- final int readingLength;
- if ((readingData & 1) == 0) {
- readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+ if (hasReadingData(wordId)) {
+ int offset = readingOffset(wordId);
+ int readingData = buffer.get(offset++) & 0xff;
+ final int readingLength;
+ if ((readingData & 1) == 0) {
+ readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+ } else {
+ readingLength = readingData >>> 1;
+ }
+ return offset + readingLength;
} else {
- readingLength = readingData >>> 1;
+ return readingOffset(wordId);
}
- return offset + readingLength;
}
private boolean hasBaseFormData(int wordId) {
return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
}
+ private boolean hasReadingData(int wordId) {
+ return (buffer.getShort(wordId) & HAS_READING) != 0;
+ }
+
private boolean hasPronunciationData(int wordId) {
return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
}
@@ -256,6 +285,8 @@ public abstract class BinaryDictionary i
/** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
public static final int HAS_BASEFORM = 1;
+ /** flag that the entry has reading data. otherwise reading is surface form converted to katakana */
+ public static final int HAS_READING = 2;
/** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
- public static final int HAS_PRONUNCIATION = 2;
+ public static final int HAS_PRONUNCIATION = 4;
}
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Tue Jan 17 02:12:27 2012
@@ -54,21 +54,21 @@ public interface Dictionary {
* @param wordId word ID of token
* @return Reading of the token
*/
- public String getReading(int wordId);
+ public String getReading(int wordId, char surface[], int off, int len);
/**
* Get base form of word
* @param wordId word ID of token
* @return Base form (only different for inflected words, otherwise null)
*/
- public String getBaseForm(int wordId);
+ public String getBaseForm(int wordId, char surface[], int off, int len);
/**
* Get pronunciation of tokens
* @param wordId word ID of token
* @return Pronunciation of the token
*/
- public String getPronunciation(int wordId);
+ public String getPronunciation(int wordId, char surface[], int off, int len);
/**
* Get inflection type of tokens
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Tue Jan 17 02:12:27 2012
@@ -51,7 +51,7 @@ public final class UnknownDictionary ext
}
@Override
- public String getReading(int wordId) {
+ public String getReading(int wordId, char surface[], int off, int len) {
return null;
}
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Tue Jan 17 02:12:27 2012
@@ -196,7 +196,7 @@ public final class UserDictionary implem
}
@Override
- public String getReading(int wordId) {
+ public String getReading(int wordId, char surface[], int off, int len) {
return getFeature(wordId, 0);
}
@@ -206,12 +206,12 @@ public final class UserDictionary implem
}
@Override
- public String getBaseForm(int wordId) {
+ public String getBaseForm(int wordId, char surface[], int off, int len) {
return null; // TODO: add support?
}
@Override
- public String getPronunciation(int wordId) {
+ public String getPronunciation(int wordId, char surface[], int off, int len) {
return null; // TODO: add support?
}
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java Tue Jan 17 02:12:27 2012
@@ -61,7 +61,7 @@ public class TestTokenInfoDictionary ext
assertTrue(wordId > lastWordId);
lastWordId = wordId;
- String baseForm = tid.getBaseForm(wordId);
+ String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
String inflectionForm = tid.getInflectionForm(wordId);
@@ -91,11 +91,11 @@ public class TestTokenInfoDictionary ext
// check that its actually an ipadic pos tag
assertNotNull(ToStringUtil.getPOSTranslation(pos));
- String pronunciation = tid.getPronunciation(wordId);
+ String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
assertNotNull(pronunciation);
assertTrue(UnicodeUtil.validUTF16String(pronunciation));
- String reading = tid.getReading(wordId);
+ String reading = tid.getReading(wordId, chars, 0, chars.length);
assertNotNull(reading);
assertTrue(UnicodeUtil.validUTF16String(reading));
}
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Tue Jan 17 02:12:27 2012
@@ -73,12 +73,12 @@ public class UserDictionaryTest extends
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
- assertEquals("ニホン", dictionary.getReading(wordIdNihon));
+ assertEquals("ニホン", dictionary.getReading(wordIdNihon, "日本".toCharArray(), 0, 2));
result = dictionary.lookup("朝青龍".toCharArray(), 0, 3);
assertEquals(1, result.length);
int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
- assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
+ assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
}
@Test
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Tue Jan 17 02:12:27 2012
@@ -103,12 +103,15 @@ public abstract class BinaryDictionaryWr
if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
flags |= BinaryDictionary.HAS_BASEFORM;
}
+ if (!reading.equals(toKatakana(entry[0]))) {
+ flags |= BinaryDictionary.HAS_READING;
+ }
if (!pronunciation.equals(reading)) {
flags |= BinaryDictionary.HAS_PRONUNCIATION;
}
assert leftId == rightId;
- assert leftId < 8192; // there are still unused bits
+ assert leftId < 4096; // there are still unused bits
// add pos mapping
int toFill = 1+leftId - posDict.size();
for (int i = 0; i < toFill; i++) {
@@ -119,27 +122,36 @@ public abstract class BinaryDictionaryWr
assert existing == null || existing.equals(fullPOSData);
posDict.set(leftId, fullPOSData);
- buffer.putShort((short)(leftId << 2 | flags));
+ buffer.putShort((short)(leftId << 3 | flags));
buffer.putShort(wordCost);
if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
- buffer.put((byte) baseForm.length());
- for (int i = 0; i < baseForm.length(); i++) {
+ assert baseForm.length() < 16;
+ int shared = sharedPrefix(entry[0], baseForm);
+ int suffix = baseForm.length() - shared;
+ buffer.put((byte) (shared << 4 | suffix));
+ for (int i = shared; i < baseForm.length(); i++) {
buffer.putChar(baseForm.charAt(i));
}
}
- if (isKatakana(reading)) {
- buffer.put((byte) (reading.length() << 1 | 1));
- writeKatakana(reading);
- } else {
- buffer.put((byte) (reading.length() << 1));
- for (int i = 0; i < reading.length(); i++) {
- buffer.putChar(reading.charAt(i));
+ if ((flags & BinaryDictionary.HAS_READING) != 0) {
+ if (isKatakana(reading)) {
+ buffer.put((byte) (reading.length() << 1 | 1));
+ writeKatakana(reading);
+ } else {
+ buffer.put((byte) (reading.length() << 1));
+ for (int i = 0; i < reading.length(); i++) {
+ buffer.putChar(reading.charAt(i));
+ }
}
}
if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
+ // we can save 150KB here, but it makes the reader a little complicated.
+ // int shared = sharedPrefix(reading, pronunciation);
+ // buffer.put((byte) shared);
+ // pronunciation = pronunciation.substring(shared);
if (isKatakana(pronunciation)) {
buffer.put((byte) (pronunciation.length() << 1 | 1));
writeKatakana(pronunciation);
@@ -170,6 +182,27 @@ public abstract class BinaryDictionaryWr
}
}
+ private String toKatakana(String s) {
+ char text[] = new char[s.length()];
+ for (int i = 0; i < s.length(); i++) {
+ char ch = s.charAt(i);
+ if (ch > 0x3040 && ch < 0x3097) {
+ text[i] = (char)(ch + 0x60);
+ } else {
+ text[i] = ch;
+ }
+ }
+ return new String(text);
+ }
+
+ public static int sharedPrefix(String left, String right) {
+ int len = left.length() < right.length() ? left.length() : right.length();
+ for (int i = 0; i < len; i++)
+ if (left.charAt(i) != right.charAt(i))
+ return i;
+ return len;
+ }
+
public void addMapping(int sourceId, int wordId) {
assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;