You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by so...@apache.org on 2019/06/20 22:39:20 UTC
[lucene-solr] branch master updated: LUCENE-8863: enhance Kuromoji
DictionaryBuilder tool added tests enabled ids up to 8191 support loading
custom system dictionary from filesystem or classpath
This is an automated email from the ASF dual-hosted git repository.
sokolov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 4502065 LUCENE-8863: enhance Kuromoji DictionaryBuilder tool added tests enabled ids up to 8191 support loading custom system dictionary from filesystem or classpath
4502065 is described below
commit 4502065f03654af204f23d7c90ee95c28d97f987
Author: Michael Sokolov <so...@amazon.com>
AuthorDate: Sat Jun 15 14:59:46 2019 -0400
LUCENE-8863: enhance Kuromoji DictionaryBuilder tool
added tests
enabled ids up to 8191
support loading custom system dictionary from filesystem or classpath
---
lucene/analysis/kuromoji/build.xml | 4 +-
.../lucene/analysis/ja/dict/BinaryDictionary.java | 54 ++++++++++++--
.../analysis/ja/dict/TokenInfoDictionary.java | 15 +++-
.../analysis/ja/util/BinaryDictionaryWriter.java | 43 ++++++++---
.../ja/util/TokenInfoDictionaryBuilder.java | 3 +-
.../analysis/ja/dict/TokenInfoDictionaryTest.java | 85 ++++++++++++++++++++++
.../analysis/ja/dict/UnknownDictionaryTest.java | 22 +-----
7 files changed, 185 insertions(+), 41 deletions(-)
diff --git a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml
index decfa7a..094e2bd 100644
--- a/lucene/analysis/kuromoji/build.xml
+++ b/lucene/analysis/kuromoji/build.xml
@@ -136,8 +136,8 @@
</compile>
</target>
- <target name="test-tools" depends="compile-tools-tests">
- <test-macro dataDir="src/tools/test" junit.classpath="tools.test.classpath"/>
+ <target name="test-tools" depends="install-junit4-taskdef, compile-tools-tests">
+ <test-macro testsDir="${build.dir}/classes/tools-test" workDir="src/tools/test" junit.classpath="tools.test.classpath"/>
</target>
<target name="compile-test" depends="module-build.compile-test, compile-tools-tests"/>
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java
index 180c0a9..ca4eb2c 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java
@@ -23,6 +23,8 @@ import java.io.IOException;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Paths;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
@@ -37,6 +39,10 @@ import org.apache.lucene.util.IOUtils;
*/
public abstract class BinaryDictionary implements Dictionary {
+ enum ResourceScheme {
+ CLASSPATH, FILE
+ }
+
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
@@ -46,6 +52,8 @@ public abstract class BinaryDictionary implements Dictionary {
public static final String POSDICT_HEADER = "kuromoji_dict_pos";
public static final int VERSION = 1;
+ private final ResourceScheme resourceScheme;
+ private final String resourcePath;
private final ByteBuffer buffer;
private final int[] targetMapOffsets, targetMap;
private final String[] posDict;
@@ -53,6 +61,24 @@ public abstract class BinaryDictionary implements Dictionary {
private final String[] inflFormDict;
protected BinaryDictionary() throws IOException {
+ this(ResourceScheme.CLASSPATH, null);
+ }
+
+ /**
+ * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
+ * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
+ * this class's name as the path.
+ */
+ protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
+ this.resourceScheme = resourceScheme;
+ if (resourcePath == null) {
+ if (resourceScheme != ResourceScheme.CLASSPATH) {
+ throw new IllegalArgumentException("resourcePath must be supplied with FILE resource scheme");
+ }
+ this.resourcePath = getClass().getName().replace('.', '/');
+ } else {
+ this.resourcePath = resourcePath;
+ }
InputStream mapIS = null, dictIS = null, posIS = null;
int[] targetMapOffsets = null, targetMap = null;
String[] posDict = null;
@@ -78,7 +104,9 @@ public abstract class BinaryDictionary implements Dictionary {
targetMap[ofs] = accum;
}
if (sourceId + 1 != targetMapOffsets.length)
- throw new IOException("targetMap file format broken");
+ throw new IOException("targetMap file format broken; targetMap.length=" + targetMap.length
+ + ", targetMapOffsets.length=" + targetMapOffsets.length
+ + ", sourceId=" + sourceId);
targetMapOffsets[sourceId] = targetMap.length;
mapIS.close(); mapIS = null;
@@ -135,17 +163,33 @@ public abstract class BinaryDictionary implements Dictionary {
}
protected final InputStream getResource(String suffix) throws IOException {
- return getClassResource(getClass(), suffix);
+ switch(resourceScheme) {
+ case CLASSPATH:
+ return getClassResource(resourcePath + suffix);
+ case FILE:
+ return Files.newInputStream(Paths.get(resourcePath + suffix));
+ default:
+ throw new IllegalStateException("unknown resource scheme " + resourceScheme);
+ }
}
// util, reused by ConnectionCosts and CharacterDefinition
public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
- if (is == null)
+ if (is == null) {
throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
+ }
return is;
}
+ private InputStream getClassResource(String path) throws IOException {
+ final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path);
+ if (is == null) {
+ throw new FileNotFoundException("Not in classpath: " + path);
+ }
+ return is;
+ }
+
public void lookupWordIds(int sourceId, IntsRef ref) {
ref.ints = targetMap;
ref.offset = targetMapOffsets[sourceId];
@@ -155,12 +199,12 @@ public abstract class BinaryDictionary implements Dictionary {
@Override
public int getLeftId(int wordId) {
- return buffer.getShort(wordId) >>> 3;
+ return (buffer.getShort(wordId) & 0xffff) >>> 3;
}
@Override
public int getRightId(int wordId) {
- return buffer.getShort(wordId) >>> 3;
+ return (buffer.getShort(wordId) & 0xffff) >>> 3;
}
@Override
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
index 6b75138..662ebb5 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
@@ -36,10 +36,15 @@ public final class TokenInfoDictionary extends BinaryDictionary {
private final TokenInfoFST fst;
- private TokenInfoDictionary() throws IOException {
- super();
+ /**
+ * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
+ * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
+ * this class's name as the path.
+ */
+ TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
+ super(resourceScheme, resourcePath);
InputStream is = null;
- FST<Long> fst = null;
+ FST<Long> fst;
boolean success = false;
try {
is = getResource(FST_FILENAME_SUFFIX);
@@ -56,6 +61,10 @@ public final class TokenInfoDictionary extends BinaryDictionary {
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);
}
+
+ private TokenInfoDictionary() throws IOException {
+ this(ResourceScheme.CLASSPATH, null);
+ }
public TokenInfoFST getFST() {
return fst;
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
index a6d48cc..a6ef6bb 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
@@ -35,6 +35,8 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.analysis.ja.dict.BinaryDictionary;
public abstract class BinaryDictionaryWriter {
+ private final static int ID_LIMIT = 8192;
+
protected final Class<? extends BinaryDictionary> implClazz;
protected ByteBuffer buffer;
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
@@ -71,7 +73,9 @@ public abstract class BinaryDictionaryWriter {
}
String posData = sb.toString();
-
+ if (posData.isEmpty()) {
+ throw new IllegalArgumentException("POS fields are empty");
+ }
sb.setLength(0);
sb.append(CSVUtil.quoteEscape(posData));
sb.append(',');
@@ -100,6 +104,9 @@ public abstract class BinaryDictionaryWriter {
}
int flags = 0;
+ if (baseForm.isEmpty()) {
+ throw new IllegalArgumentException("base form is empty");
+ }
if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
flags |= BinaryDictionary.HAS_BASEFORM;
}
@@ -110,8 +117,12 @@ public abstract class BinaryDictionaryWriter {
flags |= BinaryDictionary.HAS_PRONUNCIATION;
}
- assert leftId == rightId;
- assert leftId < 4096; // there are still unused bits
+ if (leftId != rightId) {
+ throw new IllegalArgumentException("rightId != leftId: " + rightId + " " +leftId);
+ }
+ if (leftId >= ID_LIMIT) {
+ throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId);
+ }
// add pos mapping
int toFill = 1+leftId - posDict.size();
for (int i = 0; i < toFill; i++) {
@@ -119,14 +130,19 @@ public abstract class BinaryDictionaryWriter {
}
String existing = posDict.get(leftId);
- assert existing == null || existing.equals(fullPOSData);
+ if (existing != null && existing.equals(fullPOSData) == false) {
+ // TODO: test me
+ throw new IllegalArgumentException("Multiple entries found for leftID=" + leftId);
+ }
posDict.set(leftId, fullPOSData);
buffer.putShort((short)(leftId << 3 | flags));
buffer.putShort(wordCost);
if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
- assert baseForm.length() < 16;
+ if (baseForm.length() >= 16) {
+ throw new IllegalArgumentException("Length of base form " + baseForm + " is >= 16");
+ }
int shared = sharedPrefix(entry[0], baseForm);
int suffix = baseForm.length() - shared;
buffer.put((byte) (shared << 4 | suffix));
@@ -204,16 +220,17 @@ public abstract class BinaryDictionaryWriter {
}
public void addMapping(int sourceId, int wordId) {
- assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
+ if (wordId <= lastWordId) {
+ throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
+ }
if (sourceId > lastSourceId) {
- assert sourceId > lastSourceId : "source ids out of order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId;
targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1);
for (int i = lastSourceId + 1; i <= sourceId; i++) {
targetMapOffsets[i] = targetMapEndOffset;
}
- } else {
- assert sourceId == lastSourceId;
+ } else if (sourceId != lastSourceId) {
+ throw new IllegalStateException("source ids not in increasing order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId);
}
targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1);
@@ -265,7 +282,9 @@ public abstract class BinaryDictionaryWriter {
}
prev += delta;
}
- assert sourceId == numSourceIds : "sourceId:"+sourceId+" != numSourceIds:"+numSourceIds;
+ if (sourceId != numSourceIds) {
+ throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
+ }
} finally {
os.close();
}
@@ -286,7 +305,9 @@ public abstract class BinaryDictionaryWriter {
out.writeByte((byte)0);
} else {
String data[] = CSVUtil.parse(s);
- assert data.length == 3 : "malformed pos/inflection: " + s;
+ if (data.length != 3) {
+ throw new IllegalArgumentException("Malformed pos/inflection: " + s + "; expected 3 characters");
+ }
out.writeString(data[0]);
out.writeString(data[1]);
out.writeString(data[2]);
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
index 7175308..465a432 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
@@ -96,8 +96,7 @@ public class TokenInfoDictionaryBuilder {
String[] entry = CSVUtil.parse(line);
if(entry.length < 13) {
- System.out.println("Entry in CSV is not valid: " + line);
- continue;
+ throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line);
}
String[] formatted = formatEntry(entry);
diff --git a/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java b/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
new file mode 100644
index 0000000..0f7609f
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja.dict;
+
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
+import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryBuilder;
+import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryWriter;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.LuceneTestCase;
+
+import static java.io.File.separatorChar;
+import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
+
+/**
+ * Tests of TokenInfoDictionary build tools; run using ant test-tools
+ */
+public class TokenInfoDictionaryTest extends LuceneTestCase {
+
+ public void testPut() throws Exception {
+ TokenInfoDictionary dict = newDictionary("名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*",
+ // "large" id
+ "一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*");
+ IntsRef wordIdRef = new IntsRefBuilder().get();
+
+ dict.lookupWordIds(0, wordIdRef);
+ int wordId = wordIdRef.ints[wordIdRef.offset];
+ assertEquals(5000, dict.getLeftId(wordId));
+ assertEquals(5000, dict.getRightId(wordId));
+ assertEquals(3, dict.getWordCost(wordId));
+
+ dict.lookupWordIds(1, wordIdRef);
+ wordId = wordIdRef.ints[wordIdRef.offset];
+ assertEquals(1, dict.getLeftId(wordId));
+ assertEquals(1, dict.getRightId(wordId));
+ assertEquals(2, dict.getWordCost(wordId));
+ }
+
+ private TokenInfoDictionary newDictionary(String... entries) throws Exception {
+ Path dir = createTempDir();
+ try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
+ PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
+ for (String entry : entries) {
+ printer.println(entry);
+ }
+ }
+ TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder(DictionaryFormat.IPADIC, "utf-8", true);
+ TokenInfoDictionaryWriter writer = builder.build(dir.toString());
+ writer.write(dir.toString());
+ String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar);
+ // We must also load the other files (in BinaryDictionary) from the correct path
+ return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
+ }
+
+ public void testPutException() throws Exception {
+ // too few columns
+ expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1,1,1,名詞,一般,*,*,*,*,*"));
+ // left id != right id
+ expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
+ // left id != right id
+ expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
+ // id too large
+ expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,8192,8192,1,名詞,一般,*,*,*,*,*,*,*"));
+ }
+}
diff --git a/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java b/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java
index 299e8c6..6330f41 100644
--- a/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java
+++ b/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java
@@ -29,19 +29,9 @@ public class UnknownDictionaryTest extends LuceneTestCase {
public void testPutCharacterCategory() {
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
- try{
- unkDic.putCharacterCategory(0, "DUMMY_NAME");
- fail();
- } catch(Exception e) {
-
- }
+ expectThrows(Exception.class, () -> unkDic.putCharacterCategory(0, "DUMMY_NAME"));
- try{
- unkDic.putCharacterCategory(-1, "KATAKANA");
- fail();
- } catch(Exception e) {
-
- }
+ expectThrows(Exception.class, () -> unkDic.putCharacterCategory(-1, "KATAKANA"));
unkDic.putCharacterCategory(0, "DEFAULT");
unkDic.putCharacterCategory(1, "GREEK");
@@ -53,12 +43,8 @@ public class UnknownDictionaryTest extends LuceneTestCase {
@Test
public void testPut() {
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
- try{
- unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*"));
- fail();
- } catch(Exception e){
-
- }
+ expectThrows(NumberFormatException.class, () ->
+ unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*")));
String entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
String entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";