You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2015/05/11 10:03:57 UTC
svn commit: r1678685 - in /lucene/dev/trunk/lucene: ./
analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/
analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/
analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/
Author: cm
Date: Mon May 11 08:03:56 2015
New Revision: 1678685
URL: http://svn.apache.org/r1678685
Log:
Fix for empty Kuromoji user dictionary NPE (LUCENE-6468)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1678685&r1=1678684&r2=1678685&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon May 11 08:03:56 2015
@@ -162,6 +162,9 @@ Bug Fixes
* LUCENE-6427: Added assertion about the presence of ghost bits in
(Fixed|Long)BitSet. (Luc Vanlerberghe via Adrien Grand)
+* LUCENE-6468: Fixed NPE with empty Kuromoji user dictionary.
+ (Jun Ohtani via Christian Moen)
+
API Changes
* LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java?rev=1678685&r1=1678684&r2=1678685&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java Mon May 11 08:03:56 2015
@@ -91,7 +91,7 @@ public class JapaneseTokenizerFactory ex
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
Reader reader = new InputStreamReader(stream, decoder);
- userDictionary = new UserDictionary(reader);
+ userDictionary = UserDictionary.open(reader);
} else {
userDictionary = null;
}
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java?rev=1678685&r1=1678684&r2=1678685&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java Mon May 11 08:03:56 2015
@@ -56,18 +56,18 @@ public final class UserDictionary implem
public static final int LEFT_ID = 5;
public static final int RIGHT_ID = 5;
-
- public UserDictionary(Reader reader) throws IOException {
+
+ public static UserDictionary open(Reader reader) throws IOException {
+
BufferedReader br = new BufferedReader(reader);
String line = null;
- int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
List<String[]> featureEntries = new ArrayList<>();
-
+
// text, segmentation, readings, POS
while ((line = br.readLine()) != null) {
// Remove comments
line = line.replaceAll("#.*$", "");
-
+
// Skip empty lines or comment lines
if (line.trim().length() == 0) {
continue;
@@ -75,7 +75,17 @@ public final class UserDictionary implem
String[] values = CSVUtil.parse(line);
featureEntries.add(values);
}
-
+
+ if (featureEntries.isEmpty()) {
+ return null;
+ } else {
+ return new UserDictionary(featureEntries);
+ }
+ }
+
+ private UserDictionary(List<String[]> featureEntries) throws IOException {
+
+ int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
// TODO: should we allow multiple segmentations per input 'phrase'?
// the old treemap didn't support this either, and i'm not sure if it's needed/useful?
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java?rev=1678685&r1=1678684&r2=1678685&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java Mon May 11 08:03:56 2015
@@ -22,6 +22,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
+import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Random;
@@ -39,7 +40,8 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
-public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
+public class
+ TestJapaneseTokenizer extends BaseTokenStreamTestCase {
public static UserDictionary readDict() {
InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
@@ -49,7 +51,7 @@ public class TestJapaneseTokenizer exten
try {
try {
Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
- return new UserDictionary(reader);
+ return UserDictionary.open(reader);
} finally {
is.close();
}
@@ -686,4 +688,24 @@ public class TestJapaneseTokenizer exten
new int[] { 1, 1, 1, 1, 1},
new int[] { 1, 1, 1, 1, 1});
}
+
+ public void testEmptyUserDict() throws Exception {
+ Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
+ UserDictionary emptyDict = UserDictionary.open(emptyReader);
+
+ Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ assertAnalyzesTo(analyzer, "これは本ではない",
+ new String[]{"これ", "は", "本", "で", "は", "ない"},
+ new int[]{0, 2, 3, 4, 5, 6},
+ new int[]{2, 3, 4, 5, 6, 8}
+ );
+ analyzer.close();
+ }
}