You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2019/08/14 03:04:59 UTC
[lucene-solr] branch master updated: LUCENE-8933: Validate JapaneseTokenizer user dictionary entry (#809)
This is an automated email from the ASF dual-hosted git repository.
tomoko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 73ba88a LUCENE-8933: Validate JapaneseTokenizer user dictionary entry (#809)
73ba88a is described below
commit 73ba88a50dec64f367caa88d277c26dfd1d8883b
Author: Tomoko Uchida <to...@apache.org>
AuthorDate: Wed Aug 14 12:04:52 2019 +0900
LUCENE-8933: Validate JapaneseTokenizer user dictionary entry (#809)
* LUCENE-8933: Validate that the concatenated segmentation of a JapaneseTokenizer user dictionary entry is the same as its surface form.
---
lucene/CHANGES.txt | 2 ++
lucene/MIGRATE.txt | 12 ++++++++++++
.../apache/lucene/analysis/ja/dict/UserDictionary.java | 8 ++++++++
.../lucene/analysis/ja/dict/UserDictionaryTest.java | 16 ++++++++++++++++
4 files changed, 38 insertions(+)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1db0da0..46b0bd9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -33,6 +33,8 @@ API Changes
* LUCENE-8948: Change "name" argument in ICU factories to "form". Here, "form" is
named after "Unicode Normalization Form". (Tomoko Uchida)
+* LUCENE-8933: Validate JapaneseTokenizer user dictionary entry. (Tomoko Uchida)
+
Improvements
* LUCENE-8757: When provided with an ExecutorService to run queries across
diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt
index a063249..5890e0d 100644
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@@ -1,5 +1,17 @@
# Apache Lucene Migration Guide
+## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
+
+User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids
+unexpected runtime exceptions or behaviours.
+For example, these entries are not allowed at all and an exception is thrown when loading the dictionary file.
+
+# concatenated "日本経済新聞" does not match the surface form "日経新聞"
+日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
+
+# concatenated "日経新聞" does not match the surface form "日本経済新聞"
+日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞
+
## Analysis factories now have customizable symbolic names (LUCENE-8778) ##
The SPI names for concrete subclasses of TokenizerFactory, TokenFilterFactory, and CharfilterFactory are no longer
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
index eaa5bad..515c1d1 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -104,6 +104,8 @@ public final class UserDictionary implements Dictionary {
long ord = 0;
for (String[] values : featureEntries) {
+ String surface = values[0].replaceAll("\\s", "");
+ String concatenatedSegment = values[1].replaceAll("\\s", "");
String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
String[] readings = values[2].replaceAll(" *", " ").split(" ");
String pos = values[3];
@@ -113,6 +115,12 @@ public final class UserDictionary implements Dictionary {
" - the number of segmentations (" + segmentation.length + ")" +
" does not the match number of readings (" + readings.length + ")");
}
+
+ if (!surface.equals(concatenatedSegment)) {
+ throw new RuntimeException("Illegal user dictionary entry " + values[0] +
+ " - the concatenated segmentation (" + concatenatedSegment + ")" +
+ " does not match the surface form (" + surface + ")");
+ }
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
wordIdAndLength[0] = wordId;
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
index 88a366f..08a21c8 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
+import java.io.StringReader;
import org.apache.lucene.analysis.ja.TestJapaneseTokenizer;
import org.apache.lucene.util.LuceneTestCase;
@@ -77,4 +78,19 @@ public class UserDictionaryTest extends LuceneTestCase {
UserDictionary dictionary = TestJapaneseTokenizer.readDict();
assertNotNull(dictionary);
}
+
+ @Test(expected = RuntimeException.class)
+ public void testReadInvalid1() throws IOException {
+ // the concatenated segment must be the same as the surface form
+ String invalidEntry = "日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞";
+ UserDictionary dictionary = UserDictionary.open(new StringReader(invalidEntry));
+ }
+
+ @Test(expected = RuntimeException.class)
+ public void testReadInvalid2() throws IOException {
+ // the concatenated segment must be the same as the surface form
+ String invalidEntry = "日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞";
+ UserDictionary dictionary = UserDictionary.open(new StringReader(invalidEntry));
+ }
+
}