You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/12/07 09:57:24 UTC
[09/37] lucene-solr:jira/http2: LUCENE-8548: The KoreanTokenizer no
longer splits unknown words on combining diacritics and detects script
boundaries more accurately with Character#UnicodeScript#of.
LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and
detects script boundaries more accurately with Character#UnicodeScript#of.
Signed-off-by: Jim Ferenczi <ji...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/643ffc6f
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/643ffc6f
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/643ffc6f
Branch: refs/heads/jira/http2
Commit: 643ffc6f9fb3f7368d48975d750f75f8a66783e2
Parents: b6b9f95
Author: Christophe Bismuth <ch...@gmail.com>
Authored: Wed Nov 21 13:58:44 2018 +0100
Committer: Jim Ferenczi <ji...@apache.org>
Committed: Mon Dec 3 10:57:22 2018 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 6 +++
.../lucene/analysis/ko/KoreanTokenizer.java | 51 ++++++++++++++++----
.../lucene/analysis/ko/TestKoreanAnalyzer.java | 2 +-
.../lucene/analysis/ko/TestKoreanTokenizer.java | 38 +++++++++++++++
4 files changed, 87 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/643ffc6f/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 469d6fb..6b001b9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -214,6 +214,12 @@ Build
* LUCENE-8537: ant test command fails under lucene/tools (Peter Somogyi)
+Bug fixes:
+
+* LUCENE-8548: The KoreanTokenizer no longer splits unknown words on combining diacritics and
+ detects script boundaries more accurately with Character#UnicodeScript#of.
+ (Christophe Bismuth, Jim Ferenczi)
+
New Features
* LUCENE-8026: ExitableDirectoryReader may now time out queries that run on
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/643ffc6f/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
index ab3205f..012352c 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -43,6 +43,8 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
+import static java.lang.Character.UnicodeScript;
+
/**
* Tokenizer for Korean that uses morphological analysis.
* <p>
@@ -718,27 +720,42 @@ public final class KoreanTokenizer extends Tokenizer {
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
// Find unknown match:
- final int characterId = characterDefinition.getCharacterClass(firstCharacter);
- final boolean isPunct = isPunctuation(firstCharacter);
-
+ int characterId = characterDefinition.getCharacterClass(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
if (!characterDefinition.isGroup(firstCharacter)) {
unknownWordLength = 1;
} else {
- // Extract unknown word. Characters with the same character class are considered to be part of unknown word
+ // Extract unknown word. Characters with the same script are considered to be part of unknown word
unknownWordLength = 1;
+ UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
+ final boolean isPunct = isPunctuation(firstCharacter);
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
- final int ch = buffer.get(posAhead);
- if (ch == -1) {
+ int next = buffer.get(posAhead);
+ if (next == -1) {
break;
}
- if (characterId == characterDefinition.getCharacterClass((char) ch) &&
- isPunctuation((char) ch) == isPunct) {
+ char ch = (char) next;
+ int chType = Character.getType(ch);
+ UnicodeScript sc = UnicodeScript.of(next);
+ boolean sameScript = isSameScript(scriptCode, sc)
+ // Non-spacing marks inherit the script of their base character,
+ // following recommendations from UTR #24.
+ || chType == Character.NON_SPACING_MARK;
+
+ if (sameScript
+ && isPunctuation(ch, chType) == isPunct
+ && characterDefinition.isGroup(ch)) {
unknownWordLength++;
} else {
break;
}
+ // Update the script code and character class if the original script
+ // is Inherited or Common.
+ if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
+ scriptCode = sc;
+ characterId = characterDefinition.getCharacterClass(ch);
+ }
}
}
@@ -932,11 +949,15 @@ public final class KoreanTokenizer extends Tokenizer {
}
private static boolean isPunctuation(char ch) {
+ return isPunctuation(ch, Character.getType(ch));
+ }
+
+ private static boolean isPunctuation(char ch, int cid) {
// special case for Hangul Letter Araea (interpunct)
if (ch == 0x318D) {
return true;
}
- switch(Character.getType(ch)) {
+ switch(cid) {
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
@@ -958,4 +979,16 @@ public final class KoreanTokenizer extends Tokenizer {
return false;
}
}
+
+ private static boolean isCommonOrInherited(UnicodeScript script) {
+ return script == UnicodeScript.INHERITED ||
+ script == UnicodeScript.COMMON;
+ }
+
+ /** Determine if two scripts are compatible. */
+ private static boolean isSameScript(UnicodeScript scriptOne, UnicodeScript scriptTwo) {
+ return scriptOne == scriptTwo
+ || isCommonOrInherited(scriptOne)
+ || isCommonOrInherited(scriptTwo);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/643ffc6f/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
index fd574ce..a56047a 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
@@ -106,4 +106,4 @@ public class TestKoreanAnalyzer extends BaseTokenStreamTestCase {
new int[]{1, 1, 1}
);
}
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/643ffc6f/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
index 7c204fa..50104ff 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -328,6 +328,44 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
analyzer.close();
}
+ public void testCombining() throws IOException {
+ assertAnalyzesTo(analyzer, "Ба̀лтичко мо̑ре",
+ new String[]{"Ба̀лтичко", "мо̑ре"},
+ new int[]{0, 10},
+ new int[]{9, 15},
+ new int[]{1, 1}
+ );
+ assertPartsOfSpeech(analyzer, "Ба̀лтичко мо̑ре",
+ new POS.Type[]{POS.Type.MORPHEME, POS.Type.MORPHEME},
+ new POS.Tag[]{POS.Tag.SL, POS.Tag.SL},
+ new POS.Tag[]{POS.Tag.SL, POS.Tag.SL}
+ );
+
+ assertAnalyzesTo(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
+ new String[]{"ka̠k̚t͡ɕ͈a̠k̚"},
+ new int[]{0},
+ new int[]{13},
+ new int[]{1}
+ );
+ assertPartsOfSpeech(analyzer, "ka̠k̚t͡ɕ͈a̠k̚",
+ new POS.Type[]{POS.Type.MORPHEME},
+ new POS.Tag[]{POS.Tag.SL},
+ new POS.Tag[]{POS.Tag.SL}
+ );
+
+ assertAnalyzesTo(analyzer, "εἰμί",
+ new String[]{"εἰμί"},
+ new int[]{0},
+ new int[]{4},
+ new int[]{1}
+ );
+ assertPartsOfSpeech(analyzer, "εἰμί",
+ new POS.Type[]{POS.Type.MORPHEME},
+ new POS.Tag[]{POS.Tag.SL},
+ new POS.Tag[]{POS.Tag.SL}
+ );
+ }
+
private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);