Posted to commits@lucene.apache.org by ji...@apache.org on 2019/03/12 08:55:57 UTC

[lucene-solr] branch master updated: LUCENE-8631: The Korean user dictionary now picks the longest-matching word and discards the other matches.

This is an automated email from the ASF dual-hosted git repository.

jimczi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new b1f870a  LUCENE-8631: The Korean user dictionary now picks the longest-matching word and discards the other matches.
b1f870a is described below

commit b1f870a4164769df62b24af63048aa2f9b21af47
Author: Yeongsu Kim <gr...@naver.com>
AuthorDate: Sat Feb 16 20:37:06 2019 +0900

    LUCENE-8631: The Korean user dictionary now picks the longest-matching word and discards the other matches.
---
 lucene/CHANGES.txt                                 |  3 +++
 .../apache/lucene/analysis/ko/KoreanTokenizer.java | 23 ++++++++++++++++++----
 .../lucene/analysis/ko/TestKoreanTokenizer.java    | 14 +++++++++++++
 .../org/apache/lucene/analysis/ko/userdict.txt     |  7 ++++++-
 4 files changed, 42 insertions(+), 5 deletions(-)
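
For context, here is a minimal sketch (not part of this commit) of the user-facing behaviour the change targets. The overlapping dictionary entries and the sample text are taken from the test userdict.txt in the diff below; the wiring through KoreanAnalyzer, UserDictionary.open(Reader) and DecompoundMode.NONE is the standard public nori API, but the demo class itself is illustrative only:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ko.KoreanAnalyzer;
import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class UserDictLongestMatchDemo {
  public static void main(String[] args) throws IOException {
    // Three overlapping user entries; the input below matches all of them at position 0.
    UserDictionary userDict = UserDictionary.open(new StringReader("대한민국날씨\n대한민국\n날씨\n"));

    try (KoreanAnalyzer analyzer = new KoreanAnalyzer(userDict, DecompoundMode.NONE,
             KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, false);
         TokenStream ts = analyzer.tokenStream("field", "대한민국날씨")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // With this change the longest user-dictionary match ("대한민국날씨") is the only
        // USER token registered; the shorter overlapping entries are discarded.
        System.out.println(term);
      }
      ts.end();
    }
  }
}

Previously every user-dictionary match at a position was added to the lattice, so a shorter entry could end up on the best path; the new test cases below assert the single-token output.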

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index af23f0b..27ba1ed 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -39,6 +39,9 @@ Improvements
 * LUCENE-8652: SynonymQuery can now deboost the document frequency of each term when
   blending the score of the synonym. (Jim Ferenczi)
 
+* LUCENE-8631: The Korean user dictionary now picks the longest-matching word and discards
+  the other matches. (Yeongsu Kim via Jim Ferenczi)
+
 Other
 
 * LUCENE-8680: Refactor EdgeTree#relateTriangle method. (Ignacio Vera)
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
index 8875fd0..af299e5 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -514,6 +514,9 @@ public final class KoreanTokenizer extends Tokenizer {
     // Index of the last character of unknown word:
     int unknownWordEndIndex = -1;
 
+    // Maximum posAhead of user word in the entire input
+    int userWordMaxPosAhead = -1;
+
     // Advances over each position (character):
     while (true) {
 
@@ -651,6 +654,10 @@ public final class KoreanTokenizer extends Tokenizer {
       if (userFST != null) {
         userFST.getFirstArc(arc);
         int output = 0;
+        int maxPosAhead = 0;
+        int outputMaxPosAhead = 0;
+        int arcFinalOutMaxPosAhead = 0;
+
         for(int posAhead=pos;;posAhead++) {
           final int ch = buffer.get(posAhead);
           if (ch == -1) {
@@ -661,13 +668,21 @@ public final class KoreanTokenizer extends Tokenizer {
           }
           output += arc.output.intValue();
           if (arc.isFinal()) {
-            if (VERBOSE) {
-              System.out.println("    USER word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
-            }
-            add(userDictionary, posData, pos, posAhead+1, output + arc.nextFinalOutput.intValue(), Type.USER);
+            maxPosAhead = posAhead;
+            outputMaxPosAhead = output;
+            arcFinalOutMaxPosAhead = arc.nextFinalOutput.intValue();
             anyMatches = true;
           }
         }
+
+        // Longest matching for user word
+        if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
+          if (VERBOSE) {
+            System.out.println("    USER word " + new String(buffer.get(pos, maxPosAhead - pos + 1)) + " toPos=" + (maxPosAhead + 1));
+          }
+          add(userDictionary, posData, pos, maxPosAhead+1, outputMaxPosAhead+arcFinalOutMaxPosAhead, Type.USER);
+          userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
+        } 
       }
 
       // TODO: we can be more aggressive about user
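
The hunk above is the heart of the patch: instead of calling add() for every final arc found while scanning forward from pos, the loop now only records the furthest position at which a user word ends (maxPosAhead, together with its FST outputs) and registers a single USER entry afterwards, and only if that match reaches further than any user word already added (userWordMaxPosAhead). The following self-contained sketch shows the same longest-match idea in isolation; it deliberately uses a plain Set lookup instead of the user FST and omits the cross-position userWordMaxPosAhead bookkeeping, and the class and method names are illustrative only:

import java.util.Set;

final class LongestUserMatch {
  /**
   * Returns the end offset (exclusive) of the longest user word starting at pos,
   * or -1 if none matches. Analogous to maxPosAhead in the patch: shorter matches
   * starting at the same position are seen during the scan but never emitted.
   */
  static int longestMatchEnd(String input, int pos, Set<String> userWords, int maxWordLen) {
    int maxEnd = -1;                                  // analogous to maxPosAhead
    int limit = Math.min(input.length(), pos + maxWordLen);
    for (int end = pos + 1; end <= limit; end++) {
      if (userWords.contains(input.substring(pos, end))) {
        maxEnd = end;                                 // keep only the longest match so far
      }
    }
    return maxEnd;                                    // caller adds exactly one USER token
  }

  public static void main(String[] args) {
    Set<String> dict = Set.of("대한민국", "날씨", "대한민국날씨");
    // All three entries match at position 0, but only the longest (ending at offset 6) wins.
    System.out.println(longestMatchEnd("대한민국날씨", 0, dict, 10));   // prints 6
  }
}
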
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
index 50104ff..85dab4e 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -287,6 +287,20 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
         new POS.Tag[]{POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG},
         new POS.Tag[]{POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG}
     );
+
+    assertAnalyzesTo(analyzer, "대한민국날씨",
+        new String[]{"대한민국날씨"},
+        new int[]{0},
+        new int[]{6},
+        new int[]{1}
+    );
+
+    assertAnalyzesTo(analyzer, "21세기대한민국",
+        new String[]{"21세기대한민국"},
+        new int[]{0},
+        new int[]{8},
+        new int[]{1}
+    );
   }
 
   public void testInterpunct() throws IOException {
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt
index 63c1c3a..045b64e 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt
@@ -2,4 +2,9 @@
 c++
 C샤프
 세종
-세종시 세종 시
\ No newline at end of file
+세종시 세종 시
+대한민국날씨
+대한민국
+날씨
+21세기대한민국
+세기
\ No newline at end of file