You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ji...@apache.org on 2019/03/12 08:55:57 UTC
[lucene-solr] branch master updated: LUCENE-8631: The Korean user
dictionary now picks the longest-matching word and discards the other
matches.
This is an automated email from the ASF dual-hosted git repository.
jimczi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new b1f870a LUCENE-8631: The Korean user dictionary now picks the longest-matching word and discards the other matches.
b1f870a is described below
commit b1f870a4164769df62b24af63048aa2f9b21af47
Author: Yeongsu Kim <gr...@naver.com>
AuthorDate: Sat Feb 16 20:37:06 2019 +0900
LUCENE-8631: The Korean user dictionary now picks the longest-matching word and discards the other matches.
---
lucene/CHANGES.txt | 3 +++
.../apache/lucene/analysis/ko/KoreanTokenizer.java | 23 ++++++++++++++++++----
.../lucene/analysis/ko/TestKoreanTokenizer.java | 14 +++++++++++++
.../org/apache/lucene/analysis/ko/userdict.txt | 7 ++++++-
4 files changed, 42 insertions(+), 5 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index af23f0b..27ba1ed 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -39,6 +39,9 @@ Improvements
* LUCENE-8652: SynonymQuery can now deboost the document frequency of each term when
blending the score of the synonym. (Jim Ferenczi)
+* LUCENE-8631: The Korean user dictionary now picks the longest-matching word and discards
+ the other matches. (Yeongsu Kim via Jim Ferenczi)
+
Other
* LUCENE-8680: Refactor EdgeTree#relateTriangle method. (Ignacio Vera)
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
index 8875fd0..af299e5 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -514,6 +514,9 @@ public final class KoreanTokenizer extends Tokenizer {
// Index of the last character of unknown word:
int unknownWordEndIndex = -1;
+ // Maximum posAhead of user word in the entire input
+ int userWordMaxPosAhead = -1;
+
// Advances over each position (character):
while (true) {
@@ -651,6 +654,10 @@ public final class KoreanTokenizer extends Tokenizer {
if (userFST != null) {
userFST.getFirstArc(arc);
int output = 0;
+ int maxPosAhead = 0;
+ int outputMaxPosAhead = 0;
+ int arcFinalOutMaxPosAhead = 0;
+
for(int posAhead=pos;;posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
@@ -661,13 +668,21 @@ public final class KoreanTokenizer extends Tokenizer {
}
output += arc.output.intValue();
if (arc.isFinal()) {
- if (VERBOSE) {
- System.out.println(" USER word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
- }
- add(userDictionary, posData, pos, posAhead+1, output + arc.nextFinalOutput.intValue(), Type.USER);
+ maxPosAhead = posAhead;
+ outputMaxPosAhead = output;
+ arcFinalOutMaxPosAhead = arc.nextFinalOutput.intValue();
anyMatches = true;
}
}
+
+ // Longest matching for user word
+ if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
+ if (VERBOSE) {
+ System.out.println(" USER word " + new String(buffer.get(pos, maxPosAhead + 1)) + " toPos=" + (maxPosAhead + 1));
+ }
+ add(userDictionary, posData, pos, maxPosAhead+1, outputMaxPosAhead+arcFinalOutMaxPosAhead, Type.USER);
+ userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
+ }
}
// TODO: we can be more aggressive about user
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
index 50104ff..85dab4e 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -287,6 +287,20 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
new POS.Tag[]{POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG},
new POS.Tag[]{POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG}
);
+
+ assertAnalyzesTo(analyzer, "대한민국날씨",
+ new String[]{"대한민국날씨"},
+ new int[]{0},
+ new int[]{6},
+ new int[]{1}
+ );
+
+ assertAnalyzesTo(analyzer, "21세기대한민국",
+ new String[]{"21세기대한민국"},
+ new int[]{0},
+ new int[]{8},
+ new int[]{1}
+ );
}
public void testInterpunct() throws IOException {
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt
index 63c1c3a..045b64e 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt
@@ -2,4 +2,9 @@
c++
C샤프
세종
-세종시 세종 시
\ No newline at end of file
+세종시 세종 시
+대한민국날씨
+대한민국
+날씨
+21세기대한민국
+세기
\ No newline at end of file