You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ji...@apache.org on 2018/10/26 08:32:46 UTC

lucene-solr:branch_7x: LUCENE-8524: Add the Hangul Letter Araea (interpunct) as a separator in Nori's tokenizer. This change also removes empty terms and trims surface forms in Nori's Korean dictionary.

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x 329252fb9 -> 403babcfd


LUCENE-8524: Add the Hangul Letter Araea (interpunct) as a separator in Nori's tokenizer.
This change also removes empty terms and trims surface forms in Nori's Korean dictionary.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/403babcf
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/403babcf
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/403babcf

Branch: refs/heads/branch_7x
Commit: 403babcfd6d024affc8afad00f8fb78c07053e82
Parents: 329252f
Author: Jim Ferenczi <ji...@apache.org>
Authored: Fri Oct 26 10:28:37 2018 +0200
Committer: Jim Ferenczi <ji...@apache.org>
Committed: Fri Oct 26 10:31:34 2018 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   3 ++
 .../lucene/analysis/ko/KoreanTokenizer.java     |   4 +++
 .../ko/dict/TokenInfoDictionary$buffer.dat      | Bin 7245625 -> 7245613 bytes
 .../ko/dict/TokenInfoDictionary$fst.dat         | Bin 5640925 -> 5640903 bytes
 .../ko/dict/TokenInfoDictionary$targetMap.dat   | Bin 811783 -> 811783 bytes
 .../lucene/analysis/ko/TestKoreanTokenizer.java |   8 +++++
 .../ko/dict/TestTokenInfoDictionary.java        |   4 +++
 .../ko/util/BinaryDictionaryWriter.java         |  29 ++++++++++---------
 .../ko/util/TokenInfoDictionaryBuilder.java     |  17 ++++++-----
 9 files changed, 44 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a283880..8a748f3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -26,6 +26,9 @@ Bug fixes:
   in the graph if the slop is greater than 0. Span queries cannot be used in this case because
   they don't handle slop the same way than phrase queries. (Steve Rowe, Uwe Schindler, Jim Ferenczi)
 
+* LUCENE-8524: Add the Hangul Letter Araea (interpunct) as a separator in Nori's tokenizer.
+  This change also removes empty terms and trim surface form in Nori's Korean dictionary. (Trey Jones, Jim Ferenczi)
+
 New Features
 
 * LUCENE-8496: Selective indexing - modify BKDReader/BKDWriter to allow users

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
index 822853b..ab3205f 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -932,6 +932,10 @@ public final class KoreanTokenizer extends Tokenizer {
   }
 
   private static boolean isPunctuation(char ch) {
+    // special case for Hangul Letter Araea (interpunct)
+    if (ch == 0x318D) {
+      return true;
+    }
     switch(Character.getType(ch)) {
       case Character.SPACE_SEPARATOR:
       case Character.LINE_SEPARATOR:

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat
index 6958664..d7cc866 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat
index 17b531f..fa0cb32 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$fst.dat differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat
index 7c0823c..4661bf8 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
index 0471e5f..7c204fa 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -289,6 +289,14 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
     );
   }
 
+  public void testInterpunct() throws IOException {
+    assertAnalyzesTo(analyzer, "도로ㆍ지반ㆍ수자원ㆍ건설환경ㆍ건축ㆍ화재설비연구",
+        new String[]{"도로", "지반", "수자원", "건설", "환경", "건축", "화재", "설비", "연구"},
+        new int[]{0, 3, 6, 10, 12, 15, 18, 20, 22},
+        new int[]{2, 5, 9, 12, 14, 17, 20, 22, 24},
+        new int[]{1, 1, 1, 1,   1,  1,  1,  1,  1}
+    );
+  }
 
   /** blast some random strings through the tokenizer */
   public void testRandomStrings() throws Exception {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
index d278841..3457de1 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestTokenInfoDictionary.java
@@ -48,6 +48,8 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
         chars[i] = (char)input.ints[input.offset+i];
       }
       String surfaceForm = new String(chars);
+      assertFalse(surfaceForm.isEmpty());
+      assertEquals(surfaceForm.trim(), surfaceForm);
       assertTrue(UnicodeUtil.validUTF16String(surfaceForm));
       
       Long output = mapping.output;
@@ -96,6 +98,8 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
             int offset = 0;
             for (Dictionary.Morpheme morph : decompound) {
               assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm));
+              assertFalse(morph.surfaceForm.isEmpty());
+              assertEquals(morph.surfaceForm.trim(), morph.surfaceForm);
               if (type != POS.Type.INFLECT) {
                 assertEquals(morph.surfaceForm, surfaceForm.substring(offset, offset + morph.surfaceForm.length()));
                 offset += morph.surfaceForm.length();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
index 35c16ae..b77d1ba 100644
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
@@ -26,6 +26,7 @@ import java.nio.channels.Channels;
 import java.nio.channels.WritableByteChannel;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 
 import org.apache.lucene.analysis.ko.POS;
 import org.apache.lucene.analysis.ko.dict.Dictionary;
@@ -109,23 +110,23 @@ public abstract class BinaryDictionaryWriter {
     assert existing == null || existing.equals(fullPOSData);
     posDict.set(leftId, fullPOSData);
 
-    final Dictionary.Morpheme[] morphemes;
+    final List<Dictionary.Morpheme> morphemes = new ArrayList<>();
     // true if the POS and decompounds of the token are all the same.
     boolean hasSinglePOS = (leftPOS == rightPOS);
     if (posType != POS.Type.MORPHEME && expression.length() > 0) {
       String[] exprTokens = expression.split("\\+");
-      morphemes = new Dictionary.Morpheme[exprTokens.length];
       for (int i = 0; i < exprTokens.length; i++) {
         String[] tokenSplit = exprTokens[i].split("\\/");
         assert tokenSplit.length == 3;
-        POS.Tag exprTag = POS.resolveTag(tokenSplit[1]);
-        morphemes[i] = new Dictionary.Morpheme(exprTag, tokenSplit[0]);
-        if (leftPOS != exprTag) {
-          hasSinglePOS = false;
+        String surfaceForm = tokenSplit[0].trim();
+        if (surfaceForm.isEmpty() == false) {
+          POS.Tag exprTag = POS.resolveTag(tokenSplit[1]);
+          morphemes.add(new Dictionary.Morpheme(exprTag, tokenSplit[0]));
+          if (leftPOS != exprTag) {
+            hasSinglePOS = false;
+          }
         }
       }
-    } else {
-      morphemes = new Dictionary.Morpheme[0];
     }
 
     int flags = 0;
@@ -151,17 +152,17 @@ public abstract class BinaryDictionaryWriter {
       if (hasSinglePOS == false) {
         buffer.put((byte) rightPOS.ordinal());
       }
-      buffer.put((byte) morphemes.length);
+      buffer.put((byte) morphemes.size());
       int compoundOffset = 0;
-      for (int i = 0; i < morphemes.length; i++) {
+      for (Dictionary.Morpheme morpheme : morphemes) {
         if (hasSinglePOS == false) {
-          buffer.put((byte) morphemes[i].posTag.ordinal());
+          buffer.put((byte) morpheme.posTag.ordinal());
         }
         if (posType != POS.Type.INFLECT) {
-          buffer.put((byte) morphemes[i].surfaceForm.length());
-          compoundOffset += morphemes[i].surfaceForm.length();
+          buffer.put((byte) morpheme.surfaceForm.length());
+          compoundOffset += morpheme.surfaceForm.length();
         } else {
-          writeString(morphemes[i].surfaceForm);
+          writeString(morpheme.surfaceForm);
         }
         assert compoundOffset <= entry[0].length() : Arrays.toString(entry);
       }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/403babcf/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
index de60daa..d5fb73f 100644
--- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
@@ -116,6 +116,10 @@ public class TokenInfoDictionaryBuilder {
 
     // build tokeninfo dictionary
     for (String[] entry : lines) {
+      String surfaceForm = entry[0].trim();
+      if (surfaceForm.isEmpty()) {
+        continue;
+      }
       int next = dictionary.put(entry);
 
       if(next == offset){
@@ -123,15 +127,14 @@ public class TokenInfoDictionaryBuilder {
         continue;
       }
 
-      String token = entry[0];
-      if (!token.equals(lastValue)) {
+      if (!surfaceForm.equals(lastValue)) {
         // new word to add to fst
         ord++;
-        lastValue = token;
-        scratch.grow(token.length());
-        scratch.setLength(token.length());
-        for (int i = 0; i < token.length(); i++) {
-          scratch.setIntAt(i, (int) token.charAt(i));
+        lastValue = surfaceForm;
+        scratch.grow(surfaceForm.length());
+        scratch.setLength(surfaceForm.length());
+        for (int i = 0; i < surfaceForm.length(); i++) {
+          scratch.setIntAt(i, (int) surfaceForm.charAt(i));
         }
         fstBuilder.add(scratch.get(), ord);
       }