You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ji...@apache.org on 2018/05/23 14:12:51 UTC

lucene-solr:master: LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.

Repository: lucene-solr
Updated Branches:
  refs/heads/master 14a7cd115 -> 55858d7ba


LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/55858d7b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/55858d7b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/55858d7b

Branch: refs/heads/master
Commit: 55858d7ba72f857ded79035430855e511a8e319d
Parents: 14a7cd1
Author: Jim Ferenczi <ji...@apache.org>
Authored: Wed May 23 16:12:43 2018 +0200
Committer: Jim Ferenczi <ji...@apache.org>
Committed: Wed May 23 16:12:43 2018 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 ++
 .../lucene/analysis/cn/smart/CharType.java      |  5 ++
 .../lucene/analysis/cn/smart/Utility.java       |  4 ++
 .../analysis/cn/smart/hhmm/HHMMSegmenter.java   | 18 +++++--
 .../cn/smart/TestSmartChineseAnalyzer.java      | 53 ++++++++++++++++----
 5 files changed, 68 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/55858d7b/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ba282d1..54a8fba 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -232,6 +232,9 @@ Bug Fixes
 * LUCENE-8328: Ensure ReadersAndUpdates consistently executes under lock.
   (Nhat Nguyen via Simon Willnauer)
 
+* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
+  (chengpohi via Jim Ferenczi)
+
 Other
 
 * LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/55858d7b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
index 4ad5877..d576809 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
@@ -62,4 +62,9 @@ public class CharType {
    */
   public final static int OTHER = 7;
 
+  /**
+   * Surrogate character
+   */
+  public final static int SURROGATE = 8;
+
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/55858d7b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
index 81ca52e..1d6eeb9 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.cn.smart;
 
 import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
 
+import static java.lang.Character.isSurrogate;
+
 /**
  * SmartChineseAnalyzer utility constants and methods
  * @lucene.experimental
@@ -152,6 +154,8 @@ public class Utility {
    * @see CharType
    */
   public static int getCharType(char ch) {
+    if (isSurrogate(ch))
+      return CharType.SURROGATE;
     // Most (but not all!) of these are Han Ideographic Characters
     if (ch >= 0x4E00 && ch <= 0x9FA5)
       return CharType.HANZI;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/55858d7b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
index bd69190..4d4cd44 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
@@ -21,7 +21,6 @@ import java.util.List;
 import org.apache.lucene.analysis.cn.smart.CharType;
 import org.apache.lucene.analysis.cn.smart.Utility;
 import org.apache.lucene.analysis.cn.smart.WordType;
-import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
 
 /**
  * Finds the optimal segmentation of a sentence into Chinese words
@@ -33,7 +32,7 @@ public class HHMMSegmenter {
 
   /**
    * Create the {@link SegGraph} for a sentence.
-   * 
+   *
    * @param sentence input sentence, without start and end markers
    * @return {@link SegGraph} corresponding to the input sentence.
    */
@@ -57,11 +56,20 @@ public class HHMMSegmenter {
         case CharType.SPACE_LIKE:
           i++;
           break;
+        case CharType.SURROGATE:
+          int state = Character.codePointAt(sentence, i);
+          int count = Character.charCount(state);
+          charArray = new char[count];
+          sentence.getChars(i, i + count, charArray, 0);
+          token = new SegToken(charArray, i, i + count, WordType.CHINESE_WORD, 0);
+          segGraph.addToken(token);
+          i += count;
+          break;
         case CharType.HANZI:
           j = i + 1;
           wordBuf.delete(0, wordBuf.length());
-          // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, 
-          // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will 
+          // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
+          // it will store that single Chinese character (Hanzi) in the SegGraph.  Otherwise, it will
           // cause word division.
           wordBuf.append(sentence.charAt(i));
           charArray = new char[] { sentence.charAt(i) };
@@ -175,7 +183,7 @@ public class HHMMSegmenter {
 
   /**
    * Get the character types for every character in a sentence.
-   * 
+   *
    * @see Utility#getCharType(char)
    * @param sentence input sentence
    * @return array of character types corresponding to character positions in the sentence

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/55858d7b/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java b/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
index 6460fbd..93db8a3 100644
--- a/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
+++ b/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
@@ -16,13 +16,16 @@
  */
 package org.apache.lucene.analysis.cn.smart;
 
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.IOUtils;
 
 public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
-  
+
   public void testChineseStopWordsDefault() throws Exception {
     Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
     String sentence = "我购买了道具和服装。";
@@ -46,7 +49,37 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
     assertAnalyzesTo(ca, sentence, result);
     ca.close();
   }
-  
+
+  /*
+   * Tests that the smartcn HHMMSegmenter correctly handles surrogate pair characters.
+   */
+  public void testSurrogatePairCharacter() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(); /* will load stopwords */
+    String sentence =
+        Stream.of(
+                "\uD872\uDF3B",
+                "\uD872\uDF4A",
+                "\uD872\uDF73",
+                "\uD872\uDF5B",
+                "\u9FCF",
+                "\uD86D\uDFFC",
+                "\uD872\uDF2D",
+                "\u9FD4")
+            .collect(Collectors.joining());
+    String result[] = {
+      "\uD872\uDF3B",
+      "\uD872\uDF4A",
+      "\uD872\uDF73",
+      "\uD872\uDF5B",
+      "\u9FCF",
+      "\uD86D\uDFFC",
+      "\uD872\uDF2D",
+      "\u9FD4"
+    };
+    assertAnalyzesTo(ca, sentence, result);
+    ca.close();
+  }
+
   /*
    * This test is the same as the above, except using an ideographic space as a separator.
    * This tests to ensure the stopwords are working correctly.
@@ -166,7 +199,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
       new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
     analyzer.close();
   }
-  
+
   public void testOffsets() throws Exception {
     Analyzer analyzer = new SmartChineseAnalyzer(true);
     assertAnalyzesTo(analyzer, "我购买了道具和服装",
@@ -175,10 +208,10 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
         new int[] { 1, 3, 4, 6, 7, 9 });
     analyzer.close();
   }
-  
+
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new SmartChineseAnalyzer();
-    assertAnalyzesTo(a, "我购买 Tests 了道具和服装", 
+    assertAnalyzesTo(a, "我购买 Tests 了道具和服装",
         new String[] { "我", "购买", "test", "了", "道具", "和", "服装"},
         new int[] { 0, 1, 4, 10, 11, 13, 14 },
         new int[] { 1, 3, 9, 11, 13, 14, 16 });
@@ -188,7 +221,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
         new int[] { 1, 3, 4, 6, 7, 9 });
     a.close();
   }
-  
+
   // LUCENE-3026
   public void testLargeDocument() throws Exception {
     StringBuilder sb = new StringBuilder();
@@ -203,7 +236,7 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
       stream.end();
     }
   }
-  
+
   // LUCENE-3026
   public void testLargeSentence() throws Exception {
     StringBuilder sb = new StringBuilder();
@@ -218,14 +251,14 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
       stream.end();
     }
   }
-  
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Analyzer analyzer = new SmartChineseAnalyzer();
     checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
     analyzer.close();
   }
-  
+
   /** blast some random large strings through the analyzer */
   public void testRandomHugeStrings() throws Exception {
     Analyzer analyzer = new SmartChineseAnalyzer();