Posted to commits@lucene.apache.org by jp...@apache.org on 2018/02/20 14:05:46 UTC

[1/6] lucene-solr:master: Revert "LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens"

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x d5a01e026 -> 9f02097e2
  refs/heads/master 4bfcbc5c6 -> cc1efdb4a


Revert "LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens"

This reverts commit 972df6c69de494b8a4f59e4e0d4de241d4ca6a80.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/fafbb263
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/fafbb263
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/fafbb263

Branch: refs/heads/master
Commit: fafbb2635d9931f827410fa7fc5cec859bd6c925
Parents: 4bfcbc5
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Feb 20 14:39:53 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Feb 20 14:39:53 2018 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   2 -
 .../icu/segmentation/BreakIteratorWrapper.java  | 190 +++++++++++++------
 .../segmentation/CompositeBreakIterator.java    |   2 +-
 .../segmentation/DefaultICUTokenizerConfig.java |  18 +-
 .../icu/segmentation/ICUTokenizerConfig.java    |   9 +-
 .../icu/segmentation/ICUTokenizerFactory.java   |   4 +-
 .../icu/segmentation/TestICUTokenizer.java      |  99 +++-------
 .../icu/segmentation/TestICUTokenizerCJK.java   |   9 -
 .../analysis/standard/StandardTokenizer.java    |   5 +-
 9 files changed, 168 insertions(+), 170 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b9d333f..d24a910 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -136,8 +136,6 @@ Improvements
   position sensitive (e.g. part of a phrase) by having an accurate freq.
   (David Smiley)
 
-* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
-
 * LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
   (Ere Maijala)
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
index 9e5050d..d8ecb77 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
@@ -16,84 +16,152 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
+
+import java.text.CharacterIterator;
+
 import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UTF16;
-import com.ibm.icu.text.UnicodeSet;
 
 /**
- * Wraps RuleBasedBreakIterator, making object reuse convenient and 
- * emitting a rule status for emoji sequences.
+ * Contain all the issues surrounding BreakIterators in ICU in one place.
+ * Basically this boils down to the fact that they aren't very friendly to any
+ * sort of OO design.
+ * <p>
+ * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
+ * BreakIterator from RuleBasedBreakIterator
+ * <p>
+ * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
+ * doesn't actually behave as a subclass: it always returns 0 for
+ * getRuleStatus(): 
+ * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
+ * tags
  * @lucene.experimental
  */
-final class BreakIteratorWrapper {
-  private final CharArrayIterator textIterator = new CharArrayIterator();
-  private final RuleBasedBreakIterator rbbi;
-  private char text[];
-  private int start;
-  private int status;
-  
-  BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
-    this.rbbi = rbbi;
-  }
-  
-  int current() {
-    return rbbi.current();
-  }
+abstract class BreakIteratorWrapper {
+  protected final CharArrayIterator textIterator = new CharArrayIterator();
+  protected char text[];
+  protected int start;
+  protected int length;
+
+  abstract int next();
+  abstract int current();
+  abstract int getRuleStatus();
+  abstract void setText(CharacterIterator text);
 
-  int getRuleStatus() {
-    return status;
+  void setText(char text[], int start, int length) {
+    this.text = text;
+    this.start = start;
+    this.length = length;
+    textIterator.setText(text, start, length);
+    setText(textIterator);
   }
 
-  int next() {
-    int current = rbbi.current();
-    int next = rbbi.next();
-    status = calcStatus(current, next);
-    return next;
+  /**
+   * If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
+   * any other BreakIterator, the rulestatus method is not available, so treat
+   * it like a generic BreakIterator.
+   */
+  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
+    if (breakIterator instanceof RuleBasedBreakIterator)
+      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
+    else
+      return new BIWrapper(breakIterator);
   }
-  
-  /** Returns current rule status for the text between breaks. (determines token type) */
-  private int calcStatus(int current, int next) {
-    // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
-    // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
-    if (next != BreakIterator.DONE && isEmoji(current, next)) {
-      return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
-    } else {
+
+  /**
+   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
+   * a DictionaryBasedBreakIterator) behaves correctly.
+   */
+  static final class RBBIWrapper extends BreakIteratorWrapper {
+    private final RuleBasedBreakIterator rbbi;
+
+    RBBIWrapper(RuleBasedBreakIterator rbbi) {
+      this.rbbi = rbbi;
+    }
+
+    @Override
+    int current() {
+      return rbbi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
       return rbbi.getRuleStatus();
     }
+
+    @Override
+    int next() {
+      return rbbi.next();
+    }
+
+    @Override
+    void setText(CharacterIterator text) {
+      rbbi.setText(text);
+    }
   }
-  
-  // See unicode doc L2/16-315 and also the RBBI rules for rationale.
-  // we don't include regional indicators here, because they aren't ambiguous for tagging,
-  // they need only be treated special for segmentation.
-  static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
-
-  /** Returns true if the current text represents emoji character or sequence */
-  private boolean isEmoji(int current, int next) {
-    int begin = start + current;
-    int end = start + next;
-    int codepoint = UTF16.charAt(text, 0, end, begin);
-    // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
-    if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
-      if (EMOJI_RK.contains(codepoint)) {
-        // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
-        // an emoji presentation selector or keycap follows.
-        int trailer = begin + Character.charCount(codepoint);
-        return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
-      } else {
-        return true;
+
+  /**
+   * Generic BreakIterator wrapper: Either the rulestatus method is not
+   * available or always returns 0. Calculate a rulestatus here so it behaves
+   * like RuleBasedBreakIterator.
+   * 
+   * Note: This is slower than RuleBasedBreakIterator.
+   */
+  static final class BIWrapper extends BreakIteratorWrapper {
+    private final BreakIterator bi;
+    private int status;
+
+    BIWrapper(BreakIterator bi) {
+      this.bi = bi;
+    }
+
+    @Override
+    int current() {
+      return bi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
+      return status;
+    }
+
+    @Override
+    int next() {
+      int current = bi.current();
+      int next = bi.next();
+      status = calcStatus(current, next);
+      return next;
+    }
+
+    private int calcStatus(int current, int next) {
+      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
+        return RuleBasedBreakIterator.WORD_NONE;
+
+      int begin = start + current;
+      int end = start + next;
+
+      int codepoint;
+      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
+        codepoint = UTF16.charAt(text, 0, end, begin);
+
+        if (UCharacter.isDigit(codepoint))
+          return RuleBasedBreakIterator.WORD_NUMBER;
+        else if (UCharacter.isLetter(codepoint)) {
+          // TODO: try to separately specify ideographic, kana? 
+          // [currently all bundled as letter for this case]
+          return RuleBasedBreakIterator.WORD_LETTER;
+        }
       }
+
+      return RuleBasedBreakIterator.WORD_NONE;
     }
-    return false;
-  }
 
-  void setText(char text[], int start, int length) {
-    this.text = text;
-    this.start = start;
-    textIterator.setText(text, start, length);
-    rbbi.setText(textIterator);
-    status = RuleBasedBreakIterator.WORD_NONE;
+    @Override
+    void setText(CharacterIterator text) {
+      bi.setText(text);
+      status = RuleBasedBreakIterator.WORD_NONE;
+    }
   }
 }

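For readers skimming the restored wrapper: wrap() above picks the fast RBBIWrapper path whenever ICU hands back a RuleBasedBreakIterator (the common case), and otherwise falls back to BIWrapper, which recomputes a status itself. Below is a minimal sketch of the rule-status mechanism both paths rely on, using only public ICU4J API; the class name, sample text, and ULocale.ROOT are illustrative, not part of the patch.

    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.text.RuleBasedBreakIterator;
    import com.ibm.icu.util.ULocale;

    public class RuleStatusDemo {
      public static void main(String[] args) {
        // Word instances are rule-based, so wrap() would take the RBBIWrapper path.
        BreakIterator bi = BreakIterator.getWordInstance(ULocale.ROOT);
        RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator) bi;

        String text = "Lucene 7.3";
        rbbi.setText(text);
        int start = rbbi.first();
        for (int end = rbbi.next(); end != BreakIterator.DONE; start = end, end = rbbi.next()) {
          // getRuleStatus() reports the tag of the rule that matched the last
          // boundary: 0 (WORD_NONE) for the space, the WORD_LETTER range for
          // "Lucene", the WORD_NUMBER range for "7.3".
          System.out.println("'" + text.substring(start, end) + "' -> " + rbbi.getRuleStatus());
        }
      }
    }
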
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
index 3cb39ed..096eada 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
@@ -123,7 +123,7 @@ final class CompositeBreakIterator {
   
   private BreakIteratorWrapper getBreakIterator(int scriptCode) {
     if (wordBreakers[scriptCode] == null)
-      wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode));
+      wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
     return wordBreakers[scriptCode];
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
index 10e6c67..50a6b4c 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
@@ -52,8 +52,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   /** Token type for words that appear to be numbers */
   public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
-  /** Token type for words that appear to be emoji sequences */
-  public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI];
   
   /*
    * the default breakiterators in use. these can be expensive to
@@ -67,9 +65,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
 
   // the same as ROOT, except no dictionary segmentation for cjk
-  private static final RuleBasedBreakIterator defaultBreakIterator = 
+  private static final BreakIterator defaultBreakIterator = 
     readBreakIterator("Default.brk");
-  private static final RuleBasedBreakIterator myanmarSyllableIterator = 
+  private static final BreakIterator myanmarSyllableIterator = 
     readBreakIterator("MyanmarSyllable.brk");
   
   // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
@@ -97,16 +95,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   }
 
   @Override
-  public RuleBasedBreakIterator getBreakIterator(int script) {
+  public BreakIterator getBreakIterator(int script) {
     switch(script) {
-      case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone();
+      case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
       case UScript.MYANMAR: 
         if (myanmarAsWords) {
-          return (RuleBasedBreakIterator)defaultBreakIterator.clone();
+          return (BreakIterator)defaultBreakIterator.clone();
         } else {
-          return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
+          return (BreakIterator)myanmarSyllableIterator.clone();
         }
-      default: return (RuleBasedBreakIterator)defaultBreakIterator.clone();
+      default: return (BreakIterator)defaultBreakIterator.clone();
     }
   }
 
@@ -121,8 +119,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
         return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
       case RuleBasedBreakIterator.WORD_NUMBER:
         return WORD_NUMBER;
-      case EMOJI_SEQUENCE_STATUS:
-        return WORD_EMOJI;
       default: /* some other custom code */
         return "<OTHER>";
     }

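Two properties of the restored config are worth calling out: getBreakIterator() returns a fresh clone per call, so concurrent tokenizers never share iterator state, and getType() maps rule statuses onto the StandardTokenizer type strings. A small sketch against the public API shown in this diff; the UScript.LATIN script code is just an example.

    import com.ibm.icu.lang.UScript;
    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.text.RuleBasedBreakIterator;
    import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;

    public class ConfigDemo {
      public static void main(String[] args) {
        DefaultICUTokenizerConfig config = new DefaultICUTokenizerConfig(true, true);

        // Each call clones the shared iterator, so callers cannot corrupt it.
        BreakIterator a = config.getBreakIterator(UScript.LATIN);
        BreakIterator b = config.getBreakIterator(UScript.LATIN);
        System.out.println(a != b); // true: independent clones

        // Rule status -> token type string, per getType() above.
        System.out.println(config.getType(UScript.LATIN, RuleBasedBreakIterator.WORD_NUMBER)); // <NUM>
        System.out.println(config.getType(UScript.LATIN, RuleBasedBreakIterator.WORD_LETTER)); // <ALPHANUM>
      }
    }
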
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
index e2d3dae..69694fc 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
@@ -16,7 +16,8 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-import com.ibm.icu.text.RuleBasedBreakIterator;
+
+import com.ibm.icu.text.BreakIterator;
 
 /**
  * Class that allows for tailored Unicode Text Segmentation on
@@ -24,16 +25,14 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
  * @lucene.experimental
  */
 public abstract class ICUTokenizerConfig {
-  /** Rule status for emoji sequences */
-  public static final int EMOJI_SEQUENCE_STATUS = 299;
-
+  
   /**
    * Sole constructor. (For invocation by subclass 
    * constructors, typically implicit.)
    */
   public ICUTokenizerConfig() {}
   /** Return a breakiterator capable of processing a given script. */
-  public abstract RuleBasedBreakIterator getBreakIterator(int script);
+  public abstract BreakIterator getBreakIterator(int script);
   /** Return a token type value for a given script and BreakIterator
    *  rule status. */
   public abstract String getType(int script, int ruleStatus);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
index 0cd4cf2..4d29b0c 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
@@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
       config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
         
         @Override
-        public RuleBasedBreakIterator getBreakIterator(int script) {
+        public BreakIterator getBreakIterator(int script) {
           if (breakers[script] != null) {
-            return (RuleBasedBreakIterator) breakers[script].clone();
+            return (BreakIterator) breakers[script].clone();
           } else {
             return super.getBreakIterator(script);
           }

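The factory code above is what backs per-script tailoring: rules named in the factory's "rulefiles" argument are compiled into breakers[script], and any script without an override falls through to the default config via super.getBreakIterator(script). A hedged usage sketch; the class name and the Latn-custom.rbbi resource are assumptions for illustration, not files in this patch.

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory;
    import org.apache.lucene.analysis.util.ClasspathResourceLoader;

    public class FactoryDemo {
      public static void main(String[] args) throws Exception {
        // Assumption: Latn-custom.rbbi is a break-rules file on the classpath.
        Map<String, String> params = new HashMap<>();
        params.put("rulefiles", "Latn:Latn-custom.rbbi");

        ICUTokenizerFactory factory = new ICUTokenizerFactory(params);
        // inform() loads and compiles the per-script rules into breakers[script];
        // scripts without an entry fall back to DefaultICUTokenizerConfig.
        factory.inform(new ClasspathResourceLoader(FactoryDemo.class));
        Tokenizer tokenizer = factory.create();
      }
    }
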
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index 9893975..027baa3 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -16,10 +16,13 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
 
 import com.ibm.icu.lang.UScript;
@@ -73,7 +76,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
-        return new TokenStreamComponents(tokenizer);
+        TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
+        return new TokenStreamComponents(tokenizer, filter);
       }
     };
   }
@@ -86,8 +90,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testArmenian() throws Exception {
     assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
-        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
-        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
+        new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
+        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
   }
   
   public void testAmharic() throws Exception {
@@ -98,12 +102,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   public void testArabic() throws Exception {
     assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
         new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
-        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } ); 
+        "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } ); 
   }
   
   public void testAramaic() throws Exception {
     assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
-        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
         "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
   }
   
@@ -121,7 +125,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   
   public void testGreek() throws Exception {
     assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
-        new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+        new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
         "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
   }
   
@@ -152,7 +156,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
    */
   public void testChinese() throws Exception {
     assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
-        new String[] { "我", "是", "中", "国", "人", "1234", "Tests"});
+        new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
   }
   
   public void testHebrew() throws Exception {
@@ -182,8 +186,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   /* Tests from StandardAnalyzer, just to show behavior is similar */
   public void testAlphanumericSA() throws Exception {
     // alphanumeric tokens
-    assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
-    assertAnalyzesTo(a, "2B", new String[]{"2B"});
+    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+    assertAnalyzesTo(a, "2B", new String[]{"2b"});
   }
 
   public void testDelimitersSA() throws Exception {
@@ -195,34 +199,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testApostrophesSA() throws Exception {
     // internal apostrophes: O'Reilly, you're, O'Reilly's
-    assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
     assertAnalyzesTo(a, "you're", new String[]{"you're"});
     assertAnalyzesTo(a, "she's", new String[]{"she's"});
-    assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
     assertAnalyzesTo(a, "don't", new String[]{"don't"});
-    assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
   }
 
   public void testNumericSA() throws Exception {
     // floating point, serial, model numbers, ip addresses, etc.
     // every other segment must have at least one digit
     assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
-    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
   }
 
   public void testTextWithNumbersSA() throws Exception {
     // numbers
-    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
   }
 
   public void testVariousTextSA() throws Exception {
     // various
-    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
-    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
-    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
-    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
   }
 
   public void testKoreanSA() throws Exception {
@@ -238,14 +242,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   
   public void testOffsets() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones", 
-        new String[] {"David", "has", "5000", "bones"},
+        new String[] {"david", "has", "5000", "bones"},
         new int[] {0, 6, 10, 15},
         new int[] {5, 9, 14, 20});
   }
   
   public void testTypes() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones", 
-        new String[] {"David", "has", "5000", "bones"},
+        new String[] {"david", "has", "5000", "bones"},
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
   }
   
@@ -261,61 +265,6 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
   }
   
-  /** simple emoji */
-  public void testEmoji() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
-        new String[] { "💩", "💩", "💩" },
-        new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
-  }
- 
-  /** emoji zwj sequence */
-  public void testEmojiSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
-        new String[] { "👩‍❤️‍👩" },
-        new String[] { "<EMOJI>" });
-  }
-  
-  /** emoji zwj sequence with fitzpatrick modifier */
-  public void testEmojiSequenceWithModifier() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
-        new String[] { "👨🏼‍⚕️" },
-        new String[] { "<EMOJI>" });
-  }
-  
-  /** regional indicator */
-  public void testEmojiRegionalIndicator() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
-        new String[] { "🇺🇸", "🇺🇸" },
-        new String[] { "<EMOJI>", "<EMOJI>" });
-  }
-  
-  /** variation sequence */
-  public void testEmojiVariationSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
-        new String[] { "#️⃣" },
-        new String[] { "<EMOJI>" });
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
-        new String[] { "3️⃣",},
-        new String[] { "<EMOJI>" });
-  }
-
-  public void testEmojiTagSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
-        new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
-        new String[] { "<EMOJI>" });
-  }
-  
-  public void testEmojiTokenization() throws Exception {
-    // simple emoji around latin
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
-        new String[] { "poo", "💩", "poo" },
-        new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
-    // simple emoji around non-latin
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
-        new String[] { "💩", "中", "國", "💩" },
-        new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
-  }
-  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

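All of the expected-token changes above follow from the one change in createComponents(): appending ICUNormalizer2Filter (which applies NFKC casefolding by default) lowercases the tokenizer output, restoring the pre-LUCENE-8125 baselines. A standalone sketch of the same chain outside the test framework; the field name and sample text are illustrative.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
    import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class ChainDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new ICUTokenizer(new DefaultICUTokenizerConfig(false, true));
            TokenFilter filter = new ICUNormalizer2Filter(tokenizer); // nfkc_cf: casefolds
            return new TokenStreamComponents(tokenizer, filter);
          }
        };
        try (TokenStream ts = a.tokenStream("f", "David has 5000 bones")) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            System.out.println(term); // david, has, 5000, bones
          }
          ts.end();
        }
      }
    }
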
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
index d93a810..75481f1 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
@@ -78,15 +78,6 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
     );
   }
   
-  /**
-   * dictionary segmentation with emoji
-   */
-  public void testSimpleJapaneseWithEmoji() throws Exception {
-    assertAnalyzesTo(a, "それはまだ実験段階にあります💩",
-        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます", "💩"  }
-    );
-  }
-  
   public void testJapaneseTypes() throws Exception {
     assertAnalyzesTo(a, "仮名遣い カタカナ",
         new String[] { "仮名遣い", "カタカナ" },

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fafbb263/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index 50d1f9f..0410124 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -54,8 +54,6 @@ public final class StandardTokenizer extends Tokenizer {
   public static final int KATAKANA = 5;
   /** Hangul token type */
   public static final int HANGUL = 6;
-  /** Emoji token type. */
-  public static final int EMOJI = 7;
   
   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -65,8 +63,7 @@ public final class StandardTokenizer extends Tokenizer {
     "<IDEOGRAPHIC>",
     "<HIRAGANA>",
     "<KATAKANA>",
-    "<HANGUL>",
-    "<EMOJI>"
+    "<HANGUL>"
   };
   
   /** Absolute maximum sized token */

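After this revert the int token-type constants again line up with a seven-entry TOKEN_TYPES array. The pairing is purely positional, which is why the EMOJI constant and the "<EMOJI>" entry had to be removed together; a quick illustration:

    import org.apache.lucene.analysis.standard.StandardTokenizer;

    public class TokenTypesDemo {
      public static void main(String[] args) {
        // The int constants index TOKEN_TYPES positionally, so the constant
        // and the corresponding array entry must always change as a pair.
        System.out.println(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]);    // <NUM>
        System.out.println(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]); // <HANGUL>, index 6
      }
    }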

[6/6] lucene-solr:branch_7x: Revert "LUCENE-8122: Upgrade analysis/icu to ICU 60.2"

Posted by jp...@apache.org.
Revert "LUCENE-8122: Upgrade analysis/icu to ICU 60.2"

This reverts commit 96be7b432ebd4b9fd8c2efa1b037743a376a05ec.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/9f02097e
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/9f02097e
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/9f02097e

Branch: refs/heads/branch_7x
Commit: 9f02097e2880bf51783029dfec92d763fcb69148
Parents: 5ad93d4
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Feb 20 14:43:23 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Feb 20 14:43:23 2018 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   2 -
 lucene/analysis/icu/src/data/uax29/Default.rbbi |  96 ++++---------------
 .../icu/src/data/utr30/DiacriticFolding.txt     |  11 +--
 .../icu/src/data/utr30/NativeDigitFolding.txt   |  10 --
 lucene/analysis/icu/src/data/utr30/nfc.txt      |  13 +--
 lucene/analysis/icu/src/data/utr30/nfkc.txt     |   4 +-
 lucene/analysis/icu/src/data/utr30/nfkc_cf.txt  |  10 +-
 .../analysis/icu/segmentation/ICUTokenizer.java |   8 +-
 lucene/analysis/icu/src/java/overview.html      |   2 +-
 .../analysis/icu/segmentation/Default.brk       | Bin 54488 -> 36768 bytes
 .../icu/segmentation/MyanmarSyllable.brk        | Bin 21976 -> 20744 bytes
 .../org/apache/lucene/analysis/icu/utr30.nrm    | Bin 59056 -> 55184 bytes
 .../analysis/icu/GenerateUTR30DataFiles.java    |   6 +-
 lucene/ivy-versions.properties                  |   2 +-
 lucene/licenses/icu4j-59.1.jar.sha1             |   1 +
 lucene/licenses/icu4j-60.2.jar.sha1             |   1 -
 solr/licenses/icu4j-59.1.jar.sha1               |   1 +
 solr/licenses/icu4j-60.2.jar.sha1               |   1 -
 18 files changed, 40 insertions(+), 128 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8211cbb..06fe19b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -99,8 +99,6 @@ Other
 * LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.
   (Kai Chan via Adrien Grand)
 
-* LUCENE-8122: Upgrade analysis/icu to ICU 60.2. (Robert Muir)
-
 * LUCENE-8106: Add script (reproduceJenkinsFailures.py) to attempt to reproduce
   failing tests from a Jenkins log. (Steve Rowe)
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/data/uax29/Default.rbbi
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/uax29/Default.rbbi b/lucene/analysis/icu/src/data/uax29/Default.rbbi
index afda68f..6c6d1f9 100644
--- a/lucene/analysis/icu/src/data/uax29/Default.rbbi
+++ b/lucene/analysis/icu/src/data/uax29/Default.rbbi
@@ -14,21 +14,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# This file is from ICU (with some small modifications, to avoid CJK dictionary break,
-# and status code change related to that)
+# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
 #
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (C) 2002-2016, International Business Machines Corporation
+# Copyright (C) 2002-2013, International Business Machines Corporation 
 # and others. All Rights Reserved.
 #
 # file:  word.txt
 #
 # ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
 #
 # Note:  Updates to word.txt will usually need to be merged into
 #        word_POSIX.txt also.
@@ -40,7 +35,6 @@
 ##############################################################################
 
 !!chain;
-!!quoted_literals_only;
 
 
 #
@@ -49,9 +43,8 @@
 
 $CR                 = [\p{Word_Break = CR}];
 $LF                 = [\p{Word_Break = LF}];
-$Newline            = [\p{Word_Break = Newline} ];
+$Newline            = [\p{Word_Break = Newline}];
 $Extend             = [\p{Word_Break = Extend}];
-$ZWJ                = [\p{Word_Break = ZWJ}];
 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
 $Format             = [\p{Word_Break = Format}];
 $Katakana           = [\p{Word_Break = Katakana}];
@@ -64,13 +57,6 @@ $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidNum             = [\p{Word_Break = MidNum}];
 $Numeric            = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
-$E_Base             = [\p{Word_Break = EB}];
-$E_Modifier         = [\p{Word_Break = EM}];
-
-# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
-$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
-$EBG                = [\p{Word_Break = EBG}];
-$EmojiNRK           = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
 
 $Han                = [:Han:];
 $Hiragana           = [:Hiragana:];
@@ -97,21 +83,21 @@ $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 #             except when they appear at the beginning of a region of text.
 #
 # TODO: check if handling of katakana in dictionary makes rules incorrect/void
-$KatakanaEx           = $Katakana           ($Extend |  $Format | $ZWJ)*;
-$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format | $ZWJ)*;
-$ALetterEx            = $ALetterPlus        ($Extend |  $Format | $ZWJ)*;
-$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format | $ZWJ)*;
-$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format | $ZWJ)*;
-$MidNumLetEx          = $MidNumLet          ($Extend |  $Format | $ZWJ)*;
-$MidLetterEx          = $MidLetter          ($Extend |  $Format | $ZWJ)*;
-$MidNumEx             = $MidNum             ($Extend |  $Format | $ZWJ)*;
-$NumericEx            = $Numeric            ($Extend |  $Format | $ZWJ)*;
-$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format | $ZWJ)*;
-$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format | $ZWJ)*;
+$KatakanaEx           = $Katakana           ($Extend |  $Format)*;
+$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format)*;
+$ALetterEx            = $ALetterPlus        ($Extend |  $Format)*;
+$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format)*;
+$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format)*;
+$MidNumLetEx          = $MidNumLet          ($Extend |  $Format)*;
+$MidLetterEx          = $MidLetter          ($Extend |  $Format)*;
+$MidNumEx             = $MidNum             ($Extend |  $Format)*;
+$NumericEx            = $Numeric            ($Extend |  $Format)*;
+$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
 
 $Ideographic    = [\p{Ideographic}];
-$HiraganaEx     = $Hiragana     ($Extend |  $Format | $ZWJ)*;
-$IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
+$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
+$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
 
 ## -------------------------------------------------
 
@@ -122,17 +108,12 @@ $IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
 #
 $CR $LF;
 
-# Rule 3c   ZWJ x (Extended_Pict | EmojiNRK).  Precedes WB4, so no intervening Extend chars allowed.
-#
-$ZWJ ($Extended_Pict | $EmojiNRK);
-
-
 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
 #          of a region of Text.   The rule here comes into play when the start of text
 #          begins with a group of Format chars, or with a "word" consisting of a single
 #          char that is not in any of the listed word break categories followed by
 #          format char(s), or is not a CJK dictionary character.
-[^$CR $LF $Newline]? ($Extend |  $Format | $ZWJ)+;
+[^$CR $LF $Newline]? ($Extend |  $Format)+;
 
 $NumericEx {100};
 $ALetterEx {200};
@@ -142,10 +123,6 @@ $KatakanaEx {300};       # note:  these status values override those from rule 5
 $HiraganaEx {300};       #        by virtue of being numerically larger.
 $IdeographicEx {400};    #
 
-$E_Base ($Extend | $Format | $ZWJ)*;
-$E_Modifier ($Extend | $Format | $ZWJ)*;
-$Extended_Pict ($Extend | $Format | $ZWJ)*;
-
 #
 # rule 5
 #    Do not break between most letters.
@@ -193,42 +170,9 @@ $ExtendNumLetEx  $Hebrew_Letter  {200};    #  (13b)
 $ExtendNumLetEx  $NumericEx      {100};    #  (13b)
 $ExtendNumLetEx  $KatakanaEx     {300};    #  (13b)
 
-# rule 14
-#    Do not break within emoji modifier sequences
-
-($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
+# rule 13c
 
-# rules 15 - 17
-#    Pairs of Regional Indicators stay together.
-#    With rule chaining disabled by ^, this rule will match exactly two of them.
-#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
-#
-^$Regional_IndicatorEx $Regional_IndicatorEx;
+$Regional_IndicatorEx $Regional_IndicatorEx;
 
 # special handling for CJK characters: chain for later dictionary segmentation
 $HangulSyllable $HangulSyllable {200};
-
-# Rule 999
-#     Match a single code point if no other rule applies.
-.;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# rule 3
-($Extend | $Format | $ZWJ)+ .?;
-
-# rule 6
-($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
-
-# rule 7b
-$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
-
-
-# rule 11
-($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
-
-# rule 13c
-$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;

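The {100}/{200}/{300}/{400} annotations in the rules above are the status tags that getRuleStatus() later reports to the tokenizer. RBBI rules can also be compiled at runtime, which makes the mechanism easy to poke at; below is a minimal sketch with a made-up two-category rule set mirroring the $NumericEx {100} / $ALetterEx {200} lines above. The rule text and sample input are assumptions for illustration only.

    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.text.RuleBasedBreakIterator;

    public class RbbiStatusDemo {
      public static void main(String[] args) {
        // Toy rules: letter runs tag status 200, digit runs tag status 100.
        String rules =
            "$Letters = [\\p{Letter}];\n" +
            "$Digits = [\\p{Nd}];\n" +
            "$Letters+ {200};\n" +
            "$Digits+ {100};\n";
        RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(rules);
        String text = "abc 123";
        rbbi.setText(text);
        int start = rbbi.first();
        for (int end = rbbi.next(); end != BreakIterator.DONE; start = end, end = rbbi.next()) {
          // unmatched characters (the space) break singly with status 0
          System.out.println("'" + text.substring(start, end) + "' -> " + rbbi.getRuleStatus());
          // 'abc' -> 200, ' ' -> 0, '123' -> 100
        }
      }
    }
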
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
index 806a4f9..eb5b78e 100644
--- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
@@ -73,14 +73,12 @@
 0A4D>
 0ABC>
 0ACD>
-0AFD..0AFF>
 0B3C>
 0B4D>
 0BCD>
 0C4D>
 0CBC>
 0CCD>
-0D3B..0D3C>
 0D4D>
 0DCA>
 0E47..0E4C>
@@ -114,10 +112,10 @@
 1CD0..1CE8>
 1CED>
 1CF4>
-1CF7..1CF9>
+1CF8..1CF9>
 1D2C..1D6A>
 1DC4..1DCF>
-1DF5..1DF9>
+1DF5>
 1DFD..1DFF>
 1FBD>
 1FBF..1FC1>
@@ -177,12 +175,7 @@ FFE3>
 1163F>
 116B6..116B7>
 1172B>
-11A34>
-11A47>
-11A99>
 11C3F>
-11D42>
-11D44..11D45>
 16AF0..16AF4>
 16F8F..16F9F>
 1D167..1D169>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
index 707674e..fb8cf1a 100644
--- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
@@ -580,16 +580,6 @@ ABF9>0039   # MEETEI MAYEK DIGIT NINE
 11C57>0037   # BHAIKSUKI DIGIT SEVEN
 11C58>0038   # BHAIKSUKI DIGIT EIGHT
 11C59>0039   # BHAIKSUKI DIGIT NINE
-11D50>0030   # MASARAM GONDI DIGIT ZERO
-11D51>0031   # MASARAM GONDI DIGIT ONE
-11D52>0032   # MASARAM GONDI DIGIT TWO
-11D53>0033   # MASARAM GONDI DIGIT THREE
-11D54>0034   # MASARAM GONDI DIGIT FOUR
-11D55>0035   # MASARAM GONDI DIGIT FIVE
-11D56>0036   # MASARAM GONDI DIGIT SIX
-11D57>0037   # MASARAM GONDI DIGIT SEVEN
-11D58>0038   # MASARAM GONDI DIGIT EIGHT
-11D59>0039   # MASARAM GONDI DIGIT NINE
 16A60>0030   # MRO DIGIT ZERO
 16A61>0031   # MRO DIGIT ONE
 16A62>0032   # MRO DIGIT TWO

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/data/utr30/nfc.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/nfc.txt b/lucene/analysis/icu/src/data/utr30/nfc.txt
index b41056d..5f9b182 100644
--- a/lucene/analysis/icu/src/data/utr30/nfc.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfc.txt
@@ -1,5 +1,3 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
 # Copyright (C) 1999-2016, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
@@ -9,7 +7,7 @@
 #
 # Complete data for Unicode NFC normalization.
 
-* Unicode 10.0.0
+* Unicode 9.0.0
 
 # Canonical_Combining_Class (ccc) values
 0300..0314:230
@@ -166,7 +164,6 @@
 0C56:91
 0CBC:7
 0CCD:9
-0D3B..0D3C:9
 0D4D:9
 0DCA:9
 0E38..0E39:103
@@ -237,9 +234,6 @@
 1DCF:220
 1DD0:202
 1DD1..1DF5:230
-1DF6:232
-1DF7..1DF8:228
-1DF9:220
 1DFB:230
 1DFC:233
 1DFD:220
@@ -328,12 +322,7 @@ FE2E..FE2F:230
 116B6:9
 116B7:7
 1172B:9
-11A34:9
-11A47:9
-11A99:9
 11C3F:9
-11D42:7
-11D44..11D45:9
 16AF0..16AF4:1
 16B30..16B36:230
 1BC9E:1

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/data/utr30/nfkc.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/nfkc.txt b/lucene/analysis/icu/src/data/utr30/nfkc.txt
index 8b71727..f51fa5d 100644
--- a/lucene/analysis/icu/src/data/utr30/nfkc.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfkc.txt
@@ -1,5 +1,3 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
 # Copyright (C) 1999-2016, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
@@ -13,7 +11,7 @@
 # to NFKC one-way mappings.
 # Use this file as the second gennorm2 input file after nfc.txt.
 
-* Unicode 10.0.0
+* Unicode 9.0.0
 
 00A0>0020
 00A8>0020 0308

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
index 726c5b5..7f33df5 100644
--- a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
@@ -1,7 +1,7 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (C) 1999-2016, International Business Machines
-# Corporation and others.  All Rights Reserved.
+# Unicode Character Database
+# Copyright (c) 1991-2016 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For documentation, see http://www.unicode.org/reports/tr44/
 #
 # file name: nfkc_cf.txt
 #
@@ -12,7 +12,7 @@
 # and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
 # Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
 
-* Unicode 10.0.0
+* Unicode 9.0.0
 
 0041>0061
 0042>0062

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
index 8b62ddb..0941551 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
@@ -200,18 +200,18 @@ public final class ICUTokenizer extends Tokenizer {
    */
   private boolean incrementTokenBuffer() {
     int start = breaker.current();
-    assert start != BreakIterator.DONE;
+    if (start == BreakIterator.DONE)
+      return false; // BreakIterator exhausted
 
     // find the next set of boundaries, skipping over non-tokens (rule status 0)
     int end = breaker.next();
-    while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
+    while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
       start = end;
       end = breaker.next();
     }
 
-    if (end == BreakIterator.DONE) {
+    if (start == BreakIterator.DONE)
       return false; // BreakIterator exhausted
-    }
 
     termAtt.copyBuffer(buffer, start, end - start);
     offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));

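The loop restored above is the standard skip-the-gaps pattern: advance boundary pairs until one carries a non-zero rule status, treating status-0 spans (spaces, punctuation) as non-tokens. A sketch of the same pattern against plain ICU4J, outside the Tokenizer plumbing; the sample text is illustrative.

    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.text.RuleBasedBreakIterator;
    import com.ibm.icu.util.ULocale;

    public class SkipGapsDemo {
      public static void main(String[] args) {
        RuleBasedBreakIterator breaker =
            (RuleBasedBreakIterator) BreakIterator.getWordInstance(ULocale.ROOT);
        String text = "foo, bar!";
        breaker.setText(text);

        int start = breaker.current();
        while (start != BreakIterator.DONE) {
          // find the next boundary pair whose span is a real token (status != 0)
          int end = breaker.next();
          while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
            start = end;
            end = breaker.next();
          }
          if (end == BreakIterator.DONE) break; // BreakIterator exhausted
          System.out.println(text.substring(start, end)); // foo, then bar
          start = end;
        }
      }
    }
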
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/overview.html b/lucene/analysis/icu/src/java/overview.html
index 6fa5821..bdace97 100644
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@@ -353,7 +353,7 @@ and
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 10.0). However, some users who wish
+the most recent version of Unicode (currently 8.0). However, some users who wish
 for stronger backwards compatibility can restrict
 {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
 a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
index 4a9df15..c94a023 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
index a9d0673..c3357ef 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
index 1c3de12..1a16f3e 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
index 042fa37..0f2bffe 100644
--- a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
+++ b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
@@ -62,9 +62,9 @@ import java.util.regex.Pattern;
  */
 public class GenerateUTR30DataFiles {
   private static final String ICU_SVN_TAG_URL
-      = "http://source.icu-project.org/repos/icu/tags";
-  private static final String ICU_RELEASE_TAG = "release-60-2";
-  private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
+      = "http://source.icu-project.org/repos/icu/icu/tags";
+  private static final String ICU_RELEASE_TAG = "release-58-1";
+  private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
   private static final String NFC_TXT = "nfc.txt";
   private static final String NFKC_TXT = "nfkc.txt";
   private static final String NFKC_CF_TXT = "nfkc_cf.txt";

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/ivy-versions.properties
----------------------------------------------------------------------
diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties
index a37e306..5ab36dd 100644
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@@ -31,7 +31,7 @@ com.fasterxml.jackson.core.version = 2.5.4
 /com.googlecode.mp4parser/isoparser = 1.1.18
 /com.healthmarketscience.jackcess/jackcess = 2.1.8
 /com.healthmarketscience.jackcess/jackcess-encrypt = 2.1.4
-/com.ibm.icu/icu4j = 60.2
+/com.ibm.icu/icu4j = 59.1
 /com.pff/java-libpst = 0.8.1
 
 com.rometools.version = 1.5.1

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/licenses/icu4j-59.1.jar.sha1
----------------------------------------------------------------------
diff --git a/lucene/licenses/icu4j-59.1.jar.sha1 b/lucene/licenses/icu4j-59.1.jar.sha1
new file mode 100644
index 0000000..f3f0018
--- /dev/null
+++ b/lucene/licenses/icu4j-59.1.jar.sha1
@@ -0,0 +1 @@
+6f06e820cf4c8968bbbaae66ae0b33f6a256b57f

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/lucene/licenses/icu4j-60.2.jar.sha1
----------------------------------------------------------------------
diff --git a/lucene/licenses/icu4j-60.2.jar.sha1 b/lucene/licenses/icu4j-60.2.jar.sha1
deleted file mode 100644
index e613111..0000000
--- a/lucene/licenses/icu4j-60.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-e452cba3caaf93b997ff543c7246a6da74ed70f1

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/solr/licenses/icu4j-59.1.jar.sha1
----------------------------------------------------------------------
diff --git a/solr/licenses/icu4j-59.1.jar.sha1 b/solr/licenses/icu4j-59.1.jar.sha1
new file mode 100644
index 0000000..f3f0018
--- /dev/null
+++ b/solr/licenses/icu4j-59.1.jar.sha1
@@ -0,0 +1 @@
+6f06e820cf4c8968bbbaae66ae0b33f6a256b57f

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9f02097e/solr/licenses/icu4j-60.2.jar.sha1
----------------------------------------------------------------------
diff --git a/solr/licenses/icu4j-60.2.jar.sha1 b/solr/licenses/icu4j-60.2.jar.sha1
deleted file mode 100644
index e613111..0000000
--- a/solr/licenses/icu4j-60.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-e452cba3caaf93b997ff543c7246a6da74ed70f1


[3/6] lucene-solr:master: Revert "LUCENE-8122: Upgrade analysis/icu to ICU 60.2"

Posted by jp...@apache.org.
Revert "LUCENE-8122: Upgrade analysis/icu to ICU 60.2"

This reverts commit 07407a5b53bf4d790c316ecf3b71046242f1e2da.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/cc1efdb4
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/cc1efdb4
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/cc1efdb4

Branch: refs/heads/master
Commit: cc1efdb4a345de6e57b4da5d4210c1fe54531022
Parents: 9a7b56b
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Feb 20 14:40:53 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Feb 20 14:40:53 2018 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   2 -
 lucene/analysis/icu/src/data/uax29/Default.rbbi |  96 ++++---------------
 .../icu/src/data/utr30/DiacriticFolding.txt     |  11 +--
 .../icu/src/data/utr30/NativeDigitFolding.txt   |  10 --
 lucene/analysis/icu/src/data/utr30/nfc.txt      |  13 +--
 lucene/analysis/icu/src/data/utr30/nfkc.txt     |   4 +-
 lucene/analysis/icu/src/data/utr30/nfkc_cf.txt  |  10 +-
 .../analysis/icu/segmentation/ICUTokenizer.java |   8 +-
 lucene/analysis/icu/src/java/overview.html      |   2 +-
 .../analysis/icu/segmentation/Default.brk       | Bin 54488 -> 36768 bytes
 .../icu/segmentation/MyanmarSyllable.brk        | Bin 21976 -> 20744 bytes
 .../org/apache/lucene/analysis/icu/utr30.nrm    | Bin 59056 -> 55184 bytes
 .../analysis/icu/GenerateUTR30DataFiles.java    |   6 +-
 lucene/ivy-versions.properties                  |   2 +-
 lucene/licenses/icu4j-59.1.jar.sha1             |   1 +
 lucene/licenses/icu4j-60.2.jar.sha1             |   1 -
 solr/licenses/icu4j-59.1.jar.sha1               |   1 +
 solr/licenses/icu4j-60.2.jar.sha1               |   1 -
 18 files changed, 40 insertions(+), 128 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index d24a910..e3799bc 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -186,8 +186,6 @@ Other
 * LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.
   (Kai Chan via Adrien Grand)
 
-* LUCENE-8122: Upgrade analysis/icu to ICU 60.2. (Robert Muir)
-
 * LUCENE-8106: Add script (reproduceJenkinsFailures.py) to attempt to reproduce
   failing tests from a Jenkins log. (Steve Rowe)
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/data/uax29/Default.rbbi
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/uax29/Default.rbbi b/lucene/analysis/icu/src/data/uax29/Default.rbbi
index afda68f..6c6d1f9 100644
--- a/lucene/analysis/icu/src/data/uax29/Default.rbbi
+++ b/lucene/analysis/icu/src/data/uax29/Default.rbbi
@@ -14,21 +14,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# This file is from ICU (with some small modifications, to avoid CJK dictionary break,
-# and status code change related to that)
+# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
 #
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (C) 2002-2016, International Business Machines Corporation
+# Copyright (C) 2002-2013, International Business Machines Corporation 
 # and others. All Rights Reserved.
 #
 # file:  word.txt
 #
 # ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
 #
 # Note:  Updates to word.txt will usually need to be merged into
 #        word_POSIX.txt also.
@@ -40,7 +35,6 @@
 ##############################################################################
 
 !!chain;
-!!quoted_literals_only;
 
 
 #
@@ -49,9 +43,8 @@
 
 $CR                 = [\p{Word_Break = CR}];
 $LF                 = [\p{Word_Break = LF}];
-$Newline            = [\p{Word_Break = Newline} ];
+$Newline            = [\p{Word_Break = Newline}];
 $Extend             = [\p{Word_Break = Extend}];
-$ZWJ                = [\p{Word_Break = ZWJ}];
 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
 $Format             = [\p{Word_Break = Format}];
 $Katakana           = [\p{Word_Break = Katakana}];
@@ -64,13 +57,6 @@ $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidNum             = [\p{Word_Break = MidNum}];
 $Numeric            = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
-$E_Base             = [\p{Word_Break = EB}];
-$E_Modifier         = [\p{Word_Break = EM}];
-
-# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
-$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
-$EBG                = [\p{Word_Break = EBG}];
-$EmojiNRK           = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
 
 $Han                = [:Han:];
 $Hiragana           = [:Hiragana:];
@@ -97,21 +83,21 @@ $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 #             except when they appear at the beginning of a region of text.
 #
 # TODO: check if handling of katakana in dictionary makes rules incorrect/void
-$KatakanaEx           = $Katakana           ($Extend |  $Format | $ZWJ)*;
-$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format | $ZWJ)*;
-$ALetterEx            = $ALetterPlus        ($Extend |  $Format | $ZWJ)*;
-$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format | $ZWJ)*;
-$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format | $ZWJ)*;
-$MidNumLetEx          = $MidNumLet          ($Extend |  $Format | $ZWJ)*;
-$MidLetterEx          = $MidLetter          ($Extend |  $Format | $ZWJ)*;
-$MidNumEx             = $MidNum             ($Extend |  $Format | $ZWJ)*;
-$NumericEx            = $Numeric            ($Extend |  $Format | $ZWJ)*;
-$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format | $ZWJ)*;
-$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format | $ZWJ)*;
+$KatakanaEx           = $Katakana           ($Extend |  $Format)*;
+$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format)*;
+$ALetterEx            = $ALetterPlus        ($Extend |  $Format)*;
+$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format)*;
+$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format)*;
+$MidNumLetEx          = $MidNumLet          ($Extend |  $Format)*;
+$MidLetterEx          = $MidLetter          ($Extend |  $Format)*;
+$MidNumEx             = $MidNum             ($Extend |  $Format)*;
+$NumericEx            = $Numeric            ($Extend |  $Format)*;
+$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
 
 $Ideographic    = [\p{Ideographic}];
-$HiraganaEx     = $Hiragana     ($Extend |  $Format | $ZWJ)*;
-$IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
+$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
+$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
 
 ## -------------------------------------------------
 
@@ -122,17 +108,12 @@ $IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
 #
 $CR $LF;
 
-# Rule 3c   ZWJ x (Extended_Pict | EmojiNRK).  Precedes WB4, so no intervening Extend chars allowed.
-#
-$ZWJ ($Extended_Pict | $EmojiNRK);
-
-
 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
 #          of a region of Text.   The rule here comes into play when the start of text
 #          begins with a group of Format chars, or with a "word" consisting of a single
 #          char that is not in any of the listed word break categories followed by
 #          format char(s), or is not a CJK dictionary character.
-[^$CR $LF $Newline]? ($Extend |  $Format | $ZWJ)+;
+[^$CR $LF $Newline]? ($Extend |  $Format)+;
 
 $NumericEx {100};
 $ALetterEx {200};
@@ -142,10 +123,6 @@ $KatakanaEx {300};       # note:  these status values override those from rule 5
 $HiraganaEx {300};       #        by virtue of being numerically larger.
 $IdeographicEx {400};    #
 
-$E_Base ($Extend | $Format | $ZWJ)*;
-$E_Modifier ($Extend | $Format | $ZWJ)*;
-$Extended_Pict ($Extend | $Format | $ZWJ)*;
-
 #
 # rule 5
 #    Do not break between most letters.
@@ -193,42 +170,9 @@ $ExtendNumLetEx  $Hebrew_Letter  {200};    #  (13b)
 $ExtendNumLetEx  $NumericEx      {100};    #  (13b)
 $ExtendNumLetEx  $KatakanaEx     {300};    #  (13b)
 
-# rule 14
-#    Do not break within emoji modifier sequences
-
-($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
+# rule 13c
 
-# rules 15 - 17
-#    Pairs of Regional Indicators stay together.
-#    With rule chaining disabled by ^, this rule will match exactly two of them.
-#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
-#
-^$Regional_IndicatorEx $Regional_IndicatorEx;
+$Regional_IndicatorEx $Regional_IndicatorEx;
 
 # special handling for CJK characters: chain for later dictionary segmentation
 $HangulSyllable $HangulSyllable {200};
-
-# Rule 999
-#     Match a single code point if no other rule applies.
-.;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# rule 3
-($Extend | $Format | $ZWJ)+ .?;
-
-# rule 6
-($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
-
-# rule 7b
-$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
-
-
-# rule 11
-($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
-
-# rule 13c
-$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
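
The grammar above is compiled offline into Default.brk, but the same status-tag mechanism can be observed by compiling a tiny rule set at runtime. A minimal sketch, assuming toy rules rather than the shipped grammar:

    import com.ibm.icu.text.RuleBasedBreakIterator;

    public class RbbiStatusSketch {
      public static void main(String[] args) {
        // Two toy rules with status tags, mirroring "$NumericEx {100};"
        // and "$ALetterEx {200};" in the grammar above.
        String rules =
            "[\\p{Letter}]+ {200};" +
            "[\\p{Nd}]+ {100};";
        RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
        bi.setText("abc 123");
        for (int end = bi.next(); end != RuleBasedBreakIterator.DONE; end = bi.next()) {
          // getRuleStatus() reports the tag of the rule that matched the
          // text up to this boundary (0 for the unmatched space).
          System.out.println("boundary " + end + " status " + bi.getRuleStatus());
        }
      }
    }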

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
index 806a4f9..eb5b78e 100644
--- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
@@ -73,14 +73,12 @@
 0A4D>
 0ABC>
 0ACD>
-0AFD..0AFF>
 0B3C>
 0B4D>
 0BCD>
 0C4D>
 0CBC>
 0CCD>
-0D3B..0D3C>
 0D4D>
 0DCA>
 0E47..0E4C>
@@ -114,10 +112,10 @@
 1CD0..1CE8>
 1CED>
 1CF4>
-1CF7..1CF9>
+1CF8..1CF9>
 1D2C..1D6A>
 1DC4..1DCF>
-1DF5..1DF9>
+1DF5>
 1DFD..1DFF>
 1FBD>
 1FBF..1FC1>
@@ -177,12 +175,7 @@ FFE3>
 1163F>
 116B6..116B7>
 1172B>
-11A34>
-11A47>
-11A99>
 11C3F>
-11D42>
-11D44..11D45>
 16AF0..16AF4>
 16F8F..16F9F>
 1D167..1D169>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
index 707674e..fb8cf1a 100644
--- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
@@ -580,16 +580,6 @@ ABF9>0039   # MEETEI MAYEK DIGIT NINE
 11C57>0037   # BHAIKSUKI DIGIT SEVEN
 11C58>0038   # BHAIKSUKI DIGIT EIGHT
 11C59>0039   # BHAIKSUKI DIGIT NINE
-11D50>0030   # MASARAM GONDI DIGIT ZERO
-11D51>0031   # MASARAM GONDI DIGIT ONE
-11D52>0032   # MASARAM GONDI DIGIT TWO
-11D53>0033   # MASARAM GONDI DIGIT THREE
-11D54>0034   # MASARAM GONDI DIGIT FOUR
-11D55>0035   # MASARAM GONDI DIGIT FIVE
-11D56>0036   # MASARAM GONDI DIGIT SIX
-11D57>0037   # MASARAM GONDI DIGIT SEVEN
-11D58>0038   # MASARAM GONDI DIGIT EIGHT
-11D59>0039   # MASARAM GONDI DIGIT NINE
 16A60>0030   # MRO DIGIT ZERO
 16A61>0031   # MRO DIGIT ONE
 16A62>0032   # MRO DIGIT TWO
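
These mappings are gennorm2 inputs that end up in the module's compiled utr30.nrm. A hedged sketch of loading such a compiled file as a custom Normalizer2 (the resource path comes from this patch; the digit example assumes the Arabic-Indic mappings earlier in NativeDigitFolding.txt):

    import java.io.InputStream;
    import com.ibm.icu.text.Normalizer2;

    public class Utr30Sketch {
      public static void main(String[] args) throws Exception {
        try (InputStream in = Utr30Sketch.class.getResourceAsStream(
            "/org/apache/lucene/analysis/icu/utr30.nrm")) {
          // A compiled .nrm file can be loaded directly as a Normalizer2.
          Normalizer2 utr30 = Normalizer2.getInstance(in, "utr30", Normalizer2.Mode.COMPOSE);
          System.out.println(utr30.normalize("\u0664\u0662")); // native digits fold to "42"
        }
      }
    }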

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/data/utr30/nfc.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/nfc.txt b/lucene/analysis/icu/src/data/utr30/nfc.txt
index b41056d..5f9b182 100644
--- a/lucene/analysis/icu/src/data/utr30/nfc.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfc.txt
@@ -1,5 +1,3 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
 # Copyright (C) 1999-2016, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
@@ -9,7 +7,7 @@
 #
 # Complete data for Unicode NFC normalization.
 
-* Unicode 10.0.0
+* Unicode 9.0.0
 
 # Canonical_Combining_Class (ccc) values
 0300..0314:230
@@ -166,7 +164,6 @@
 0C56:91
 0CBC:7
 0CCD:9
-0D3B..0D3C:9
 0D4D:9
 0DCA:9
 0E38..0E39:103
@@ -237,9 +234,6 @@
 1DCF:220
 1DD0:202
 1DD1..1DF5:230
-1DF6:232
-1DF7..1DF8:228
-1DF9:220
 1DFB:230
 1DFC:233
 1DFD:220
@@ -328,12 +322,7 @@ FE2E..FE2F:230
 116B6:9
 116B7:7
 1172B:9
-11A34:9
-11A47:9
-11A99:9
 11C3F:9
-11D42:7
-11D44..11D45:9
 16AF0..16AF4:1
 16B30..16B36:230
 1BC9E:1

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/data/utr30/nfkc.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/nfkc.txt b/lucene/analysis/icu/src/data/utr30/nfkc.txt
index 8b71727..f51fa5d 100644
--- a/lucene/analysis/icu/src/data/utr30/nfkc.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfkc.txt
@@ -1,5 +1,3 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
 # Copyright (C) 1999-2016, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
@@ -13,7 +11,7 @@
 # to NFKC one-way mappings.
 # Use this file as the second gennorm2 input file after nfc.txt.
 
-* Unicode 10.0.0
+* Unicode 9.0.0
 
 00A0>0020
 00A8>0020 0308

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
index 726c5b5..7f33df5 100644
--- a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
@@ -1,7 +1,7 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (C) 1999-2016, International Business Machines
-# Corporation and others.  All Rights Reserved.
+# Unicode Character Database
+# Copyright (c) 1991-2016 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For documentation, see http://www.unicode.org/reports/tr44/
 #
 # file name: nfkc_cf.txt
 #
@@ -12,7 +12,7 @@
 # and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
 # Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
 
-* Unicode 10.0.0
+* Unicode 9.0.0
 
 0041>0061
 0042>0062

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
index 8b62ddb..0941551 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
@@ -200,18 +200,18 @@ public final class ICUTokenizer extends Tokenizer {
    */
   private boolean incrementTokenBuffer() {
     int start = breaker.current();
-    assert start != BreakIterator.DONE;
+    if (start == BreakIterator.DONE)
+      return false; // BreakIterator exhausted
 
     // find the next set of boundaries, skipping over non-tokens (rule status 0)
     int end = breaker.next();
-    while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
+    while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
       start = end;
       end = breaker.next();
     }
 
-    if (end == BreakIterator.DONE) {
+    if (start == BreakIterator.DONE)
       return false; // BreakIterator exhausted
-    }
 
     termAtt.copyBuffer(buffer, start, end - start);
     offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
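
In isolation, the loop this hunk touches advances over boundaries whose rule status is 0 (non-token regions such as whitespace) until it either finds a token or exhausts the iterator. A sketch of the same pattern against a plain RuleBasedBreakIterator, using the end-based DONE checks from the reverted (-) side of the hunk:

    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.text.RuleBasedBreakIterator;

    class NextTokenSketch {
      // Returns the next [start, end) region whose rule status is non-zero,
      // or null once the iterator is exhausted.
      static int[] nextTokenRegion(RuleBasedBreakIterator breaker) {
        int start = breaker.current();
        if (start == BreakIterator.DONE) {
          return null;
        }
        int end = breaker.next();
        while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
          start = end;
          end = breaker.next();
        }
        return end == BreakIterator.DONE ? null : new int[] { start, end };
      }
    }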

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/overview.html b/lucene/analysis/icu/src/java/overview.html
index 6fa5821..bdace97 100644
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@@ -353,7 +353,7 @@ and
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 10.0). However, some users who wish
+the most recent version of Unicode (currently 8.0). However, some users who wish
 for stronger backwards compatibility can restrict
 {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
 a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. 
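
A minimal sketch of the back-compat technique that paragraph describes, assuming NFKC case-folding as the base normalizer and Unicode 9.0 as the pinned version (both choices are illustrative):

    import com.ibm.icu.text.FilteredNormalizer2;
    import com.ibm.icu.text.Normalizer2;
    import com.ibm.icu.text.UnicodeSet;

    public class PinnedNormalizerSketch {
      public static void main(String[] args) {
        // Only code points assigned up to Unicode 9.0 are normalized;
        // anything introduced in a later version passes through unchanged.
        Normalizer2 base = Normalizer2.getNFKCCasefoldInstance();
        UnicodeSet pinned = new UnicodeSet("[:age=9.0:]").freeze();
        Normalizer2 nfkcCf90 = new FilteredNormalizer2(base, pinned);
        System.out.println(nfkcCf90.normalize("\u216B")); // ROMAN NUMERAL TWELVE -> "xii"
      }
    }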

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
index 4a9df15..c94a023 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
index a9d0673..c3357ef 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
index 1c3de12..1a16f3e 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
index 042fa37..0f2bffe 100644
--- a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
+++ b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
@@ -62,9 +62,9 @@ import java.util.regex.Pattern;
  */
 public class GenerateUTR30DataFiles {
   private static final String ICU_SVN_TAG_URL
-      = "http://source.icu-project.org/repos/icu/tags";
-  private static final String ICU_RELEASE_TAG = "release-60-2";
-  private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
+      = "http://source.icu-project.org/repos/icu/icu/tags";
+  private static final String ICU_RELEASE_TAG = "release-58-1";
+  private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
   private static final String NFC_TXT = "nfc.txt";
   private static final String NFKC_TXT = "nfkc.txt";
   private static final String NFKC_CF_TXT = "nfkc_cf.txt";

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/ivy-versions.properties
----------------------------------------------------------------------
diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties
index a37e306..5ab36dd 100644
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@@ -31,7 +31,7 @@ com.fasterxml.jackson.core.version = 2.5.4
 /com.googlecode.mp4parser/isoparser = 1.1.18
 /com.healthmarketscience.jackcess/jackcess = 2.1.8
 /com.healthmarketscience.jackcess/jackcess-encrypt = 2.1.4
-/com.ibm.icu/icu4j = 60.2
+/com.ibm.icu/icu4j = 59.1
 /com.pff/java-libpst = 0.8.1
 
 com.rometools.version = 1.5.1

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/licenses/icu4j-59.1.jar.sha1
----------------------------------------------------------------------
diff --git a/lucene/licenses/icu4j-59.1.jar.sha1 b/lucene/licenses/icu4j-59.1.jar.sha1
new file mode 100644
index 0000000..f3f0018
--- /dev/null
+++ b/lucene/licenses/icu4j-59.1.jar.sha1
@@ -0,0 +1 @@
+6f06e820cf4c8968bbbaae66ae0b33f6a256b57f

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/lucene/licenses/icu4j-60.2.jar.sha1
----------------------------------------------------------------------
diff --git a/lucene/licenses/icu4j-60.2.jar.sha1 b/lucene/licenses/icu4j-60.2.jar.sha1
deleted file mode 100644
index e613111..0000000
--- a/lucene/licenses/icu4j-60.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-e452cba3caaf93b997ff543c7246a6da74ed70f1

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/solr/licenses/icu4j-59.1.jar.sha1
----------------------------------------------------------------------
diff --git a/solr/licenses/icu4j-59.1.jar.sha1 b/solr/licenses/icu4j-59.1.jar.sha1
new file mode 100644
index 0000000..f3f0018
--- /dev/null
+++ b/solr/licenses/icu4j-59.1.jar.sha1
@@ -0,0 +1 @@
+6f06e820cf4c8968bbbaae66ae0b33f6a256b57f

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cc1efdb4/solr/licenses/icu4j-60.2.jar.sha1
----------------------------------------------------------------------
diff --git a/solr/licenses/icu4j-60.2.jar.sha1 b/solr/licenses/icu4j-60.2.jar.sha1
deleted file mode 100644
index e613111..0000000
--- a/solr/licenses/icu4j-60.2.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-e452cba3caaf93b997ff543c7246a6da74ed70f1


[4/6] lucene-solr:branch_7x: Revert "LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens"

Posted by jp...@apache.org.
Revert "LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens"

This reverts commit c9916e3048e98371f056b96cdbaa996f1f36a2fa.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/314bcfda
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/314bcfda
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/314bcfda

Branch: refs/heads/branch_7x
Commit: 314bcfda6114147d4ba99f626b688701ba4f75e3
Parents: d5a01e0
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Feb 20 14:42:56 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Feb 20 14:42:56 2018 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   2 -
 .../icu/segmentation/BreakIteratorWrapper.java  | 190 +++++++++++++------
 .../segmentation/CompositeBreakIterator.java    |   2 +-
 .../segmentation/DefaultICUTokenizerConfig.java |  18 +-
 .../icu/segmentation/ICUTokenizerConfig.java    |   9 +-
 .../icu/segmentation/ICUTokenizerFactory.java   |   4 +-
 .../icu/segmentation/TestICUTokenizer.java      |  99 +++-------
 .../icu/segmentation/TestICUTokenizerCJK.java   |   9 -
 .../analysis/standard/StandardTokenizer.java    |   5 +-
 9 files changed, 168 insertions(+), 170 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bfa7969..8211cbb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -49,8 +49,6 @@ Improvements
   position sensitive (e.g. part of a phrase) by having an accurate freq.
   (David Smiley)
 
-* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
-
 * LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
   (Ere Maijala)
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
index 9e5050d..d8ecb77 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
@@ -16,84 +16,152 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
+
+import java.text.CharacterIterator;
+
 import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UTF16;
-import com.ibm.icu.text.UnicodeSet;
 
 /**
- * Wraps RuleBasedBreakIterator, making object reuse convenient and 
- * emitting a rule status for emoji sequences.
+ * Contain all the issues surrounding BreakIterators in ICU in one place.
+ * Basically this boils down to the fact that they aren't very friendly to any
+ * sort of OO design.
+ * <p>
+ * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
+ * BreakIterator from RuleBasedBreakIterator
+ * <p>
+ * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
+ * doesn't actually behave as a subclass: it always returns 0 for
+ * getRuleStatus(): 
+ * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
+ * tags
  * @lucene.experimental
  */
-final class BreakIteratorWrapper {
-  private final CharArrayIterator textIterator = new CharArrayIterator();
-  private final RuleBasedBreakIterator rbbi;
-  private char text[];
-  private int start;
-  private int status;
-  
-  BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
-    this.rbbi = rbbi;
-  }
-  
-  int current() {
-    return rbbi.current();
-  }
+abstract class BreakIteratorWrapper {
+  protected final CharArrayIterator textIterator = new CharArrayIterator();
+  protected char text[];
+  protected int start;
+  protected int length;
+
+  abstract int next();
+  abstract int current();
+  abstract int getRuleStatus();
+  abstract void setText(CharacterIterator text);
 
-  int getRuleStatus() {
-    return status;
+  void setText(char text[], int start, int length) {
+    this.text = text;
+    this.start = start;
+    this.length = length;
+    textIterator.setText(text, start, length);
+    setText(textIterator);
   }
 
-  int next() {
-    int current = rbbi.current();
-    int next = rbbi.next();
-    status = calcStatus(current, next);
-    return next;
+  /**
+   * If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
+   * any other BreakIterator, the rulestatus method is not available, so treat
+   * it like a generic BreakIterator.
+   */
+  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
+    if (breakIterator instanceof RuleBasedBreakIterator)
+      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
+    else
+      return new BIWrapper(breakIterator);
   }
-  
-  /** Returns current rule status for the text between breaks. (determines token type) */
-  private int calcStatus(int current, int next) {
-    // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
-    // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
-    if (next != BreakIterator.DONE && isEmoji(current, next)) {
-      return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
-    } else {
+
+  /**
+   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
+   * a DictionaryBasedBreakIterator) behaves correctly.
+   */
+  static final class RBBIWrapper extends BreakIteratorWrapper {
+    private final RuleBasedBreakIterator rbbi;
+
+    RBBIWrapper(RuleBasedBreakIterator rbbi) {
+      this.rbbi = rbbi;
+    }
+
+    @Override
+    int current() {
+      return rbbi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
       return rbbi.getRuleStatus();
     }
+
+    @Override
+    int next() {
+      return rbbi.next();
+    }
+
+    @Override
+    void setText(CharacterIterator text) {
+      rbbi.setText(text);
+    }
   }
-  
-  // See unicode doc L2/16-315 and also the RBBI rules for rationale.
-  // we don't include regional indicators here, because they aren't ambiguous for tagging,
-  // they need only be treated special for segmentation.
-  static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
-
-  /** Returns true if the current text represents emoji character or sequence */
-  private boolean isEmoji(int current, int next) {
-    int begin = start + current;
-    int end = start + next;
-    int codepoint = UTF16.charAt(text, 0, end, begin);
-    // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
-    if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
-      if (EMOJI_RK.contains(codepoint)) {
-        // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
-        // an emoji presentation selector or keycap follows.
-        int trailer = begin + Character.charCount(codepoint);
-        return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
-      } else {
-        return true;
+
+  /**
+   * Generic BreakIterator wrapper: Either the rulestatus method is not
+   * available or always returns 0. Calculate a rulestatus here so it behaves
+   * like RuleBasedBreakIterator.
+   * 
+   * Note: This is slower than RuleBasedBreakIterator.
+   */
+  static final class BIWrapper extends BreakIteratorWrapper {
+    private final BreakIterator bi;
+    private int status;
+
+    BIWrapper(BreakIterator bi) {
+      this.bi = bi;
+    }
+
+    @Override
+    int current() {
+      return bi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
+      return status;
+    }
+
+    @Override
+    int next() {
+      int current = bi.current();
+      int next = bi.next();
+      status = calcStatus(current, next);
+      return next;
+    }
+
+    private int calcStatus(int current, int next) {
+      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
+        return RuleBasedBreakIterator.WORD_NONE;
+
+      int begin = start + current;
+      int end = start + next;
+
+      int codepoint;
+      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
+        codepoint = UTF16.charAt(text, 0, end, begin);
+
+        if (UCharacter.isDigit(codepoint))
+          return RuleBasedBreakIterator.WORD_NUMBER;
+        else if (UCharacter.isLetter(codepoint)) {
+          // TODO: try to separately specify ideographic, kana? 
+          // [currently all bundled as letter for this case]
+          return RuleBasedBreakIterator.WORD_LETTER;
+        }
       }
+
+      return RuleBasedBreakIterator.WORD_NONE;
     }
-    return false;
-  }
 
-  void setText(char text[], int start, int length) {
-    this.text = text;
-    this.start = start;
-    textIterator.setText(text, start, length);
-    rbbi.setText(textIterator);
-    status = RuleBasedBreakIterator.WORD_NONE;
+    @Override
+    void setText(CharacterIterator text) {
+      bi.setText(text);
+      status = RuleBasedBreakIterator.WORD_NONE;
+    }
   }
 }
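
Callers in this package drive the restored wrapper roughly as below. A sketch only: the class is package-private, so this compiles only inside the segmentation package, and getWordInstance() stands in for the module's compiled break data.

    package org.apache.lucene.analysis.icu.segmentation;

    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.text.RuleBasedBreakIterator;

    class WrapperUsageSketch {
      public static void main(String[] args) {
        char[] text = "test 1234".toCharArray();
        // wrap() picks RBBIWrapper or BIWrapper; the caller only sees
        // next()/current()/getRuleStatus().
        BreakIteratorWrapper w = BreakIteratorWrapper.wrap(BreakIterator.getWordInstance());
        w.setText(text, 0, text.length);
        int start = w.current();
        for (int end = w.next(); end != BreakIterator.DONE; start = end, end = w.next()) {
          if (w.getRuleStatus() != RuleBasedBreakIterator.WORD_NONE) { // skip non-tokens
            System.out.println(new String(text, start, end - start));
          }
        }
      }
    }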

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
index 3cb39ed..096eada 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
@@ -123,7 +123,7 @@ final class CompositeBreakIterator {
   
   private BreakIteratorWrapper getBreakIterator(int scriptCode) {
     if (wordBreakers[scriptCode] == null)
-      wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode));
+      wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
     return wordBreakers[scriptCode];
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
index 10e6c67..50a6b4c 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
@@ -52,8 +52,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   /** Token type for words that appear to be numbers */
   public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
-  /** Token type for words that appear to be emoji sequences */
-  public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI];
   
   /*
    * the default breakiterators in use. these can be expensive to
@@ -67,9 +65,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
 
   // the same as ROOT, except no dictionary segmentation for cjk
-  private static final RuleBasedBreakIterator defaultBreakIterator = 
+  private static final BreakIterator defaultBreakIterator = 
     readBreakIterator("Default.brk");
-  private static final RuleBasedBreakIterator myanmarSyllableIterator = 
+  private static final BreakIterator myanmarSyllableIterator = 
     readBreakIterator("MyanmarSyllable.brk");
   
   // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
@@ -97,16 +95,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   }
 
   @Override
-  public RuleBasedBreakIterator getBreakIterator(int script) {
+  public BreakIterator getBreakIterator(int script) {
     switch(script) {
-      case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone();
+      case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
       case UScript.MYANMAR: 
         if (myanmarAsWords) {
-          return (RuleBasedBreakIterator)defaultBreakIterator.clone();
+          return (BreakIterator)defaultBreakIterator.clone();
         } else {
-          return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
+          return (BreakIterator)myanmarSyllableIterator.clone();
         }
-      default: return (RuleBasedBreakIterator)defaultBreakIterator.clone();
+      default: return (BreakIterator)defaultBreakIterator.clone();
     }
   }
 
@@ -121,8 +119,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
         return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
       case RuleBasedBreakIterator.WORD_NUMBER:
         return WORD_NUMBER;
-      case EMOJI_SEQUENCE_STATUS:
-        return WORD_EMOJI;
       default: /* some other custom code */
         return "<OTHER>";
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
index e2d3dae..69694fc 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
@@ -16,7 +16,8 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-import com.ibm.icu.text.RuleBasedBreakIterator;
+
+import com.ibm.icu.text.BreakIterator;
 
 /**
  * Class that allows for tailored Unicode Text Segmentation on
@@ -24,16 +25,14 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
  * @lucene.experimental
  */
 public abstract class ICUTokenizerConfig {
-  /** Rule status for emoji sequences */
-  public static final int EMOJI_SEQUENCE_STATUS = 299;
-
+  
   /**
    * Sole constructor. (For invocation by subclass 
    * constructors, typically implicit.)
    */
   public ICUTokenizerConfig() {}
   /** Return a breakiterator capable of processing a given script. */
-  public abstract RuleBasedBreakIterator getBreakIterator(int script);
+  public abstract BreakIterator getBreakIterator(int script);
   /** Return a token type value for a given script and BreakIterator
    *  rule status. */
   public abstract String getType(int script, int ruleStatus);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
index 0cd4cf2..4d29b0c 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
@@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
       config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
         
         @Override
-        public RuleBasedBreakIterator getBreakIterator(int script) {
+        public BreakIterator getBreakIterator(int script) {
           if (breakers[script] != null) {
-            return (RuleBasedBreakIterator) breakers[script].clone();
+            return (BreakIterator) breakers[script].clone();
           } else {
             return super.getBreakIterator(script);
           }
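
For reference, constructing the factory programmatically looks roughly like this. The rulefiles value follows the factory's documented "script code : RBBI rules resource" convention, but the resource name and loader anchor here are placeholders:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory;
    import org.apache.lucene.analysis.util.ClasspathResourceLoader;

    public class FactorySketch {
      public static void main(String[] args) throws Exception {
        Map<String, String> params = new HashMap<>();
        params.put("rulefiles", "Cyrl:KeepNumbersOnly.rbbi"); // per-script RBBI override
        ICUTokenizerFactory factory = new ICUTokenizerFactory(params);
        factory.inform(new ClasspathResourceLoader(FactorySketch.class)); // compiles the rules
        Tokenizer tokenizer = factory.create();
      }
    }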

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index 9893975..027baa3 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -16,10 +16,13 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
 
 import com.ibm.icu.lang.UScript;
@@ -73,7 +76,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
-        return new TokenStreamComponents(tokenizer);
+        TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
+        return new TokenStreamComponents(tokenizer, filter);
       }
     };
   }
@@ -86,8 +90,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testArmenian() throws Exception {
     assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
-        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
-        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
+        new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
+        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
   }
   
   public void testAmharic() throws Exception {
@@ -98,12 +102,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   public void testArabic() throws Exception {
     assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
         new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
-        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } ); 
+        "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } ); 
   }
   
   public void testAramaic() throws Exception {
     assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
-        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
         "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
   }
   
@@ -121,7 +125,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   
   public void testGreek() throws Exception {
     assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
-        new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+        new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
         "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
   }
   
@@ -152,7 +156,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
    */
   public void testChinese() throws Exception {
     assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
-        new String[] { "我", "是", "中", "国", "人", "1234", "Tests"});
+        new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
   }
   
   public void testHebrew() throws Exception {
@@ -182,8 +186,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   /* Tests from StandardAnalyzer, just to show behavior is similar */
   public void testAlphanumericSA() throws Exception {
     // alphanumeric tokens
-    assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
-    assertAnalyzesTo(a, "2B", new String[]{"2B"});
+    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+    assertAnalyzesTo(a, "2B", new String[]{"2b"});
   }
 
   public void testDelimitersSA() throws Exception {
@@ -195,34 +199,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testApostrophesSA() throws Exception {
     // internal apostrophes: O'Reilly, you're, O'Reilly's
-    assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
     assertAnalyzesTo(a, "you're", new String[]{"you're"});
     assertAnalyzesTo(a, "she's", new String[]{"she's"});
-    assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
     assertAnalyzesTo(a, "don't", new String[]{"don't"});
-    assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
   }
 
   public void testNumericSA() throws Exception {
     // floating point, serial, model numbers, ip addresses, etc.
     // every other segment must have at least one digit
     assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
-    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
   }
 
   public void testTextWithNumbersSA() throws Exception {
     // numbers
-    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
   }
 
   public void testVariousTextSA() throws Exception {
     // various
-    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
-    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
-    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
-    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
   }
 
   public void testKoreanSA() throws Exception {
@@ -238,14 +242,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   
   public void testOffsets() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones", 
-        new String[] {"David", "has", "5000", "bones"},
+        new String[] {"david", "has", "5000", "bones"},
         new int[] {0, 6, 10, 15},
         new int[] {5, 9, 14, 20});
   }
   
   public void testTypes() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones", 
-        new String[] {"David", "has", "5000", "bones"},
+        new String[] {"david", "has", "5000", "bones"},
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
   }
   
@@ -261,61 +265,6 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
   }
   
-  /** simple emoji */
-  public void testEmoji() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
-        new String[] { "💩", "💩", "💩" },
-        new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
-  }
- 
-  /** emoji zwj sequence */
-  public void testEmojiSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
-        new String[] { "👩‍❤️‍👩" },
-        new String[] { "<EMOJI>" });
-  }
-  
-  /** emoji zwj sequence with fitzpatrick modifier */
-  public void testEmojiSequenceWithModifier() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
-        new String[] { "👨🏼‍⚕️" },
-        new String[] { "<EMOJI>" });
-  }
-  
-  /** regional indicator */
-  public void testEmojiRegionalIndicator() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
-        new String[] { "🇺🇸", "🇺🇸" },
-        new String[] { "<EMOJI>", "<EMOJI>" });
-  }
-  
-  /** variation sequence */
-  public void testEmojiVariationSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
-        new String[] { "#️⃣" },
-        new String[] { "<EMOJI>" });
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
-        new String[] { "3️⃣",},
-        new String[] { "<EMOJI>" });
-  }
-
-  public void testEmojiTagSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
-        new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
-        new String[] { "<EMOJI>" });
-  }
-  
-  public void testEmojiTokenization() throws Exception {
-    // simple emoji around latin
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
-        new String[] { "poo", "💩", "poo" },
-        new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
-    // simple emoji around non-latin
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
-        new String[] { "💩", "中", "國", "💩" },
-        new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
-  }
-  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
index d93a810..75481f1 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
@@ -78,15 +78,6 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
     );
   }
   
-  /**
-   * dictionary segmentation with emoji
-   */
-  public void testSimpleJapaneseWithEmoji() throws Exception {
-    assertAnalyzesTo(a, "それはまだ実験段階にあります💩",
-        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます", "💩"  }
-    );
-  }
-  
   public void testJapaneseTypes() throws Exception {
     assertAnalyzesTo(a, "仮名遣い カタカナ",
         new String[] { "仮名遣い", "カタカナ" },

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/314bcfda/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index 50d1f9f..0410124 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -54,8 +54,6 @@ public final class StandardTokenizer extends Tokenizer {
   public static final int KATAKANA = 5;
   /** Hangul token type */
   public static final int HANGUL = 6;
-  /** Emoji token type. */
-  public static final int EMOJI = 7;
   
   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -65,8 +63,7 @@ public final class StandardTokenizer extends Tokenizer {
     "<IDEOGRAPHIC>",
     "<HIRAGANA>",
     "<KATAKANA>",
-    "<HANGUL>",
-    "<EMOJI>"
+    "<HANGUL>"
   };
   
   /** Absolute maximum sized token */
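
After the revert the int constants and the TOKEN_TYPES table line up as seven entries (0-6) again; the lookup that consumers such as DefaultICUTokenizerConfig rely on is simply (illustrative):

    import org.apache.lucene.analysis.standard.StandardTokenizer;

    public class TokenTypesSketch {
      public static void main(String[] args) {
        // Int token-type constants index into the parallel TOKEN_TYPES table.
        System.out.println(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]);    // <NUM>
        System.out.println(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]); // <HANGUL>
        // EMOJI and "<EMOJI>" no longer exist on this branch.
      }
    }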


[5/6] lucene-solr:branch_7x: Revert "LUCENE-8122: Update autogenerated code after update to ICU4J 60.2"

Posted by jp...@apache.org.
Revert "LUCENE-8122: Updata autogenerated code after update to ICU4J 60.2"

This reverts commit b3677c1a091209409590de3ec6bafde089323598.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/5ad93d4d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/5ad93d4d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/5ad93d4d

Branch: refs/heads/branch_7x
Commit: 5ad93d4df621c6ae19f0ac7b6c72366ef24cd082
Parents: 314bcfd
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Feb 20 14:42:58 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Feb 20 14:42:58 2018 +0100

----------------------------------------------------------------------
 .../lucene/analysis/util/UnicodeProps.java      | 116 +++++++++----------
 1 file changed, 58 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5ad93d4d/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
index 254977f..00ee311 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
@@ -1,58 +1,58 @@
-// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.util;
-
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.SparseFixedBitSet;
-
-/**
- * This file contains unicode properties used by various {@link CharTokenizer}s.
- * The data was created using ICU4J v60.2.0.0
- * <p>
- * Unicode version: 10.0.0.0
- */
-public final class UnicodeProps {
-  private UnicodeProps() {}
-  
-  /** Unicode version that was used to generate this file: {@value} */
-  public static final String UNICODE_VERSION = "10.0.0.0";
-  
-  /** Bitset with Unicode WHITESPACE code points. */
-  public static final Bits WHITESPACE = createBits(
-    0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 
-    0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
-  
-  private static Bits createBits(final int... codepoints) {
-    final int len = codepoints[codepoints.length - 1] + 1;
-    final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
-    for (int i : codepoints) bitset.set(i);
-    return new Bits() {
-      @Override
-      public boolean get(int index) {
-        return index < len && bitset.get(index);
-      }
-      
-      @Override
-      public int length() {
-        return 0x10FFFF + 1;
-      }
-    };
-  }
-}
+// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.util;
+
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
+
+/**
+ * This file contains unicode properties used by various {@link CharTokenizer}s.
+ * The data was created using ICU4J v59.1.0.0
+ * <p>
+ * Unicode version: 9.0.0.0
+ */
+public final class UnicodeProps {
+  private UnicodeProps() {}
+  
+  /** Unicode version that was used to generate this file: {@value} */
+  public static final String UNICODE_VERSION = "9.0.0.0";
+  
+  /** Bitset with Unicode WHITESPACE code points. */
+  public static final Bits WHITESPACE = createBits(
+    0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 
+    0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
+  
+  private static Bits createBits(final int... codepoints) {
+    final int len = codepoints[codepoints.length - 1] + 1;
+    final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
+    for (int i : codepoints) bitset.set(i);
+    return new Bits() {
+      @Override
+      public boolean get(int index) {
+        return index < len && bitset.get(index);
+      }
+      
+      @Override
+      public int length() {
+        return 0x10FFFF + 1;
+      }
+    };
+  }
+}
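
Note that the whitespace code point list itself is identical before and after this revert; only the recorded ICU4J (60.2 back to 59.1) and Unicode (10.0 back to 9.0) versions change. The file's shape is also unchanged: WHITESPACE is a Bits view over the full Unicode range, backed by a SparseFixedBitSet sized only up to the highest whitespace code point, with get() returning false above that. A minimal usage sketch:

  import org.apache.lucene.analysis.util.UnicodeProps;

  class WhitespaceCheck {
    public static void main(String[] args) {
      System.out.println(UnicodeProps.WHITESPACE.get(0x3000)); // true: IDEOGRAPHIC SPACE
      System.out.println(UnicodeProps.WHITESPACE.get('A'));    // false: not whitespace
      System.out.println(UnicodeProps.WHITESPACE.length());    // 0x110000, full Unicode range
    }
  }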


[2/6] lucene-solr:master: Revert "LUCENE-8122: Update autogenerated code after update to ICU4J 60.2"

Posted by jp...@apache.org.
Revert "LUCENE-8122: Updata autogenerated code after update to ICU4J 60.2"

This reverts commit d99bfa4bdb3442581bd9559b289887a8bc44c957.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/9a7b56b9
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/9a7b56b9
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/9a7b56b9

Branch: refs/heads/master
Commit: 9a7b56b9df3c8878c8faafda9cccc1e8ed7db983
Parents: fafbb26
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Feb 20 14:40:01 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Feb 20 14:40:01 2018 +0100

----------------------------------------------------------------------
 .../lucene/analysis/util/UnicodeProps.java      | 116 +++++++++----------
 1 file changed, 58 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/9a7b56b9/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
index 254977f..00ee311 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
@@ -1,58 +1,58 @@
-// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.util;
-
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.SparseFixedBitSet;
-
-/**
- * This file contains unicode properties used by various {@link CharTokenizer}s.
- * The data was created using ICU4J v60.2.0.0
- * <p>
- * Unicode version: 10.0.0.0
- */
-public final class UnicodeProps {
-  private UnicodeProps() {}
-  
-  /** Unicode version that was used to generate this file: {@value} */
-  public static final String UNICODE_VERSION = "10.0.0.0";
-  
-  /** Bitset with Unicode WHITESPACE code points. */
-  public static final Bits WHITESPACE = createBits(
-    0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 
-    0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
-  
-  private static Bits createBits(final int... codepoints) {
-    final int len = codepoints[codepoints.length - 1] + 1;
-    final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
-    for (int i : codepoints) bitset.set(i);
-    return new Bits() {
-      @Override
-      public boolean get(int index) {
-        return index < len && bitset.get(index);
-      }
-      
-      @Override
-      public int length() {
-        return 0x10FFFF + 1;
-      }
-    };
-  }
-}
+// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.util;
+
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
+
+/**
+ * This file contains unicode properties used by various {@link CharTokenizer}s.
+ * The data was created using ICU4J v59.1.0.0
+ * <p>
+ * Unicode version: 9.0.0.0
+ */
+public final class UnicodeProps {
+  private UnicodeProps() {}
+  
+  /** Unicode version that was used to generate this file: {@value} */
+  public static final String UNICODE_VERSION = "9.0.0.0";
+  
+  /** Bitset with Unicode WHITESPACE code points. */
+  public static final Bits WHITESPACE = createBits(
+    0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 
+    0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
+  
+  private static Bits createBits(final int... codepoints) {
+    final int len = codepoints[codepoints.length - 1] + 1;
+    final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
+    for (int i : codepoints) bitset.set(i);
+    return new Bits() {
+      @Override
+      public boolean get(int index) {
+        return index < len && bitset.get(index);
+      }
+      
+      @Override
+      public int length() {
+        return 0x10FFFF + 1;
+      }
+    };
+  }
+}