You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2018/01/14 16:03:52 UTC
lucene-solr:branch_7x: LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x 6f85f17bb -> c9916e304


LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c9916e30
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c9916e30
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c9916e30

Branch: refs/heads/branch_7x
Commit: c9916e3048e98371f056b96cdbaa996f1f36a2fa
Parents: 6f85f17
Author: Robert Muir <rm...@apache.org>
Authored: Sun Jan 14 10:53:51 2018 -0500
Committer: Robert Muir <rm...@apache.org>
Committed: Sun Jan 14 10:54:26 2018 -0500

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   2 +
 .../icu/segmentation/BreakIteratorWrapper.java  | 190 ++++++-------------
 .../segmentation/CompositeBreakIterator.java    |   2 +-
 .../segmentation/DefaultICUTokenizerConfig.java |  18 +-
 .../icu/segmentation/ICUTokenizerConfig.java    |   9 +-
 .../icu/segmentation/ICUTokenizerFactory.java   |   4 +-
 .../icu/segmentation/TestICUTokenizer.java      |  99 +++++++---
 .../icu/segmentation/TestICUTokenizerCJK.java   |   9 +
 .../analysis/standard/StandardTokenizer.java    |   5 +-
 9 files changed, 170 insertions(+), 168 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f65ccb8..98c5f92 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -45,6 +45,8 @@ Improvements
   position sensitive (e.g. part of a phrase) by having an accurate freq.
   (David Smiley)
 
+* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
+
 Bug Fixes
 
 * LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
index d8ecb77..9e5050d 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
@@ -16,152 +16,84 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-
-import java.text.CharacterIterator;
-
 import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
 
 /**
- * Contain all the issues surrounding BreakIterators in ICU in one place.
- * Basically this boils down to the fact that they aren't very friendly to any
- * sort of OO design.
- * <p>
- * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
- * BreakIterator from RuleBasedBreakIterator
- * <p>
- * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
- * doesn't actually behave as a subclass: it always returns 0 for
- * getRuleStatus(): 
- * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
- * tags
+ * Wraps RuleBasedBreakIterator, making object reuse convenient and 
+ * emitting a rule status for emoji sequences.
  * @lucene.experimental
  */
-abstract class BreakIteratorWrapper {
-  protected final CharArrayIterator textIterator = new CharArrayIterator();
-  protected char text[];
-  protected int start;
-  protected int length;
-
-  abstract int next();
-  abstract int current();
-  abstract int getRuleStatus();
-  abstract void setText(CharacterIterator text);
-
-  void setText(char text[], int start, int length) {
-    this.text = text;
-    this.start = start;
-    this.length = length;
-    textIterator.setText(text, start, length);
-    setText(textIterator);
+final class BreakIteratorWrapper {
+  private final CharArrayIterator textIterator = new CharArrayIterator();
+  private final RuleBasedBreakIterator rbbi;
+  private char text[];
+  private int start;
+  private int status;
+  
+  BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
+    this.rbbi = rbbi;
   }
-
-  /**
-   * If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
-   * any other BreakIterator, the rulestatus method is not available, so treat
-   * it like a generic BreakIterator.
-   */
-  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
-    if (breakIterator instanceof RuleBasedBreakIterator)
-      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
-    else
-      return new BIWrapper(breakIterator);
+  
+  int current() {
+    return rbbi.current();
   }
 
-  /**
-   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
-   * a DictionaryBasedBreakIterator) behaves correctly.
-   */
-  static final class RBBIWrapper extends BreakIteratorWrapper {
-    private final RuleBasedBreakIterator rbbi;
-
-    RBBIWrapper(RuleBasedBreakIterator rbbi) {
-      this.rbbi = rbbi;
-    }
-
-    @Override
-    int current() {
-      return rbbi.current();
-    }
+  int getRuleStatus() {
+    return status;
+  }
 
-    @Override
-    int getRuleStatus() {
+  int next() {
+    int current = rbbi.current();
+    int next = rbbi.next();
+    status = calcStatus(current, next);
+    return next;
+  }
+  
+  /** Returns the rule status for the text between the given breaks (this determines the token type). */
+  private int calcStatus(int current, int next) {
+    // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
+    // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
+    if (next != BreakIterator.DONE && isEmoji(current, next)) {
+      return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
+    } else {
       return rbbi.getRuleStatus();
     }
-
-    @Override
-    int next() {
-      return rbbi.next();
-    }
-
-    @Override
-    void setText(CharacterIterator text) {
-      rbbi.setText(text);
-    }
   }
-
-  /**
-   * Generic BreakIterator wrapper: Either the rulestatus method is not
-   * available or always returns 0. Calculate a rulestatus here so it behaves
-   * like RuleBasedBreakIterator.
-   * 
-   * Note: This is slower than RuleBasedBreakIterator.
-   */
-  static final class BIWrapper extends BreakIteratorWrapper {
-    private final BreakIterator bi;
-    private int status;
-
-    BIWrapper(BreakIterator bi) {
-      this.bi = bi;
-    }
-
-    @Override
-    int current() {
-      return bi.current();
-    }
-
-    @Override
-    int getRuleStatus() {
-      return status;
-    }
-
-    @Override
-    int next() {
-      int current = bi.current();
-      int next = bi.next();
-      status = calcStatus(current, next);
-      return next;
-    }
-
-    private int calcStatus(int current, int next) {
-      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
-        return RuleBasedBreakIterator.WORD_NONE;
-
-      int begin = start + current;
-      int end = start + next;
-
-      int codepoint;
-      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
-        codepoint = UTF16.charAt(text, 0, end, begin);
-
-        if (UCharacter.isDigit(codepoint))
-          return RuleBasedBreakIterator.WORD_NUMBER;
-        else if (UCharacter.isLetter(codepoint)) {
-          // TODO: try to separately specify ideographic, kana? 
-          // [currently all bundled as letter for this case]
-          return RuleBasedBreakIterator.WORD_LETTER;
-        }
+  
+  // See Unicode doc L2/16-315 and also the RBBI rules for rationale.
+  // We don't include regional indicators here, because they aren't ambiguous for tagging;
+  // they need only be treated specially for segmentation.
+  static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
+
+  /** Returns true if the current text represents an emoji character or an emoji sequence */
+  private boolean isEmoji(int current, int next) {
+    int begin = start + current;
+    int end = start + next;
+    int codepoint = UTF16.charAt(text, 0, end, begin);
+    // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
+    if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
+      if (EMOJI_RK.contains(codepoint)) {
+        // if it's in EmojiRK, we don't treat it as emoji unless there is evidence it forms an emoji sequence:
+        // i.e. an emoji presentation selector (U+FE0F) or keycap (U+20E3) follows.
+        int trailer = begin + Character.charCount(codepoint);
+        return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
+      } else {
+        return true;
       }
-
-      return RuleBasedBreakIterator.WORD_NONE;
     }
+    return false;
+  }
 
-    @Override
-    void setText(CharacterIterator text) {
-      bi.setText(text);
-      status = RuleBasedBreakIterator.WORD_NONE;
-    }
+  void setText(char text[], int start, int length) {
+    this.text = text;
+    this.start = start;
+    textIterator.setText(text, start, length);
+    rbbi.setText(textIterator);
+    status = RuleBasedBreakIterator.WORD_NONE;
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
index 096eada..3cb39ed 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
@@ -123,7 +123,7 @@ final class CompositeBreakIterator {
   
   private BreakIteratorWrapper getBreakIterator(int scriptCode) {
     if (wordBreakers[scriptCode] == null)
-      wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
+      wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode));
     return wordBreakers[scriptCode];
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
index 50a6b4c..10e6c67 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
@@ -52,6 +52,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   /** Token type for words that appear to be numbers */
   public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
+  /** Token type for words that appear to be emoji sequences */
+  public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI];
   
   /*
    * the default breakiterators in use. these can be expensive to
@@ -65,9 +67,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
 
   // the same as ROOT, except no dictionary segmentation for cjk
-  private static final BreakIterator defaultBreakIterator = 
+  private static final RuleBasedBreakIterator defaultBreakIterator = 
     readBreakIterator("Default.brk");
-  private static final BreakIterator myanmarSyllableIterator = 
+  private static final RuleBasedBreakIterator myanmarSyllableIterator = 
     readBreakIterator("MyanmarSyllable.brk");
   
   // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
@@ -95,16 +97,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   }
 
   @Override
-  public BreakIterator getBreakIterator(int script) {
+  public RuleBasedBreakIterator getBreakIterator(int script) {
     switch(script) {
-      case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
+      case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone();
       case UScript.MYANMAR: 
         if (myanmarAsWords) {
-          return (BreakIterator)defaultBreakIterator.clone();
+          return (RuleBasedBreakIterator)defaultBreakIterator.clone();
         } else {
-          return (BreakIterator)myanmarSyllableIterator.clone();
+          return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
         }
-      default: return (BreakIterator)defaultBreakIterator.clone();
+      default: return (RuleBasedBreakIterator)defaultBreakIterator.clone();
     }
   }
 
@@ -119,6 +121,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
         return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
       case RuleBasedBreakIterator.WORD_NUMBER:
         return WORD_NUMBER;
+      case EMOJI_SEQUENCE_STATUS:
+        return WORD_EMOJI;
       default: /* some other custom code */
         return "<OTHER>";
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
index 69694fc..e2d3dae 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
@@ -16,8 +16,7 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-
-import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
 
 /**
  * Class that allows for tailored Unicode Text Segmentation on
@@ -25,14 +24,16 @@ import com.ibm.icu.text.BreakIterator;
  * @lucene.experimental
  */
 public abstract class ICUTokenizerConfig {
-  
+  /** Rule status for emoji sequences */
+  public static final int EMOJI_SEQUENCE_STATUS = 299;
+
   /**
    * Sole constructor. (For invocation by subclass 
    * constructors, typically implicit.)
    */
   public ICUTokenizerConfig() {}
   /** Return a breakiterator capable of processing a given script. */
-  public abstract BreakIterator getBreakIterator(int script);
+  public abstract RuleBasedBreakIterator getBreakIterator(int script);
   /** Return a token type value for a given script and BreakIterator
    *  rule status. */
   public abstract String getType(int script, int ruleStatus);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
index 4d29b0c..0cd4cf2 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
@@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
       config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
         
         @Override
-        public BreakIterator getBreakIterator(int script) {
+        public RuleBasedBreakIterator getBreakIterator(int script) {
           if (breakers[script] != null) {
-            return (BreakIterator) breakers[script].clone();
+            return (RuleBasedBreakIterator) breakers[script].clone();
           } else {
             return super.getBreakIterator(script);
           }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index 027baa3..9893975 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -16,13 +16,10 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
 
 import com.ibm.icu.lang.UScript;
@@ -76,8 +73,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
-        TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
-        return new TokenStreamComponents(tokenizer, filter);
+        return new TokenStreamComponents(tokenizer);
       }
     };
   }
@@ -90,8 +86,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testArmenian() throws Exception {
     assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
-        new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
-        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
+        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", 
+        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
   }
   
   public void testAmharic() throws Exception {
@@ -102,12 +98,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   public void testArabic() throws Exception {
     assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
         new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
-        "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } ); 
+        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } ); 
   }
   
   public void testAramaic() throws Exception {
     assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
-        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
         "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
   }
   
@@ -125,7 +121,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   
   public void testGreek() throws Exception {
     assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
-        new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+        new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
         "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
   }
   
@@ -156,7 +152,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
    */
   public void testChinese() throws Exception {
     assertAnalyzesTo(a, "我是中国人。 １２３４ Ｔｅｓｔｓ ",
-        new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
+        new String[] { "我", "是", "中", "国", "人", "１２３４", "Ｔｅｓｔｓ"});
   }
   
   public void testHebrew() throws Exception {
@@ -186,8 +182,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   /* Tests from StandardAnalyzer, just to show behavior is similar */
   public void testAlphanumericSA() throws Exception {
     // alphanumeric tokens
-    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
-    assertAnalyzesTo(a, "2B", new String[]{"2b"});
+    assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
+    assertAnalyzesTo(a, "2B", new String[]{"2B"});
   }
 
   public void testDelimitersSA() throws Exception {
@@ -199,34 +195,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testApostrophesSA() throws Exception {
     // internal apostrophes: O'Reilly, you're, O'Reilly's
-    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
     assertAnalyzesTo(a, "you're", new String[]{"you're"});
     assertAnalyzesTo(a, "she's", new String[]{"she's"});
-    assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
     assertAnalyzesTo(a, "don't", new String[]{"don't"});
-    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
   }
 
   public void testNumericSA() throws Exception {
     // floating point, serial, model numbers, ip addresses, etc.
     // every other segment must have at least one digit
     assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
-    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
   }
 
   public void testTextWithNumbersSA() throws Exception {
     // numbers
-    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
   }
 
   public void testVariousTextSA() throws Exception {
     // various
-    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
-    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
-    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
-    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
   }
 
   public void testKoreanSA() throws Exception {
@@ -242,14 +238,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   
   public void testOffsets() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones", 
-        new String[] {"david", "has", "5000", "bones"},
+        new String[] {"David", "has", "5000", "bones"},
         new int[] {0, 6, 10, 15},
         new int[] {5, 9, 14, 20});
   }
   
   public void testTypes() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones", 
-        new String[] {"david", "has", "5000", "bones"},
+        new String[] {"David", "has", "5000", "bones"},
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
   }
   
@@ -265,6 +261,61 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
   }
   
+  /** simple emoji */
+  public void testEmoji() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
+        new String[] { "💩", "💩", "💩" },
+        new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
+  }
+ 
+  /** emoji zwj sequence */
+  public void testEmojiSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
+        new String[] { "👩‍❤️‍👩" },
+        new String[] { "<EMOJI>" });
+  }
+  
+  /** emoji zwj sequence with fitzpatrick modifier */
+  public void testEmojiSequenceWithModifier() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
+        new String[] { "👨🏼‍⚕️" },
+        new String[] { "<EMOJI>" });
+  }
+  
+  /** regional indicator */
+  public void testEmojiRegionalIndicator() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
+        new String[] { "🇺🇸", "🇺🇸" },
+        new String[] { "<EMOJI>", "<EMOJI>" });
+  }
+  
+  /** variation sequence */
+  public void testEmojiVariationSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
+        new String[] { "#️⃣" },
+        new String[] { "<EMOJI>" });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
+        new String[] { "3️⃣",},
+        new String[] { "<EMOJI>" });
+  }
+
+  public void testEmojiTagSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
+        new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
+        new String[] { "<EMOJI>" });
+  }
+  
+  public void testEmojiTokenization() throws Exception {
+    // simple emoji around latin
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
+        new String[] { "poo", "💩", "poo" },
+        new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
+    // simple emoji around non-latin
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
+        new String[] { "💩", "中", "國", "💩" },
+        new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
index 75481f1..d93a810 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
@@ -78,6 +78,15 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
     );
   }
   
+  /**
+   * dictionary segmentation with emoji
+   */
+  public void testSimpleJapaneseWithEmoji() throws Exception {
+    assertAnalyzesTo(a, "それはまだ実験段階にあります💩",
+        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます", "💩"  }
+    );
+  }
+  
   public void testJapaneseTypes() throws Exception {
     assertAnalyzesTo(a, "仮名遣い カタカナ",
         new String[] { "仮名遣い", "カタカナ" },

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c9916e30/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index 0410124..50d1f9f 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -54,6 +54,8 @@ public final class StandardTokenizer extends Tokenizer {
   public static final int KATAKANA = 5;
   /** Hangul token type */
   public static final int HANGUL = 6;
+  /** Emoji token type. */
+  public static final int EMOJI = 7;
   
   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -63,7 +65,8 @@ public final class StandardTokenizer extends Tokenizer {
     "<IDEOGRAPHIC>",
     "<HIRAGANA>",
     "<KATAKANA>",
-    "<HANGUL>"
+    "<HANGUL>",
+    "<EMOJI>"
   };
   
   /** Absolute maximum sized token */