You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2011/01/06 14:51:11 UTC

svn commit: r1055877 [3/3] - in /lucene/dev/trunk/modules/analysis: ./ common/ common/src/java/org/apache/lucene/analysis/standard/ common/src/test/org/apache/lucene/analysis/core/ icu/ icu/src/tools/java/org/apache/lucene/analysis/icu/

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1055877&r1=1055876&r2=1055877&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Thu Jan  6 13:51:10 2011
@@ -45,14 +45,6 @@ import org.apache.lucene.util.AttributeS
  *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
  * </ul>
- * <b>WARNING</b>: Because JFlex does not support Unicode supplementary 
- * characters (characters above the Basic Multilingual Plane, which contains
- * those up to and including U+FFFF), this scanner will not recognize them
- * properly.  If you need to be able to process text containing supplementary 
- * characters, consider using the ICU4J-backed implementation in modules/analysis/icu  
- * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
- * instead of this class, since the ICU4J-backed implementation does not have
- * this limitation.
  */
 %%
 
@@ -70,15 +62,30 @@ import org.apache.lucene.util.AttributeS
   super(in);
 %init}
 
+
+%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
+Format =  ([\p{WB:Format}] | {FormatSupp})
+Numeric = ([\p{WB:Numeric}] | {NumericSupp})
+Extend =  ([\p{WB:Extend}] | {ExtendSupp})
+Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
+MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
+MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
+MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
+ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
+ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
+Han = ([\p{Script:Han}] | {HanSupp})
+Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
-ALetterEx      = \p{WB:ALetter}                     [\p{WB:Format}\p{WB:Extend}]*
+ALetterEx      = {ALetter}                     ({Format} | {Extend})*
 // TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
-NumericEx      = [\p{WB:Numeric}\uFF10-\uFF19]      [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx     = \p{WB:Katakana}                    [\p{WB:Format}\p{WB:Extend}]* 
-MidLetterEx    = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]* 
-MidNumericEx   = [\p{WB:MidNum}\p{WB:MidNumLet}]    [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]*
+NumericEx      = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
+KatakanaEx     = {Katakana}                    ({Format} | {Extend})* 
+MidLetterEx    = ({MidLetter} | {MidNumLet})   ({Format} | {Extend})* 
+MidNumericEx   = ({MidNum} | {MidNumLet})      ({Format} | {Extend})*
+ExtendNumLetEx = {ExtendNumLet}                ({Format} | {Extend})*
 
 
 // URL and E-mail syntax specifications:
@@ -348,12 +355,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 //
 //    http://www.unicode.org/reports/tr14/#SA
 //
-\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
+{ComplexContext}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
 
 // UAX#29 WB14.  Any ÷ Any
 //
-\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
-\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
+{Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
+{Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
 
 
 // UAX#29 WB3.   CR × LF

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1055877&r1=1055876&r2=1055877&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Thu Jan  6 13:51:10 2011
@@ -201,4 +201,10 @@ public class TestStandardAnalyzer extend
     WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
     wordBreakTest.test(a);
   }
+  
+  public void testSupplementary() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛", 
+        new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+  }
 }

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1055877&r1=1055876&r2=1055877&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Thu Jan  6 13:51:10 2011
@@ -400,4 +400,10 @@ public class TestUAX29URLEmailTokenizer 
     WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
     wordBreakTest.test(a);
   }
+  
+  public void testSupplementary() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛", 
+        new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+  }
 }

Modified: lucene/dev/trunk/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/build.xml?rev=1055877&r1=1055876&r2=1055877&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/build.xml (original)
+++ lucene/dev/trunk/modules/analysis/icu/build.xml Thu Jan  6 13:51:10 2011
@@ -107,6 +107,23 @@ are part of the ICU4C package. See http:
     </java>
   </target>
 			
+  <property name="uax29.supp.macros.output.file" 
+            location="../common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro"/>
+
+  <target name="gen-uax29-supp-macros" depends="compile-tools">
+    <java
+      classname="org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros"
+      dir="."
+      fork="true"
+      failonerror="true"
+      output="${uax29.supp.macros.output.file}">
+      <classpath>
+      	<path refid="additional.dependencies"/>
+      	<pathelement location="${build.dir}/classes/tools"/>
+      </classpath>
+    </java>
+  </target>
+			
   <target name="compile-tools">
     <compile
       srcdir="src/tools/java"

Added: lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java?rev=1055877&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java (added)
+++ lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java Thu Jan  6 13:51:10 2011
@@ -0,0 +1,76 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+
+/** creates a macro to augment jflex's unicode wordbreak support for > BMP */
+public class GenerateJFlexSupplementaryMacros {
+  private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
+  
+  public static void main(String args[]) throws Exception {
+    outputMacro("ALetterSupp",         "[:WordBreak=ALetter:]");
+    outputMacro("FormatSupp",          "[:WordBreak=Format:]");
+    outputMacro("ExtendSupp",          "[:WordBreak=Extend:]");
+    outputMacro("NumericSupp",         "[:WordBreak=Numeric:]");
+    outputMacro("KatakanaSupp",        "[:WordBreak=Katakana:]");
+    outputMacro("MidLetterSupp",       "[:WordBreak=MidLetter:]");
+    outputMacro("MidNumSupp",          "[:WordBreak=MidNum:]");
+    outputMacro("MidNumLetSupp",       "[:WordBreak=MidNumLet:]");
+    outputMacro("ExtendNumLetSupp",    "[:WordBreak=ExtendNumLet:]");
+    outputMacro("ExtendNumLetSupp",    "[:WordBreak=ExtendNumLet:]");
+    outputMacro("ComplexContextSupp",  "[:LineBreak=Complex_Context:]");
+    outputMacro("HanSupp",             "[:Script=Han:]");
+    outputMacro("HiraganaSupp",        "[:Script=Hiragana:]");
+  }
+  
+  // we have to carefully output the possibilities as compact utf-16
+  // range expressions, or jflex will OOM!
+  static void outputMacro(String name, String pattern) {
+    UnicodeSet set = new UnicodeSet(pattern);
+    set.removeAll(BMP);
+    System.out.println(name + " = (");
+    // if the set is empty, we have to do this or jflex will barf
+    if (set.isEmpty()) {
+      System.out.println("\t  []");
+    }
+    
+    HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<Character,UnicodeSet>();
+    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {    
+      char utf16[] = Character.toChars(it.codepoint);
+      UnicodeSet trails = utf16ByLead.get(utf16[0]);
+      if (trails == null) {
+        trails = new UnicodeSet();
+        utf16ByLead.put(utf16[0], trails);
+      }
+      trails.add(utf16[1]);
+    }
+    
+    boolean isFirst = true;
+    for (Character c : utf16ByLead.keySet()) {
+      UnicodeSet trail = utf16ByLead.get(c);
+      System.out.print( isFirst ? "\t  " : "\t| ");
+      isFirst = false;
+      System.out.println("([\\u" + Integer.toHexString(c) + "]" + trail.getRegexEquivalent() + ")");
+    }
+    System.out.println(")");
+  }
+}