You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2011/01/06 15:54:49 UTC

svn commit: r1055904 [3/3] - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/icu/ lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/ lucene/src/java/org/apache/lucene/analysis/standard/ lucene/src/test/org/apache/lucene/anal...

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1055904&r1=1055903&r2=1055904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Thu Jan  6 14:54:48 2011
@@ -45,14 +45,6 @@ import org.apache.lucene.util.AttributeS
  *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
  * </ul>
- * <b>WARNING</b>: Because JFlex does not support Unicode supplementary 
- * characters (characters above the Basic Multilingual Plane, which contains
- * those up to and including U+FFFF), this scanner will not recognize them
- * properly.  If you need to be able to process text containing supplementary 
- * characters, consider using the ICU4J-backed implementation in modules/analysis/icu  
- * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
- * instead of this class, since the ICU4J-backed implementation does not have
- * this limitation.
  */
 %%
 
@@ -70,15 +62,30 @@ import org.apache.lucene.util.AttributeS
   super(in);
 %init}
 
+
+%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
+Format =  ([\p{WB:Format}] | {FormatSupp})
+Numeric = ([\p{WB:Numeric}] | {NumericSupp})
+Extend =  ([\p{WB:Extend}] | {ExtendSupp})
+Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
+MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
+MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
+MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
+ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
+ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
+Han = ([\p{Script:Han}] | {HanSupp})
+Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
-ALetterEx      = \p{WB:ALetter}                     [\p{WB:Format}\p{WB:Extend}]*
+ALetterEx      = {ALetter}                     ({Format} | {Extend})*
 // TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
-NumericEx      = [\p{WB:Numeric}\uFF10-\uFF19]      [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx     = \p{WB:Katakana}                    [\p{WB:Format}\p{WB:Extend}]* 
-MidLetterEx    = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]* 
-MidNumericEx   = [\p{WB:MidNum}\p{WB:MidNumLet}]    [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx = \p{WB:ExtendNumLet}                [\p{WB:Format}\p{WB:Extend}]*
+NumericEx      = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
+KatakanaEx     = {Katakana}                    ({Format} | {Extend})* 
+MidLetterEx    = ({MidLetter} | {MidNumLet})   ({Format} | {Extend})* 
+MidNumericEx   = ({MidNum} | {MidNumLet})      ({Format} | {Extend})*
+ExtendNumLetEx = {ExtendNumLet}                ({Format} | {Extend})*
 
 
 // URL and E-mail syntax specifications:
@@ -348,12 +355,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 //
 //    http://www.unicode.org/reports/tr14/#SA
 //
-\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
+{ComplexContext}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
 
 // UAX#29 WB14.  Any ÷ Any
 //
-\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
-\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
+{Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
+{Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
 
 
 // UAX#29 WB3.   CR × LF

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?rev=1055904&r1=1055903&r2=1055904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Thu Jan  6 14:54:48 2011
@@ -197,4 +197,10 @@ public class TestStandardAnalyzer extend
     WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
     wordBreakTest.test(a);
   }
+  
+  public void testSupplementary() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛", 
+        new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+  }
 }

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java?rev=1055904&r1=1055903&r2=1055904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java Thu Jan  6 14:54:48 2011
@@ -394,4 +394,10 @@ public class TestUAX29URLEmailTokenizer 
     WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
     wordBreakTest.test(a);
   }
+  
+  public void testSupplementary() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛", 
+        new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+  }
 }