You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2011/01/06 15:54:49 UTC
svn commit: r1055904 [3/3] - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/icu/
lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/
lucene/src/java/org/apache/lucene/analysis/standard/
lucene/src/test/org/apache/lucene/anal...
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1055904&r1=1055903&r2=1055904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Thu Jan 6 14:54:48 2011
@@ -45,14 +45,6 @@ import org.apache.lucene.util.AttributeS
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
* <li><HIRAGANA>: A single hiragana character</li>
* </ul>
- * <b>WARNING</b>: Because JFlex does not support Unicode supplementary
- * characters (characters above the Basic Multilingual Plane, which contains
- * those up to and including U+FFFF), this scanner will not recognize them
- * properly. If you need to be able to process text containing supplementary
- * characters, consider using the ICU4J-backed implementation in modules/analysis/icu
- * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
- * instead of this class, since the ICU4J-backed implementation does not have
- * this limitation.
*/
%%
@@ -70,15 +62,30 @@ import org.apache.lucene.util.AttributeS
super(in);
%init}
+
+%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
+Format = ([\p{WB:Format}] | {FormatSupp})
+Numeric = ([\p{WB:Numeric}] | {NumericSupp})
+Extend = ([\p{WB:Extend}] | {ExtendSupp})
+Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
+MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
+MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
+MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
+ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
+ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
+Han = ([\p{Script:Han}] | {HanSupp})
+Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+
// UAX#29 WB4. X (Extend | Format)* --> X
//
-ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
+ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
-NumericEx = [\p{WB:Numeric}\uFF10-\uFF19] [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
-MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
-MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
+NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
+KatakanaEx = {Katakana} ({Format} | {Extend})*
+MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
+MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
+ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
@@ -348,12 +355,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
//
// http://www.unicode.org/reports/tr14/#SA
//
-\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
+{ComplexContext}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
// UAX#29 WB14. Any ÷ Any
//
-\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
-\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
+{Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
+{Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
// UAX#29 WB3. CR × LF
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?rev=1055904&r1=1055903&r2=1055904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Thu Jan 6 14:54:48 2011
@@ -197,4 +197,10 @@ public class TestStandardAnalyzer extend
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);
}
+
+ public void testSupplementary() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛",
+ new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+ }
}
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java?rev=1055904&r1=1055903&r2=1055904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java Thu Jan 6 14:54:48 2011
@@ -394,4 +394,10 @@ public class TestUAX29URLEmailTokenizer
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);
}
+
+ public void testSupplementary() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛",
+ new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+ }
}