You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2011/01/06 14:51:11 UTC
svn commit: r1055877 [3/3] - in /lucene/dev/trunk/modules/analysis: ./
common/ common/src/java/org/apache/lucene/analysis/standard/
common/src/test/org/apache/lucene/analysis/core/ icu/
icu/src/tools/java/org/apache/lucene/analysis/icu/
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1055877&r1=1055876&r2=1055877&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Thu Jan 6 13:51:10 2011
@@ -45,14 +45,6 @@ import org.apache.lucene.util.AttributeS
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
* <li><HIRAGANA>: A single hiragana character</li>
* </ul>
- * <b>WARNING</b>: Because JFlex does not support Unicode supplementary
- * characters (characters above the Basic Multilingual Plane, which contains
- * those up to and including U+FFFF), this scanner will not recognize them
- * properly. If you need to be able to process text containing supplementary
- * characters, consider using the ICU4J-backed implementation in modules/analysis/icu
- * (org.apache.lucene.analysis.icu.segmentation.ICUTokenizer)
- * instead of this class, since the ICU4J-backed implementation does not have
- * this limitation.
*/
%%
@@ -70,15 +62,30 @@ import org.apache.lucene.util.AttributeS
super(in);
%init}
+
+%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
+Format = ([\p{WB:Format}] | {FormatSupp})
+Numeric = ([\p{WB:Numeric}] | {NumericSupp})
+Extend = ([\p{WB:Extend}] | {ExtendSupp})
+Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
+MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
+MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
+MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
+ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
+ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
+Han = ([\p{Script:Han}] | {HanSupp})
+Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+
// UAX#29 WB4. X (Extend | Format)* --> X
//
-ALetterEx = \p{WB:ALetter} [\p{WB:Format}\p{WB:Extend}]*
+ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
-NumericEx = [\p{WB:Numeric}\uFF10-\uFF19] [\p{WB:Format}\p{WB:Extend}]*
-KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
-MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
-MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}] [\p{WB:Format}\p{WB:Extend}]*
-ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
+NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
+KatakanaEx = {Katakana} ({Format} | {Extend})*
+MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
+MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
+ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
@@ -348,12 +355,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
//
// http://www.unicode.org/reports/tr14/#SA
//
-\p{LB:Complex_Context}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
+{ComplexContext}+ { if (populateAttributes(SOUTH_EAST_ASIAN_TYPE)) return true; }
// UAX#29 WB14. Any ÷ Any
//
-\p{Script:Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
-\p{Script:Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
+{Han} { if (populateAttributes(IDEOGRAPHIC_TYPE)) return true; }
+{Hiragana} { if (populateAttributes(HIRAGANA_TYPE)) return true; }
// UAX#29 WB3. CR × LF
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1055877&r1=1055876&r2=1055877&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Thu Jan 6 13:51:10 2011
@@ -201,4 +201,10 @@ public class TestStandardAnalyzer extend
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);
}
+
+  /** Each ideograph, including a supplementary (non-BMP) one, is expected as its own IDEOGRAPHIC token. */
+  public void testSupplementary() throws Exception {
+    // The first character is U+29B05, which lies outside the BMP (a surrogate pair in UTF-16).
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛",
+        new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+  }
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1055877&r1=1055876&r2=1055877&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Thu Jan 6 13:51:10 2011
@@ -400,4 +400,10 @@ public class TestUAX29URLEmailTokenizer
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
wordBreakTest.test(a);
}
+
+  /** Each ideograph, including a supplementary (non-BMP) one, is expected as its own IDEOGRAPHIC token. */
+  public void testSupplementary() throws Exception {
+    // The first character is U+29B05, which lies outside the BMP (a surrogate pair in UTF-16).
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "𩬅艱鍟䇹愯瀛",
+        new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+  }
}
Modified: lucene/dev/trunk/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/build.xml?rev=1055877&r1=1055876&r2=1055877&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/build.xml (original)
+++ lucene/dev/trunk/modules/analysis/icu/build.xml Thu Jan 6 13:51:10 2011
@@ -107,6 +107,23 @@ are part of the ICU4C package. See http:
</java>
</target>
+ <!-- File that receives the output of the gen-uax29-supp-macros target below. -->
+ <property name="uax29.supp.macros.output.file"
+ location="../common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro"/>
+
+ <!-- Runs GenerateJFlexSupplementaryMacros in a forked JVM, redirecting its output
+ to ${uax29.supp.macros.output.file}; a failing run fails the build. -->
+ <target name="gen-uax29-supp-macros" depends="compile-tools">
+ <java
+ classname="org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros"
+ dir="."
+ fork="true"
+ failonerror="true"
+ output="${uax29.supp.macros.output.file}">
+ <classpath>
+ <path refid="additional.dependencies"/>
+ <pathelement location="${build.dir}/classes/tools"/>
+ </classpath>
+ </java>
+ </target>
+
<target name="compile-tools">
<compile
srcdir="src/tools/java"
Added: lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java?rev=1055877&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java (added)
+++ lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateJFlexSupplementaryMacros.java Thu Jan 6 13:51:10 2011
@@ -0,0 +1,76 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+
+/**
+ * Creates macros to augment JFlex's Unicode word-break support for characters
+ * above the Basic Multilingual Plane (BMP).
+ * <p>
+ * One macro per Unicode property is printed to standard output. Each macro
+ * matches the supplementary members of that property, written as UTF-16
+ * surrogate pairs: a lead surrogate followed by a set of trail surrogates.
+ */
+public class GenerateJFlexSupplementaryMacros {
+  /** All BMP code points; removed from every property set so only supplementary ones remain. */
+  private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
+
+  /**
+   * Prints every supplementary macro, each exactly once, to standard output.
+   *
+   * @param args ignored
+   */
+  public static void main(String[] args) throws Exception {
+    outputMacro("ALetterSupp", "[:WordBreak=ALetter:]");
+    outputMacro("FormatSupp", "[:WordBreak=Format:]");
+    outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
+    outputMacro("NumericSupp", "[:WordBreak=Numeric:]");
+    outputMacro("KatakanaSupp", "[:WordBreak=Katakana:]");
+    outputMacro("MidLetterSupp", "[:WordBreak=MidLetter:]");
+    outputMacro("MidNumSupp", "[:WordBreak=MidNum:]");
+    outputMacro("MidNumLetSupp", "[:WordBreak=MidNumLet:]");
+    outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
+    outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
+    outputMacro("HanSupp", "[:Script=Han:]");
+    outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
+  }
+
+  /**
+   * Prints one macro definition matching the supplementary members of a set.
+   * <p>
+   * The possibilities are carefully written as compact UTF-16 range
+   * expressions, grouped by lead surrogate, or jflex will OOM. Alternatives
+   * are printed in ascending lead-surrogate order so that regenerating the
+   * macro file gives the same text on every run.
+   *
+   * @param name    name of the macro to define
+   * @param pattern ICU UnicodeSet pattern selecting the code points of interest
+   */
+  static void outputMacro(String name, String pattern) {
+    UnicodeSet set = new UnicodeSet(pattern);
+    set.removeAll(BMP);
+    System.out.println(name + " = (");
+    // if the set is empty, an empty character class has to be printed or jflex will barf
+    if (set.isEmpty()) {
+      System.out.println("\t []");
+    }
+
+    // lead surrogate -> every trail surrogate that follows it within the set;
+    // a sorted map keeps the printed order independent of hash iteration order
+    Map<Character,UnicodeSet> utf16ByLead = new TreeMap<Character,UnicodeSet>();
+    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
+      // BMP code points were removed above, so the remaining ones encode as two chars
+      char[] utf16 = Character.toChars(it.codepoint);
+      UnicodeSet trails = utf16ByLead.get(utf16[0]);
+      if (trails == null) {
+        trails = new UnicodeSet();
+        utf16ByLead.put(utf16[0], trails);
+      }
+      trails.add(utf16[1]);
+    }
+
+    boolean isFirst = true;
+    for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
+      // every alternative after the first is introduced by '|'
+      System.out.print(isFirst ? "\t " : "\t| ");
+      isFirst = false;
+      System.out.println("([\\u" + Integer.toHexString(entry.getKey()) + "]"
+          + entry.getValue().getRegexEquivalent() + ")");
+    }
+    System.out.println(")");
+  }
+}