You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/09 18:47:15 UTC
svn commit: r1068997 [3/3] - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/icu/src/data/uax29/
lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/
lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segme...
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1068997&r1=1068996&r2=1068997&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Wed Feb 9 17:47:13 2011
@@ -77,6 +77,8 @@ ComplexContext = ([\p{LB:Complex_Context
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+// Script=Hangul & Aletter
+HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
@@ -168,16 +170,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
%{
/** Alphanumeric sequences */
- public static final String WORD_TYPE = "<ALPHANUM>";
+ public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/** Numbers */
- public static final String NUMERIC_TYPE = "<NUM>";
+ public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final String URL_TYPE = "<URL>";
/** E-mail addresses */
- public static final String EMAIL_TYPE = "<EMAIL";
+ public static final String EMAIL_TYPE = "<EMAIL>";
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
@@ -187,12 +189,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
- public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
+ public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN];
- public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
+ public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
- public static final String HIRAGANA_TYPE = "<HIRAGANA>";
+ public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+ public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+
+ public static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
@@ -316,6 +322,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
{ExtendNumLetEx}*
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
+// subset of the below for typing purposes only!
+{HangulEx}+
+ { if (populateAttributes(HANGUL_TYPE)) return true; }
+
+{KatakanaEx}+
+ { if (populateAttributes(KATAKANA_TYPE)) return true; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?rev=1068997&r1=1068996&r2=1068997&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Wed Feb 9 17:47:13 2011
@@ -203,4 +203,16 @@ public class TestStandardAnalyzer extend
new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮", "名", "遣", "い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
}
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java?rev=1068997&r1=1068996&r2=1068997&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java Wed Feb 9 17:47:13 2011
@@ -400,4 +400,16 @@ public class TestUAX29URLEmailTokenizer
new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮", "名", "遣", "い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
}