You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/09 18:47:15 UTC

svn commit: r1068997 [3/3] - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/icu/src/data/uax29/ lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segme...

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1068997&r1=1068996&r2=1068997&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Wed Feb  9 17:47:13 2011
@@ -77,6 +77,8 @@ ComplexContext = ([\p{LB:Complex_Context
 Han = ([\p{Script:Han}] | {HanSupp})
 Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
 
+// Script=Hangul & Aletter
+HangulEx       = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
 ALetterEx      = {ALetter}                     ({Format} | {Extend})*
@@ -168,16 +170,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 
 %{
   /** Alphanumeric sequences */
-  public static final String WORD_TYPE = "<ALPHANUM>";
+  public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   
   /** Numbers */
-  public static final String NUMERIC_TYPE = "<NUM>";
+  public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
   
   /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
   public static final String URL_TYPE = "<URL>";
   
   /** E-mail addresses */
-  public static final String EMAIL_TYPE = "<EMAIL";
+  public static final String EMAIL_TYPE = "<EMAIL>";
   
   /**
    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
@@ -187,12 +189,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
    * <p>
    * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
    */
-  public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
+  public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN];
   
-  public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
+  public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
   
-  public static final String HIRAGANA_TYPE = "<HIRAGANA>";
+  public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
   
+  public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+
+  public static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt 
@@ -316,6 +322,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 {ExtendNumLetEx}* 
   { if (populateAttributes(NUMERIC_TYPE)) return true; }
 
+// subset of the below for typing purposes only!
+{HangulEx}+
+  { if (populateAttributes(HANGUL_TYPE)) return true; }
+
+{KatakanaEx}+
+  { if (populateAttributes(KATAKANA_TYPE)) return true; }
 
 // UAX#29 WB5.   ALetter × ALetter
 //        WB6.   ALetter × (MidLetter | MidNumLet) ALetter

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?rev=1068997&r1=1068996&r2=1068997&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Wed Feb  9 17:47:13 2011
@@ -203,4 +203,16 @@ public class TestStandardAnalyzer extend
         new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
   }
+  
+  public void testKorean() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+        new String[] { "훈민정음" },
+        new String[] { "<HANGUL>" });
+  }
+  
+  public void testJapanese() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+        new String[] { "仮", "名", "遣", "い", "カタカナ" },
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+  }
 }

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java?rev=1068997&r1=1068996&r2=1068997&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestUAX29URLEmailTokenizer.java Wed Feb  9 17:47:13 2011
@@ -400,4 +400,16 @@ public class TestUAX29URLEmailTokenizer 
         new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
   }
+  
+  public void testKorean() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+        new String[] { "훈민정음" },
+        new String[] { "<HANGUL>" });
+  }
+  
+  public void testJapanese() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+        new String[] { "仮", "名", "遣", "い", "カタカナ" },
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+  }
 }