Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/09 18:07:47 UTC

svn commit: r1068979 [3/3] - in /lucene/dev/trunk: lucene/ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/ modules/analysis/common/src/test/org/apache/lucene/a...

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Wed Feb  9 17:07:46 2011
@@ -77,6 +77,8 @@ ComplexContext = ([\p{LB:Complex_Context
 Han = ([\p{Script:Han}] | {HanSupp})
 Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
 
+// Script=Hangul & ALetter
+HangulEx       = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
 ALetterEx      = {ALetter}                     ({Format} | {Extend})*
@@ -168,16 +170,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 
 %{
   /** Alphanumeric sequences */
-  public static final String WORD_TYPE = "<ALPHANUM>";
+  public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   
   /** Numbers */
-  public static final String NUMERIC_TYPE = "<NUM>";
+  public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
   
   /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
   public static final String URL_TYPE = "<URL>";
   
   /** E-mail addresses */
-  public static final String EMAIL_TYPE = "<EMAIL";
+  public static final String EMAIL_TYPE = "<EMAIL>";
   
   /**
    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
@@ -187,12 +189,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
    * <p>
    * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
    */
-  public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
+  public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN];
   
-  public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
+  public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
   
-  public static final String HIRAGANA_TYPE = "<HIRAGANA>";
+  public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
   
+  public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+
+  public static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt 
@@ -316,6 +322,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 {ExtendNumLetEx}* 
   { if (populateAttributes(NUMERIC_TYPE)) return true; }
 
+// Subset of the rules below; matched here only to assign distinct token types.
+{HangulEx}+
+  { if (populateAttributes(HANGUL_TYPE)) return true; }
+
+{KatakanaEx}+
+  { if (populateAttributes(KATAKANA_TYPE)) return true; }
 
 // UAX#29 WB5.   ALetter × ALetter
 //        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
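
For context, a minimal usage sketch (not part of this commit) of how the new token
types surface at runtime. It assumes the Reader constructor that JFlex generates for
UAX29URLEmailTokenizer and simply prints each term with its TypeAttribute; the
expected output follows the new tests added further down.

import java.io.StringReader;

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class HangulKatakanaTypeDemo {
  public static void main(String[] args) throws Exception {
    // "훈민정음" is a Hangul run, "カタカナ" a Katakana run; each now gets its own type.
    UAX29URLEmailTokenizer tokenizer =
        new UAX29URLEmailTokenizer(new StringReader("훈민정음 カタカナ"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Expected, per the new tests: 훈민정음/<HANGUL> then カタカナ/<KATAKANA>
      System.out.println(term.toString() + "/" + type.type());
    }
    tokenizer.end();
    tokenizer.close();
  }
}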

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Wed Feb  9 17:07:46 2011
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 2/9/11 11:45 AM */
 
 package org.apache.lucene.analysis.wikipedia;
 
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokena
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 9:07 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+ * on 2/9/11 11:45 AM from the specification file
+ * <tt>C:/Users/rmuir/workspace/lucene-2911/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
  */
 class WikipediaTokenizerImpl {
 
@@ -757,6 +757,12 @@ final int setText(StringBuilder buffer){
   
       zzState = ZZ_LEXSTATE[zzLexicalState];
 
+      // set up zzAction for empty match case:
+      int zzAttributes = zzAttrL[zzState];
+      if ( (zzAttributes & 1) == 1 ) {
+        zzAction = zzState;
+      }
+
 
       zzForAction: {
         while (true) {
@@ -789,7 +795,7 @@ final int setText(StringBuilder buffer){
           if (zzNext == -1) break zzForAction;
           zzState = zzNext;
 
-          int zzAttributes = zzAttrL[zzState];
+          zzAttributes = zzAttrL[zzState];
           if ( (zzAttributes & 1) == 1 ) {
             zzAction = zzState;
             zzMarkedPosL = zzCurrentPosL;
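
The regenerated scanner above also picks up a JFlex fix for zero-length matches:
zzAction is now seeded from the start state's attribute bits before the transition
loop, so an accepting start state is not lost when no transition fires. A toy,
self-contained sketch of that pattern (the table and names below are invented for
illustration, not the generated code):

public class EmptyMatchDemo {
  // Toy DFA: bit 1 in ATTRIBUTES marks an accepting state. State 0 is the start
  // state and is itself accepting, so an empty input must still report a match.
  private static final int[] ATTRIBUTES = { 1, 0, 1 };
  private static final int[][] TRANSITIONS = { { 1, -1 }, { 2, -1 }, { -1, -1 } };

  static int scan(int[] input) {
    int state = 0;
    int action = -1;
    // The fix: record the start state as the pending action if it accepts,
    // so a zero-length match is not silently dropped.
    if ((ATTRIBUTES[state] & 1) == 1) {
      action = state;
    }
    for (int symbol : input) {
      int next = TRANSITIONS[state][symbol];
      if (next == -1) {
        break;
      }
      state = next;
      if ((ATTRIBUTES[state] & 1) == 1) {
        action = state;
      }
    }
    return action; // -1 only if neither the start state nor any reached state accepts
  }

  public static void main(String[] args) {
    System.out.println(scan(new int[] {}));       // 0: empty match at the start state
    System.out.println(scan(new int[] { 0, 0 })); // 2: last accepting state reached
  }
}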

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Wed Feb  9 17:07:46 2011
@@ -207,4 +207,16 @@ public class TestStandardAnalyzer extend
         new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
   }
+  
+  public void testKorean() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+        new String[] { "훈민정음" },
+        new String[] { "<HANGUL>" });
+  }
+  
+  public void testJapanese() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+        new String[] { "仮", "名", "遣", "い", "カタカナ" },
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+  }
 }

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Wed Feb  9 17:07:46 2011
@@ -406,4 +406,16 @@ public class TestUAX29URLEmailTokenizer 
         new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
   }
+  
+  public void testKorean() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+        new String[] { "훈민정음" },
+        new String[] { "<HANGUL>" });
+  }
+  
+  public void testJapanese() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+        new String[] { "仮", "名", "遣", "い", "カタカナ" },
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+  }
 }

Added: lucene/dev/trunk/modules/analysis/icu/src/data/uax29/Default.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/data/uax29/Default.rbbi?rev=1068979&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/data/uax29/Default.rbbi (added)
+++ lucene/dev/trunk/modules/analysis/icu/src/data/uax29/Default.rbbi Wed Feb  9 17:07:46 2011
@@ -0,0 +1,127 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Default RBBI rules, based on UAX#29.
+#
+
+!!chain;
+
+#
+#  Character Class Definitions.
+#
+
+$CR           = [\p{Word_Break = CR}];
+$LF           = [\p{Word_Break = LF}];
+$Newline      = [\p{Word_Break = Newline}];
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$Katakana     = [\p{Word_Break = Katakana}];
+$ALetter      = [\p{Word_Break = ALetter}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidLetter    = [\p{Word_Break = MidLetter}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary   = [:LineBreak = Complex_Context:];
+$Control        = [\p{Grapheme_Cluster_Break = Control}]; 
+$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
+                                                             #  include the dictionary characters.
+
+#
+#  Rules 4    Ignore Format and Extend characters, 
+#             except when they appear at the beginning of a region of text.
+#
+$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
+$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+$Hiragana       = [\p{script=Hiragana}];
+$Ideographic    = [\p{Ideographic}];
+$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
+$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
+
+## -------------------------------------------------
+
+!!forward;
+
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+#          of a region of Text.   The rule here comes into play when the start of text
+#          begins with a group of Format chars, or with a "word" consisting of a single
+#          char that is not in any of the listed word break categories followed by
+#          format char(s).
+[^$CR $LF $Newline]? ($Extend |  $Format)+;
+
+$NumericEx {100};
+$ALetterEx {200};
+$KatakanaEx {300};       # note:  these status values override those from rule 5
+$HiraganaEx {300};       #        by virtue of being numerically larger.
+$IdeographicEx {400};    #
+
+#
+# rule 5
+#    Do not break between most letters.
+#
+$ALetterEx $ALetterEx {200};
+
+# rule 6 and 7
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+
+# rule 8
+
+$NumericEx $NumericEx {100};
+
+# rule 9
+
+$ALetterEx $NumericEx {200};
+
+# rule 10
+
+$NumericEx $ALetterEx {200};
+
+# rule 11 and 12 
+
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+
+# rule 13
+
+$KatakanaEx  $KatakanaEx {300};
+
+# rule 13a/b
+
+$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
+$NumericEx      $ExtendNumLetEx {100};    #  (13a)
+$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
+$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
+
+$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
+$ExtendNumLetEx $NumericEx  {100};    #  (13b)
+$ExtendNumLetEx $KatakanaEx {300};    #  (13b)
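
The rules above ship pre-compiled as the binary Default.brk added below. The build
step itself is not part of this diff, so the class and file names in the following
sketch are assumptions; the ICU4J calls (RuleBasedBreakIterator.compileRules and
getInstanceFromCompiledRules) are the documented route from .rbbi source to a
compiled .brk:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;

import com.ibm.icu.text.RuleBasedBreakIterator;

public class CompileDefaultRbbi {
  public static void main(String[] args) throws Exception {
    // Read Default.rbbi as UTF-8 text.
    StringBuilder rules = new StringBuilder();
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(new FileInputStream("Default.rbbi"), "UTF-8"));
    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
      rules.append(line).append('\n');
    }
    reader.close();

    // Compile the rule source into the binary form loaded at runtime.
    FileOutputStream out = new FileOutputStream("Default.brk");
    RuleBasedBreakIterator.compileRules(rules.toString(), out);
    out.close();

    // Sanity check: a Katakana run should come back with rule status 300, i.e.
    // RuleBasedBreakIterator.WORD_KANA, matching the {300} tags above.
    FileInputStream in = new FileInputStream("Default.brk");
    RuleBasedBreakIterator bi = RuleBasedBreakIterator.getInstanceFromCompiledRules(in);
    bi.setText("カタカナ");
    bi.first();
    System.out.println("boundary=" + bi.next() + " status=" + bi.getRuleStatus());
    in.close();
  }
}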

Modified: lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java Wed Feb  9 17:07:46 2011
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.icu.s
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
@@ -44,20 +46,24 @@ import com.ibm.icu.util.ULocale;
  */
 public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   /** Token type for words containing ideographic characters */
-  public static final String WORD_IDEO = "<IDEOGRAPHIC>";
-  /** Token type for words containing Japanese kana */
-  public static final String WORD_KANA = "<KANA>";
+  public static final String WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+  /** Token type for words containing Japanese hiragana */
+  public static final String WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+  /** Token type for words containing Japanese katakana */
+  public static final String WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+  /** Token type for words containing Korean hangul  */
+  public static final String WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
   /** Token type for words that contain letters */
-  public static final String WORD_LETTER = "<ALPHANUM>";
+  public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   /** Token type for words that appear to be numbers */
-  public static final String WORD_NUMBER = "<NUM>";
+  public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
   
   /*
    * the default breakiterators in use. these can be expensive to
    * instantiate, cheap to clone.
    */  
   private static final BreakIterator rootBreakIterator = 
-    BreakIterator.getWordInstance(ULocale.ROOT);
+    readBreakIterator("Default.brk");
   private static final BreakIterator thaiBreakIterator = 
     BreakIterator.getWordInstance(new ULocale("th_TH"));
   private static final BreakIterator hebrewBreakIterator = 
@@ -87,9 +93,9 @@ public class DefaultICUTokenizerConfig e
       case RuleBasedBreakIterator.WORD_IDEO:
         return WORD_IDEO;
       case RuleBasedBreakIterator.WORD_KANA:
-        return WORD_KANA;
+        return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA;
       case RuleBasedBreakIterator.WORD_LETTER:
-        return WORD_LETTER;
+        return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
       case RuleBasedBreakIterator.WORD_NUMBER:
         return WORD_NUMBER;
       default: /* some other custom code */
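
A note on the script checks above: the break rules assign status WORD_KANA (300) to
both Hiragana and Katakana runs and WORD_LETTER (200) to Hangul (whose characters are
Word_Break=ALetter), so the UScript code of the match is what selects <HIRAGANA>,
<KATAKANA>, or <HANGUL>. A minimal runtime sketch (not part of this commit, assuming
ICUTokenizer's plain Reader constructor with the default config), mirroring the new
tests below:

import java.io.StringReader;

import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class ICUTokenTypeDemo {
  public static void main(String[] args) throws Exception {
    // Hangul, Kanji+Hiragana, and Katakana in one input.
    ICUTokenizer tokenizer =
        new ICUTokenizer(new StringReader("훈민정음 仮名遣い カタカナ"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Expected, per the new tests: 훈민정음/<HANGUL>, then 仮 名 遣 as <IDEOGRAPHIC>,
      // い/<HIRAGANA>, カタカナ/<KATAKANA>
      System.out.println(term.toString() + "/" + type.type());
    }
    tokenizer.end();
    tokenizer.close();
  }
}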

Added: lucene/dev/trunk/modules/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk?rev=1068979&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java Wed Feb  9 17:07:46 2011
@@ -128,11 +128,10 @@ public class TestICUTokenizer extends Ba
   
   /*
    * For chinese, tokenize as char (these can later form bigrams or whatever)
-   * TODO: why do full-width numerics have no word-break prop?
    */
   public void testChinese() throws Exception {
     assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
-        new String[] { "我", "是", "中", "国", "人", "tests"});
+        new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
   }
   
   public void testEmpty() throws Exception {
@@ -221,4 +220,16 @@ public class TestICUTokenizer extends Ba
         new String[] {"david", "has", "5000", "bones"},
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
   }
+  
+  public void testKorean() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+        new String[] { "훈민정음" },
+        new String[] { "<HANGUL>" });
+  }
+  
+  public void testJapanese() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+        new String[] { "仮", "名", "遣", "い", "カタカナ" },
+        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+  }
 }