You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/09 18:07:47 UTC
svn commit: r1068979 [3/3] - in /lucene/dev/trunk: lucene/
modules/analysis/common/src/java/org/apache/lucene/analysis/standard/
modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/
modules/analysis/common/src/test/org/apache/lucene/a...
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Wed Feb 9 17:07:46 2011
@@ -77,6 +77,8 @@ ComplexContext = ([\p{LB:Complex_Context
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+// Script=Hangul & Aletter
+HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
@@ -168,16 +170,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
%{
/** Alphanumeric sequences */
- public static final String WORD_TYPE = "<ALPHANUM>";
+ public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/** Numbers */
- public static final String NUMERIC_TYPE = "<NUM>";
+ public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final String URL_TYPE = "<URL>";
/** E-mail addresses */
- public static final String EMAIL_TYPE = "<EMAIL";
+ public static final String EMAIL_TYPE = "<EMAIL>";
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
@@ -187,12 +189,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
- public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
+ public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN];
- public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
+ public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
- public static final String HIRAGANA_TYPE = "<HIRAGANA>";
+ public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+ public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+
+ public static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
@@ -316,6 +322,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
{ExtendNumLetEx}*
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
+// subset of the below for typing purposes only!
+{HangulEx}+
+ { if (populateAttributes(HANGUL_TYPE)) return true; }
+
+{KatakanaEx}+
+ { if (populateAttributes(KATAKANA_TYPE)) return true; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Wed Feb 9 17:07:46 2011
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 2/9/11 11:45 AM */
package org.apache.lucene.analysis.wikipedia;
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokena
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 9:07 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+ * on 2/9/11 11:45 AM from the specification file
+ * <tt>C:/Users/rmuir/workspace/lucene-2911/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@@ -757,6 +757,12 @@ final int setText(StringBuilder buffer){
zzState = ZZ_LEXSTATE[zzLexicalState];
+ // set up zzAction for empty match case:
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ }
+
zzForAction: {
while (true) {
@@ -789,7 +795,7 @@ final int setText(StringBuilder buffer){
if (zzNext == -1) break zzForAction;
zzState = zzNext;
- int zzAttributes = zzAttrL[zzState];
+ zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Wed Feb 9 17:07:46 2011
@@ -207,4 +207,16 @@ public class TestStandardAnalyzer extend
new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮", "名", "遣", "い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Wed Feb 9 17:07:46 2011
@@ -406,4 +406,16 @@ public class TestUAX29URLEmailTokenizer
new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮", "名", "遣", "い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
}
Added: lucene/dev/trunk/modules/analysis/icu/src/data/uax29/Default.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/data/uax29/Default.rbbi?rev=1068979&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/data/uax29/Default.rbbi (added)
+++ lucene/dev/trunk/modules/analysis/icu/src/data/uax29/Default.rbbi Wed Feb 9 17:07:46 2011
@@ -0,0 +1,127 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Default RBBI rules, based on UAX#29.
+#
+
+!!chain;
+
+#
+# Character Class Definitions.
+#
+
+$CR = [\p{Word_Break = CR}];
+$LF = [\p{Word_Break = LF}];
+$Newline = [\p{Word_Break = Newline}];
+$Extend = [\p{Word_Break = Extend}];
+$Format = [\p{Word_Break = Format}];
+$Katakana = [\p{Word_Break = Katakana}];
+$ALetter = [\p{Word_Break = ALetter}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+$MidLetter = [\p{Word_Break = MidLetter}];
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+
+# Dictionary character set, for triggering language-based break engines. Currently
+# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+# 5.0 or later as the definition of Complex_Context was corrected to include all
+# characters requiring dictionary break.
+
+$dictionary = [:LineBreak = Complex_Context:];
+$Control = [\p{Grapheme_Cluster_Break = Control}];
+$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
+ # include the dictionary characters.
+
+#
+# Rules 4 Ignore Format and Extend characters,
+# except when they appear at the beginning of a region of text.
+#
+$KatakanaEx = $Katakana ($Extend | $Format)*;
+$ALetterEx = $ALetterPlus ($Extend | $Format)*;
+$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
+$MidLetterEx = $MidLetter ($Extend | $Format)*;
+$MidNumEx = $MidNum ($Extend | $Format)*;
+$NumericEx = $Numeric ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+
+$Hiragana = [\p{script=Hiragana}];
+$Ideographic = [\p{Ideographic}];
+$HiraganaEx = $Hiragana ($Extend | $Format)*;
+$IdeographicEx = $Ideographic ($Extend | $Format)*;
+
+## -------------------------------------------------
+
+!!forward;
+
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+# of a region of Text. The rule here comes into play when the start of text
+# begins with a group of Format chars, or with a "word" consisting of a single
+# char that is not in any of the listed word break categories followed by
+# format char(s).
+[^$CR $LF $Newline]? ($Extend | $Format)+;
+
+$NumericEx {100};
+$ALetterEx {200};
+$KatakanaEx {300}; # note: these status values override those from rule 5
+$HiraganaEx {300}; # by virtue of being numerically larger.
+$IdeographicEx {400}; #
+
+#
+# rule 5
+# Do not break between most letters.
+#
+$ALetterEx $ALetterEx {200};
+
+# rule 6 and 7
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+
+# rule 8
+
+$NumericEx $NumericEx {100};
+
+# rule 9
+
+$ALetterEx $NumericEx {200};
+
+# rule 10
+
+$NumericEx $ALetterEx {200};
+
+# rule 11 and 12
+
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+
+# rule 13
+
+$KatakanaEx $KatakanaEx {300};
+
+# rule 13a/b
+
+$ALetterEx $ExtendNumLetEx {200}; # (13a)
+$NumericEx $ExtendNumLetEx {100}; # (13a)
+$KatakanaEx $ExtendNumLetEx {300}; # (13a)
+$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
+
+$ExtendNumLetEx $ALetterEx {200}; # (13b)
+$ExtendNumLetEx $NumericEx {100}; # (13b)
+$ExtendNumLetEx $KatakanaEx {300}; # (13b)
Modified: lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java Wed Feb 9 17:07:46 2011
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.icu.s
import java.io.IOException;
import java.io.InputStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
@@ -44,20 +46,24 @@ import com.ibm.icu.util.ULocale;
*/
public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
/** Token type for words containing ideographic characters */
- public static final String WORD_IDEO = "<IDEOGRAPHIC>";
- /** Token type for words containing Japanese kana */
- public static final String WORD_KANA = "<KANA>";
+ public static final String WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+ /** Token type for words containing Japanese hiragana */
+ public static final String WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+ /** Token type for words containing Japanese katakana */
+ public static final String WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+ /** Token type for words containing Korean hangul */
+ public static final String WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
/** Token type for words that contain letters */
- public static final String WORD_LETTER = "<ALPHANUM>";
+ public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/** Token type for words that appear to be numbers */
- public static final String WORD_NUMBER = "<NUM>";
+ public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/*
* the default breakiterators in use. these can be expensive to
* instantiate, cheap to clone.
*/
private static final BreakIterator rootBreakIterator =
- BreakIterator.getWordInstance(ULocale.ROOT);
+ readBreakIterator("Default.brk");
private static final BreakIterator thaiBreakIterator =
BreakIterator.getWordInstance(new ULocale("th_TH"));
private static final BreakIterator hebrewBreakIterator =
@@ -87,9 +93,9 @@ public class DefaultICUTokenizerConfig e
case RuleBasedBreakIterator.WORD_IDEO:
return WORD_IDEO;
case RuleBasedBreakIterator.WORD_KANA:
- return WORD_KANA;
+ return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA;
case RuleBasedBreakIterator.WORD_LETTER:
- return WORD_LETTER;
+ return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
case RuleBasedBreakIterator.WORD_NUMBER:
return WORD_NUMBER;
default: /* some other custom code */
Added: lucene/dev/trunk/modules/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk?rev=1068979&view=auto
==============================================================================
Binary file - no diff available.
Modified: lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=1068979&r1=1068978&r2=1068979&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java Wed Feb 9 17:07:46 2011
@@ -128,11 +128,10 @@ public class TestICUTokenizer extends Ba
/*
* For chinese, tokenize as char (these can later form bigrams or whatever)
- * TODO: why do full-width numerics have no word-break prop?
*/
public void testChinese() throws Exception {
assertAnalyzesTo(a, "我是中国人。 １２３４ Ｔｅｓｔｓ ",
- new String[] { "我", "是", "中", "国", "人", "tests"});
+ new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
}
public void testEmpty() throws Exception {
@@ -221,4 +220,16 @@ public class TestICUTokenizer extends Ba
new String[] {"david", "has", "5000", "bones"},
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
}
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮", "名", "遣", "い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
}