You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/12/03 19:05:24 UTC
svn commit: r1547502 - in /lucene/dev/trunk: lucene/
lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/
lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/
lucene/analysis/icu/src/data/uax29/ lucene/analysis/icu/src...
Author: rmuir
Date: Tue Dec 3 18:05:23 2013
New Revision: 1547502
URL: http://svn.apache.org/r1547502
Log:
LUCENE-4381: upgrade ICU to icu4j 52.1
Added:
lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java (with props)
lucene/dev/trunk/lucene/licenses/icu4j-52.1.jar.sha1 (with props)
lucene/dev/trunk/solr/licenses/icu4j-52.1.jar.sha1 (with props)
Removed:
lucene/dev/trunk/lucene/analysis/icu/src/data/uax29/Hebrew.rbbi
lucene/dev/trunk/lucene/analysis/icu/src/data/uax29/Lao.rbbi
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java
lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk
lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk
lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java
lucene/dev/trunk/lucene/licenses/icu4j-49.1.jar.sha1
lucene/dev/trunk/solr/licenses/icu4j-49.1.jar.sha1
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
lucene/dev/trunk/lucene/analysis/icu/src/data/uax29/Default.rbbi
lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt
lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfc.txt
lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc.txt
lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
lucene/dev/trunk/lucene/analysis/icu/src/java/overview.html
lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk
lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
lucene/dev/trunk/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
lucene/dev/trunk/lucene/ivy-versions.properties
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Dec 3 18:05:23 2013
@@ -89,6 +89,8 @@ Build
* LUCENE-5347: Upgrade forbidden-apis checker to version 1.4.
(Uwe Schindler)
+* LUCENE-4381: Upgrade analysis/icu to 52.1. (Robert Muir)
+
Bug fixes
* LUCENE-5285: Improved highlighting of multi-valued fields with
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro Tue Dec 3 18:05:23 2013
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-// Generated using ICU4J 49.1.0.0
+// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro Tue Dec 3 18:05:23 2013
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-// Generated using ICU4J 49.1.0.0
+// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
Modified: lucene/dev/trunk/lucene/analysis/icu/src/data/uax29/Default.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/data/uax29/Default.rbbi?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/data/uax29/Default.rbbi (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/data/uax29/Default.rbbi Tue Dec 3 18:05:23 2013
@@ -14,27 +14,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-# Default RBBI rules, based on UAX#29.
+# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
#
+# Copyright (C) 2002-2013, International Business Machines Corporation
+# and others. All Rights Reserved.
+#
+# file: word.txt
+#
+# ICU Word Break Rules
+# See Unicode Standard Annex #29.
+# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
+#
+# Note: Updates to word.txt will usually need to be merged into
+# word_POSIX.txt also.
+
+##############################################################################
+#
+# Character class definitions from TR 29
+#
+##############################################################################
!!chain;
+
#
# Character Class Definitions.
#
-$CR = [\p{Word_Break = CR}];
-$LF = [\p{Word_Break = LF}];
-$Newline = [\p{Word_Break = Newline}];
-$Extend = [\p{Word_Break = Extend}];
-$Format = [\p{Word_Break = Format}];
-$Katakana = [\p{Word_Break = Katakana}];
-$ALetter = [\p{Word_Break = ALetter}];
-$MidNumLet = [\p{Word_Break = MidNumLet}];
-$MidLetter = [\p{Word_Break = MidLetter}];
-$MidNum = [\p{Word_Break = MidNum}];
-$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
-$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$CR = [\p{Word_Break = CR}];
+$LF = [\p{Word_Break = LF}];
+$Newline = [\p{Word_Break = Newline}];
+$Extend = [\p{Word_Break = Extend}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format = [\p{Word_Break = Format}];
+$Katakana = [\p{Word_Break = Katakana}];
+$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
+$ALetter = [\p{Word_Break = ALetter}];
+$Single_Quote = [\p{Word_Break = Single_Quote}];
+$Double_Quote = [\p{Word_Break = Double_Quote}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+$MidLetter = [\p{Word_Break = MidLetter}];
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+$Han = [:Han:];
+$Hiragana = [:Hiragana:];
# Dictionary character set, for triggering language-based break engines. Currently
@@ -42,24 +67,34 @@ $ExtendNumLet = [\p{Word_Break = ExtendN
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
-$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
-$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
- # include the dictionary characters.
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji = [$Han $Hiragana $Katakana];
+$dictionaryCJK = [$Han $Hiragana $HangulSyllable];
+$dictionary = [$ComplexContext];
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+
#
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
-$KatakanaEx = $Katakana ($Extend | $Format)*;
-$ALetterEx = $ALetterPlus ($Extend | $Format)*;
-$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
-$MidLetterEx = $MidLetter ($Extend | $Format)*;
-$MidNumEx = $MidNum ($Extend | $Format)*;
-$NumericEx = $Numeric ($Extend | $Format)*;
-$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+$KatakanaEx = $Katakana ($Extend | $Format)*;
+$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
+$ALetterEx = $ALetterPlus ($Extend | $Format)*;
+$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
+$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
+$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
+$MidLetterEx = $MidLetter ($Extend | $Format)*;
+$MidNumEx = $MidNum ($Extend | $Format)*;
+$NumericEx = $Numeric ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
-$Hiragana = [\p{script=Hiragana}];
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
@@ -77,23 +112,31 @@ $CR $LF;
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
-# format char(s).
+# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
+$HangulSyllable {200};
+$Hebrew_LetterEx{200};
$KatakanaEx {300}; # note: these status values override those from rule 5
-$HiraganaEx {300}; # by virtual of being numerically larger.
+$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
#
# rule 5
# Do not break between most letters.
#
-$ALetterEx $ALetterEx {200};
+($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 6 and 7
-$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
+
+# rule 7a
+$Hebrew_LetterEx $Single_QuoteEx {200};
+
+# rule 7b and 7c
+$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
# rule 8
@@ -101,27 +144,35 @@ $NumericEx $NumericEx {100};
# rule 9
-$ALetterEx $NumericEx {200};
+($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
# rule 10
-$NumericEx $ALetterEx {200};
+$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 11 and 12
-$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
# rule 13
-
$KatakanaEx $KatakanaEx {300};
# rule 13a/b
-$ALetterEx $ExtendNumLetEx {200}; # (13a)
-$NumericEx $ExtendNumLetEx {100}; # (13a)
-$KatakanaEx $ExtendNumLetEx {300}; # (13a)
-$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
-
-$ExtendNumLetEx $ALetterEx {200}; # (13b)
-$ExtendNumLetEx $NumericEx {100}; # (13b)
-$ExtendNumLetEx $KatakanaEx {300}; # (13b)
+$ALetterEx $ExtendNumLetEx {200}; # (13a)
+$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
+$NumericEx $ExtendNumLetEx {100}; # (13a)
+$KatakanaEx $ExtendNumLetEx {300}; # (13a)
+$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
+
+$ExtendNumLetEx $ALetterEx {200}; # (13b)
+$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
+$ExtendNumLetEx $NumericEx {100}; # (13b)
+$ExtendNumLetEx $KatakanaEx {300}; # (13b)
+
+# rule 13c
+
+$Regional_IndicatorEx $Regional_IndicatorEx;
+
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
Modified: lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt Tue Dec 3 18:05:23 2013
@@ -78,7 +78,6 @@ FF0D>002D
## Space Folding
# Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
1680>0020
-180E>0020
## Spacing Accents folding (done by kd)
Modified: lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfc.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfc.txt?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfc.txt (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfc.txt Tue Dec 3 18:05:23 2013
@@ -1,4 +1,4 @@
-# Copyright (C) 1999-2012, International Business Machines
+# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfc.txt
@@ -7,7 +7,7 @@
#
# Complete data for Unicode NFC normalization.
-* Unicode 6.1.0
+* Unicode 6.3.0
# Canonical_Combining_Class (ccc) values
0300..0314:230
Modified: lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc.txt?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc.txt (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc.txt Tue Dec 3 18:05:23 2013
@@ -1,4 +1,4 @@
-# Copyright (C) 1999-2012, International Business Machines
+# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
@@ -11,7 +11,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
-* Unicode 6.1.0
+* Unicode 6.3.0
00A0>0020
00A8>0020 0308
Modified: lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt Tue Dec 3 18:05:23 2013
@@ -1,5 +1,5 @@
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2013 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
-* Unicode 6.1.0
+* Unicode 6.3.0
0041>0061
0042>0062
@@ -537,6 +537,7 @@
0555>0585
0556>0586
0587>0565 0582
+061C>
0675>0627 0674
0676>0648 0674
0677>06C7 0674
@@ -627,7 +628,7 @@
10FC>10DC
115F..1160>
17B4..17B5>
-180B..180D>
+180B..180E>
1D2C>0061
1D2D>00E6
1D2E>0062
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java Tue Dec 3 18:05:23 2013
@@ -21,7 +21,6 @@ import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
@@ -60,15 +59,12 @@ abstract class BreakIteratorWrapper {
}
/**
- * If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
- * treat it like a generic BreakIterator If its any other
- * RuleBasedBreakIterator, the rule status can be used for token type. If its
+ * If its a RuleBasedBreakIterator, the rule status can be used for token type. If its
* any other BreakIterator, the rulestatus method is not available, so treat
* it like a generic BreakIterator.
*/
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
- if (breakIterator instanceof RuleBasedBreakIterator
- && !(breakIterator instanceof DictionaryBasedBreakIterator))
+ if (breakIterator instanceof RuleBasedBreakIterator)
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
else
return new BIWrapper(breakIterator);
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java Tue Dec 3 18:05:23 2013
@@ -41,12 +41,13 @@ final class CompositeBreakIterator {
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
private BreakIteratorWrapper rbbi;
- private final ScriptIterator scriptIterator = new ScriptIterator();
+ private final ScriptIterator scriptIterator;
private char text[];
CompositeBreakIterator(ICUTokenizerConfig config) {
this.config = config;
+ this.scriptIterator = new ScriptIterator(config.combineCJ());
}
/**
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java Tue Dec 3 18:05:23 2013
@@ -35,12 +35,9 @@ import com.ibm.icu.util.ULocale;
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
* but with the following tailorings:
* <ul>
- * <li>Thai text is broken into words with a
- * {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
- * <li>Lao, Myanmar, and Khmer text is broken into syllables
+ * <li>Thai, Lao, and CJK text is broken into words with a dictionary.
+ * <li>Myanmar, and Khmer text is broken into syllables
* based on custom BreakIterator rules.
- * <li>Hebrew text has custom tailorings to handle special cases
- * involving punctuation.
* </ul>
* @lucene.experimental
*/
@@ -62,34 +59,44 @@ public class DefaultICUTokenizerConfig e
* the default breakiterators in use. these can be expensive to
* instantiate, cheap to clone.
*/
- private static final BreakIterator rootBreakIterator =
+ // we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
+ // is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
+ private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
+ // the same as ROOT, except no dictionary segmentation for cjk
+ private static final BreakIterator defaultBreakIterator =
readBreakIterator("Default.brk");
- private static final BreakIterator thaiBreakIterator =
- BreakIterator.getWordInstance(new ULocale("th_TH"));
- private static final BreakIterator hebrewBreakIterator =
- readBreakIterator("Hebrew.brk");
private static final BreakIterator khmerBreakIterator =
readBreakIterator("Khmer.brk");
- private static final BreakIterator laoBreakIterator =
- new LaoBreakIterator(readBreakIterator("Lao.brk"));
private static final BreakIterator myanmarBreakIterator =
readBreakIterator("Myanmar.brk");
+ // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
+ private final boolean cjkAsWords;
+
/**
* Creates a new config. This object is lightweight, but the first
* time the class is referenced, breakiterators will be initialized.
+ * @param cjkAsWords true if cjk text should undergo dictionary-based segmentation,
+ * otherwise text will be segmented according to UAX#29 defaults.
+ * If this is true, all Han+Hiragana+Katakana words will be tagged as
+ * IDEOGRAPHIC.
*/
- public DefaultICUTokenizerConfig() {}
+ public DefaultICUTokenizerConfig(boolean cjkAsWords) {
+ this.cjkAsWords = cjkAsWords;
+ }
+
+ @Override
+ public boolean combineCJ() {
+ return cjkAsWords;
+ }
@Override
public BreakIterator getBreakIterator(int script) {
switch(script) {
- case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
- case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
- case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
- default: return (BreakIterator)rootBreakIterator.clone();
+ case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
+ default: return (BreakIterator)defaultBreakIterator.clone();
}
}
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java Tue Dec 3 18:05:23 2013
@@ -68,7 +68,7 @@ public final class ICUTokenizer extends
* @see DefaultICUTokenizerConfig
*/
public ICUTokenizer(Reader input) {
- this(input, new DefaultICUTokenizerConfig());
+ this(input, new DefaultICUTokenizerConfig(true));
}
/**
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java Tue Dec 3 18:05:23 2013
@@ -36,4 +36,6 @@ public abstract class ICUTokenizerConfig
/** Return a token type value for a given script and BreakIterator
* rule status. */
public abstract String getType(int script, int ruleStatus);
+ /** true if Han, Hiragana, and Katakana scripts should all be returned as Japanese */
+ public abstract boolean combineCJ();
}
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java Tue Dec 3 18:05:23 2013
@@ -70,7 +70,7 @@ import com.ibm.icu.text.RuleBasedBreakIt
* <pre class="prettyprint" >
* <fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
- * <tokenizer class="solr.ICUTokenizerFactory"
+ * <tokenizer class="solr.ICUTokenizerFactory" cjkAsWords="true"
* rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/>
* </analyzer>
* </fieldType></pre>
@@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends
static final String RULEFILES = "rulefiles";
private final Map<Integer,String> tailored;
private ICUTokenizerConfig config;
+ private final boolean cjkAsWords;
/** Creates a new ICUTokenizerFactory */
public ICUTokenizerFactory(Map<String,String> args) {
@@ -94,6 +95,7 @@ public class ICUTokenizerFactory extends
tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
}
}
+ cjkAsWords = getBoolean(args, "cjkAsWords", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -103,7 +105,7 @@ public class ICUTokenizerFactory extends
public void inform(ResourceLoader loader) throws IOException {
assert tailored != null : "init must be called first!";
if (tailored.isEmpty()) {
- config = new DefaultICUTokenizerConfig();
+ config = new DefaultICUTokenizerConfig(cjkAsWords);
} else {
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
@@ -111,7 +113,7 @@ public class ICUTokenizerFactory extends
String resourcePath = entry.getValue();
breakers[code] = parseRules(resourcePath, loader);
}
- config = new DefaultICUTokenizerConfig() {
+ config = new DefaultICUTokenizerConfig(cjkAsWords) {
@Override
public BreakIterator getBreakIterator(int script) {
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java Tue Dec 3 18:05:23 2013
@@ -59,6 +59,15 @@ final class ScriptIterator {
private int scriptStart;
private int scriptLimit;
private int scriptCode;
+
+ private final boolean combineCJ;
+
+ /**
+ * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
+ */
+ ScriptIterator(boolean combineCJ) {
+ this.combineCJ = combineCJ;
+ }
/**
* Get the start of this script run
@@ -162,10 +171,24 @@ final class ScriptIterator {
}
/** fast version of UScript.getScript(). Basic Latin is an array lookup */
- private static int getScript(int codepoint) {
- if (0 <= codepoint && codepoint < basicLatin.length)
+ private int getScript(int codepoint) {
+ if (0 <= codepoint && codepoint < basicLatin.length) {
return basicLatin[codepoint];
- else
- return UScript.getScript(codepoint);
+ } else {
+ int script = UScript.getScript(codepoint);
+ if (combineCJ) {
+ if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
+ return UScript.JAPANESE;
+ } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
+ // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
+ // they are treated as punctuation. we currently have no cleaner way to fix this!
+ return UScript.LATIN;
+ } else {
+ return script;
+ }
+ } else {
+ return script;
+ }
+ }
}
}
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java Tue Dec 3 18:05:23 2013
@@ -84,6 +84,10 @@ public class ScriptAttributeImpl extends
@Override
public void reflectWith(AttributeReflector reflector) {
- reflector.reflect(ScriptAttribute.class, "script", getName());
+ // when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to
+ // mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset),
+ // but this is just to help prevent confusion.
+ String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
+ reflector.reflect(ScriptAttribute.class, "script", name);
}
}
Modified: lucene/dev/trunk/lucene/analysis/icu/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/overview.html?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/overview.html (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/overview.html Tue Dec 3 18:05:23 2013
@@ -353,7 +353,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 6.1). However, some users who wish
+the most recent version of Unicode (currently 6.3). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
Modified: lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java Tue Dec 3 18:05:23 2013
@@ -42,7 +42,7 @@ public class TestICUTokenizer extends Ba
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
- ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+ ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@@ -52,7 +52,7 @@ public class TestICUTokenizer extends Ba
sb.append('a');
}
String input = sb.toString();
- ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+ ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
char token[] = new char[4096];
Arrays.fill(token, 'a');
String expectedToken = new String(token);
@@ -69,7 +69,7 @@ public class TestICUTokenizer extends Ba
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- Tokenizer tokenizer = new ICUTokenizer(reader);
+ Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
@@ -118,6 +118,7 @@ public class TestICUTokenizer extends Ba
public void testLao() throws Exception {
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
+ assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
}
public void testThai() throws Exception {
@@ -138,6 +139,13 @@ public class TestICUTokenizer extends Ba
new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
}
+ public void testHebrew() throws Exception {
+ assertAnalyzesTo(a, "דנקנר תקף את הדו\"ח",
+ new String[] { "דנקנר", "תקף", "את", "הדו\"ח" });
+ assertAnalyzesTo(a, "חברת בת של מודי'ס",
+ new String[] { "חברת", "בת", "של", "מודי'ס" });
+ }
+
public void testEmpty() throws Exception {
assertAnalyzesTo(a, "", new String[] {});
assertAnalyzesTo(a, ".", new String[] {});
Added: lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java?rev=1547502&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java (added)
+++ lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java Tue Dec 3 18:05:23 2013
@@ -0,0 +1,91 @@
+package org.apache.lucene.analysis.icu.segmentation;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.Random;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+/**
+ * test ICUTokenizer with dictionary-based CJ segmentation
+ */
+public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new ICUTokenizer(reader));
+ }
+ };
+
+ /**
+ * test stolen from smartcn
+ */
+ public void testSimpleChinese() throws Exception {
+ assertAnalyzesTo(a, "我购买了道具和服装。",
+ new String[] { "我", "购买", "了", "道具", "和", "服装" }
+ );
+ }
+
+ public void testChineseNumerics() throws Exception {
+ assertAnalyzesTo(a, "９４８３", new String[] { "９４８３" });
+ assertAnalyzesTo(a, "院內分機９４８３。",
+ new String[] { "院", "內", "分機", "９４８３" });
+ assertAnalyzesTo(a, "院內分機9483。",
+ new String[] { "院", "內", "分機", "9483" });
+ }
+
+ /**
+ * test stolen from kuromoji
+ */
+ public void testSimpleJapanese() throws Exception {
+ assertAnalyzesTo(a, "それはまだ実験段階にあります",
+ new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
+ );
+ }
+
+ public void testJapaneseTypes() throws Exception {
+ assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮名遣い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
+ }
+
+ public void testKorean() throws Exception {
+ // Korean words
+ assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+ }
+
+ /** make sure that we still tag korean as HANGUL (for further decomposition/ngram/whatever) */
+ public void testKoreanTypes() throws Exception {
+ assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
+ }
+
+ /** blast some random large strings through the analyzer */
+ public void testRandomHugeStrings() throws Exception {
+ Random random = random();
+ checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
+ }
+}
Modified: lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java Tue Dec 3 18:05:23 2013
@@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter ext
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer source = new ICUTokenizer(reader);
+ Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
TokenStream result = new CJKBigramFilter(source);
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
}
@@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter ext
private Analyzer analyzer2 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer source = new ICUTokenizer(reader);
+ Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
// we put this before the CJKBigramFilter, because the normalization might combine
// some halfwidth katakana forms, which will affect the bigramming.
TokenStream result = new ICUNormalizer2Filter(source);
Modified: lucene/dev/trunk/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java Tue Dec 3 18:05:23 2013
@@ -62,7 +62,7 @@ import java.util.regex.Pattern;
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
- private static final String ICU_RELEASE_TAG = "release-49-1-2";
+ private static final String ICU_RELEASE_TAG = "release-52-1";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";
Modified: lucene/dev/trunk/lucene/ivy-versions.properties
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/ivy-versions.properties?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/ivy-versions.properties (original)
+++ lucene/dev/trunk/lucene/ivy-versions.properties Tue Dec 3 18:05:23 2013
@@ -46,7 +46,7 @@ com.google.inject.guice.version = 3.0
/com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
/com.googlecode.mp4parser/isoparser = 1.0-RC-1
-/com.ibm.icu/icu4j = 49.1
+/com.ibm.icu/icu4j = 52.1
/com.spatial4j/spatial4j = 0.3
com.sun.jersey.version = 1.8
Added: lucene/dev/trunk/lucene/licenses/icu4j-52.1.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/licenses/icu4j-52.1.jar.sha1?rev=1547502&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/licenses/icu4j-52.1.jar.sha1 (added)
+++ lucene/dev/trunk/lucene/licenses/icu4j-52.1.jar.sha1 Tue Dec 3 18:05:23 2013
@@ -0,0 +1 @@
+7dbc327670673acd14b487d120f05747d712c1c0
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1547502&r1=1547501&r2=1547502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Tue Dec 3 18:05:23 2013
@@ -635,7 +635,7 @@ public abstract class BaseTokenStreamTes
int charUpto = 0;
final StringBuilder sb = new StringBuilder();
while (charUpto < s.length()) {
- final int c = s.codePointAt(charUpto);
+ final int c = s.charAt(charUpto);
if (c == 0xa) {
// Strangely, you cannot put \ u000A into Java
// sources (not in a comment nor a string
@@ -655,7 +655,7 @@ public abstract class BaseTokenStreamTes
// don't escape...
sb.append(String.format(Locale.ROOT, "\\u%04x", c));
}
- charUpto += Character.charCount(c);
+ charUpto++;
}
return sb.toString();
}
Added: lucene/dev/trunk/solr/licenses/icu4j-52.1.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/licenses/icu4j-52.1.jar.sha1?rev=1547502&view=auto
==============================================================================
--- lucene/dev/trunk/solr/licenses/icu4j-52.1.jar.sha1 (added)
+++ lucene/dev/trunk/solr/licenses/icu4j-52.1.jar.sha1 Tue Dec 3 18:05:23 2013
@@ -0,0 +1 @@
+7dbc327670673acd14b487d120f05747d712c1c0