Posted to commits@lucene.apache.org by rm...@apache.org on 2013/12/03 21:11:36 UTC

svn commit: r1547561 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ lucene/analysis/icu/src/da...

Author: rmuir
Date: Tue Dec  3 20:11:35 2013
New Revision: 1547561

URL: http://svn.apache.org/r1547561
Log:
LUCENE-4381: upgrade ICU to icu4j 52.1

Added:
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
      - copied unchanged from r1547502, lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
    lucene/dev/branches/branch_4x/lucene/licenses/icu4j-52.1.jar.sha1
      - copied unchanged from r1547502, lucene/dev/trunk/lucene/licenses/icu4j-52.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/icu4j-52.1.jar.sha1
      - copied unchanged from r1547502, lucene/dev/trunk/solr/licenses/icu4j-52.1.jar.sha1
Removed:
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/uax29/Hebrew.rbbi
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/uax29/Lao.rbbi
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java
    lucene/dev/branches/branch_4x/lucene/licenses/icu4j-49.1.jar.sha1
    lucene/dev/branches/branch_4x/solr/licenses/icu4j-49.1.jar.sha1
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/uax29/Default.rbbi
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfc.txt
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc.txt
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/overview.html
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
    lucene/dev/branches/branch_4x/lucene/ivy-versions.properties   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/licenses/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/   (props changed)

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Dec  3 20:11:35 2013
@@ -34,6 +34,8 @@ Build
 * LUCENE-5347: Upgrade forbidden-apis checker to version 1.4.
   (Uwe Schindler)
 
+* LUCENE-4381: Upgrade analysis/icu to 52.1. (Robert Muir)
+
 Bug fixes
 
 * LUCENE-5285: Improved highlighting of multi-valued fields with

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro Tue Dec  3 20:11:35 2013
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-// Generated using ICU4J 49.1.0.0
+// Generated using ICU4J 52.1.0.0
 // by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
 
 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro Tue Dec  3 20:11:35 2013
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-// Generated using ICU4J 49.1.0.0
+// Generated using ICU4J 52.1.0.0
 // by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
 
 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/uax29/Default.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/uax29/Default.rbbi?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/uax29/Default.rbbi (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/uax29/Default.rbbi Tue Dec  3 20:11:35 2013
@@ -14,27 +14,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Default RBBI rules, based on UAX#29.
+# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
 #
+# Copyright (C) 2002-2013, International Business Machines Corporation 
+# and others. All Rights Reserved.
+#
+# file:  word.txt
+#
+# ICU Word Break Rules
+#      See Unicode Standard Annex #29.
+#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
+#
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
+
+##############################################################################
+#
+#  Character class definitions from TR 29
+#
+##############################################################################
 
 !!chain;
 
+
 #
 #  Character Class Definitions.
 #
 
-$CR           = [\p{Word_Break = CR}];
-$LF           = [\p{Word_Break = LF}];
-$Newline      = [\p{Word_Break = Newline}];
-$Extend       = [\p{Word_Break = Extend}];
-$Format       = [\p{Word_Break = Format}];
-$Katakana     = [\p{Word_Break = Katakana}];
-$ALetter      = [\p{Word_Break = ALetter}];
-$MidNumLet    = [\p{Word_Break = MidNumLet}];
-$MidLetter    = [\p{Word_Break = MidLetter}];
-$MidNum       = [\p{Word_Break = MidNum}];
-$Numeric      = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
-$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidLetter          = [\p{Word_Break = MidLetter}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+
+$Han                = [:Han:];
+$Hiragana           = [:Hiragana:];
 
 
 #   Dictionary character set, for triggering language-based break engines. Currently
@@ -42,24 +67,34 @@ $ExtendNumLet = [\p{Word_Break = ExtendN
 #   5.0 or later as the definition of Complex_Context was corrected to include all
 #   characters requiring dictionary break.
 
-$dictionary   = [:LineBreak = Complex_Context:];
 $Control        = [\p{Grapheme_Cluster_Break = Control}]; 
-$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
-                                                             #  include the dictionary characters.
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$Han $Hiragana $HangulSyllable];
+$dictionary     = [$ComplexContext];
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+
 
 #
 #  Rules 4    Ignore Format and Extend characters, 
 #             except when they appear at the beginning of a region of text.
 #
-$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
-$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
-$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
-$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
-$MidNumEx       = $MidNum       ($Extend |  $Format)*;
-$NumericEx      = $Numeric      ($Extend |  $Format)*;
-$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+$KatakanaEx           = $Katakana           ($Extend |  $Format)*;
+$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format)*;
+$ALetterEx            = $ALetterPlus        ($Extend |  $Format)*;
+$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format)*;
+$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format)*;
+$MidNumLetEx          = $MidNumLet          ($Extend |  $Format)*;
+$MidLetterEx          = $MidLetter          ($Extend |  $Format)*;
+$MidNumEx             = $MidNum             ($Extend |  $Format)*;
+$NumericEx            = $Numeric            ($Extend |  $Format)*;
+$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
 
-$Hiragana       = [\p{script=Hiragana}];
 $Ideographic    = [\p{Ideographic}];
 $HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
 $IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
@@ -77,23 +112,31 @@ $CR $LF;
 #          of a region of Text.   The rule here comes into play when the start of text
 #          begins with a group of Format chars, or with a "word" consisting of a single
 #          char that is not in any of the listed word break categories followed by
-#          format char(s).
+#          format char(s), or is not a CJK dictionary character.
 [^$CR $LF $Newline]? ($Extend |  $Format)+;
 
 $NumericEx {100};
 $ALetterEx {200};
+$HangulSyllable {200};
+$Hebrew_LetterEx{200};
 $KatakanaEx {300};       # note:  these status values override those from rule 5
-$HiraganaEx {300};       #        by virtual of being numerically larger.
+$HiraganaEx {300};       #        by virtue of being numerically larger.
 $IdeographicEx {400};    #
 
 #
 # rule 5
 #    Do not break between most letters.
 #
-$ALetterEx $ALetterEx {200};
+($ALetterEx | $Hebrew_LetterEx)  ($ALetterEx | $Hebrew_LetterEx) {200};
 
 # rule 6 and 7
-$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
+
+# rule 7a
+$Hebrew_LetterEx $Single_QuoteEx {200};
+
+# rule 7b and 7c
+$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
 
 # rule 8
 
@@ -101,27 +144,35 @@ $NumericEx $NumericEx {100};
 
 # rule 9
 
-$ALetterEx $NumericEx {200};
+($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
 
 # rule 10
 
-$NumericEx $ALetterEx {200};
+$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
 
 # rule 11 and 12 
 
-$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
 
 # rule 13
-
 $KatakanaEx  $KatakanaEx {300};
 
 # rule 13a/b
 
-$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
-$NumericEx      $ExtendNumLetEx {100};    #  (13a)
-$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
-$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
-
-$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
-$ExtendNumLetEx $NumericEx  {100};    #  (13b)
-$ExtendNumLetEx $KatakanaEx {300};    #  (13b)
+$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
+$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
+$NumericEx       $ExtendNumLetEx {100};    #  (13a)
+$KatakanaEx      $ExtendNumLetEx {300};    #  (13a)
+$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)
+
+$ExtendNumLetEx  $ALetterEx      {200};    #  (13b)
+$ExtendNumLetEx  $Hebrew_Letter  {200};    #  (13b)
+$ExtendNumLetEx  $NumericEx      {100};    #  (13b)
+$ExtendNumLetEx  $KatakanaEx     {300};    #  (13b)
+
+# rule 13c
+
+$Regional_IndicatorEx $Regional_IndicatorEx;
+
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt Tue Dec  3 20:11:35 2013
@@ -78,7 +78,6 @@ FF0D>002D
 ## Space Folding
 # Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
 1680>0020
-180E>0020
 
 ## Spacing Accents folding (done by kd)
 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfc.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfc.txt?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfc.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfc.txt Tue Dec  3 20:11:35 2013
@@ -1,4 +1,4 @@
-# Copyright (C) 1999-2012, International Business Machines
+# Copyright (C) 1999-2013, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
 # file name: nfc.txt
@@ -7,7 +7,7 @@
 #
 # Complete data for Unicode NFC normalization.
 
-* Unicode 6.1.0
+* Unicode 6.3.0
 
 # Canonical_Combining_Class (ccc) values
 0300..0314:230

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc.txt?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc.txt Tue Dec  3 20:11:35 2013
@@ -1,4 +1,4 @@
-# Copyright (C) 1999-2012, International Business Machines
+# Copyright (C) 1999-2013, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
 # file name: nfkc.txt
@@ -11,7 +11,7 @@
 # to NFKC one-way mappings.
 # Use this file as the second gennorm2 input file after nfc.txt.
 
-* Unicode 6.1.0
+* Unicode 6.3.0
 
 00A0>0020
 00A8>0020 0308

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt Tue Dec  3 20:11:35 2013
@@ -1,5 +1,5 @@
 # Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2013 Unicode, Inc.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 # For documentation, see http://www.unicode.org/reports/tr44/
 #
@@ -12,7 +12,7 @@
 # and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
 # Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
 
-* Unicode 6.1.0
+* Unicode 6.3.0
 
 0041>0061
 0042>0062
@@ -537,6 +537,7 @@
 0555>0585
 0556>0586
 0587>0565 0582
+061C>
 0675>0627 0674
 0676>0648 0674
 0677>06C7 0674
@@ -627,7 +628,7 @@
 10FC>10DC
 115F..1160>
 17B4..17B5>
-180B..180D>
+180B..180E>
 1D2C>0061
 1D2D>00E6
 1D2E>0062

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java Tue Dec  3 20:11:35 2013
@@ -21,7 +21,6 @@ import java.text.CharacterIterator;
 
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.DictionaryBasedBreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UTF16;
 
@@ -60,15 +59,12 @@ abstract class BreakIteratorWrapper {
   }
 
   /**
-   * If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
-   * treat it like a generic BreakIterator If its any other
-   * RuleBasedBreakIterator, the rule status can be used for token type. If its
+   * If its a RuleBasedBreakIterator, the rule status can be used for token type. If its
    * any other BreakIterator, the rulestatus method is not available, so treat
    * it like a generic BreakIterator.
    */
   static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
-    if (breakIterator instanceof RuleBasedBreakIterator
-        && !(breakIterator instanceof DictionaryBasedBreakIterator))
+    if (breakIterator instanceof RuleBasedBreakIterator)
       return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
     else
       return new BIWrapper(breakIterator);

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java Tue Dec  3 20:11:35 2013
@@ -41,12 +41,13 @@ final class CompositeBreakIterator {
   private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
 
   private BreakIteratorWrapper rbbi;
-  private final ScriptIterator scriptIterator = new ScriptIterator();
+  private final ScriptIterator scriptIterator;
 
   private char text[];
 
   CompositeBreakIterator(ICUTokenizerConfig config) {
     this.config = config;
+    this.scriptIterator = new ScriptIterator(config.combineCJ());
   }
 
   /**

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java Tue Dec  3 20:11:35 2013
@@ -35,12 +35,9 @@ import com.ibm.icu.util.ULocale;
  * ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}), 
  * but with the following tailorings:
  * <ul>
- *   <li>Thai text is broken into words with a 
- *   {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
- *   <li>Lao, Myanmar, and Khmer text is broken into syllables
+ *   <li>Thai, Lao, and CJK text is broken into words with a dictionary. 
+ *   <li>Myanmar, and Khmer text is broken into syllables
  *   based on custom BreakIterator rules.
- *   <li>Hebrew text has custom tailorings to handle special cases
- *   involving punctuation.
  * </ul>
  * @lucene.experimental
  */
@@ -62,34 +59,44 @@ public class DefaultICUTokenizerConfig e
    * the default breakiterators in use. these can be expensive to
    * instantiate, cheap to clone.
    */  
-  private static final BreakIterator rootBreakIterator = 
+  // we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
+  // is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
+  private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
+  // the same as ROOT, except no dictionary segmentation for cjk
+  private static final BreakIterator defaultBreakIterator = 
     readBreakIterator("Default.brk");
-  private static final BreakIterator thaiBreakIterator = 
-    BreakIterator.getWordInstance(new ULocale("th_TH"));
-  private static final BreakIterator hebrewBreakIterator = 
-    readBreakIterator("Hebrew.brk");
   private static final BreakIterator khmerBreakIterator = 
     readBreakIterator("Khmer.brk");
-  private static final BreakIterator laoBreakIterator = 
-    new LaoBreakIterator(readBreakIterator("Lao.brk"));
   private static final BreakIterator myanmarBreakIterator = 
     readBreakIterator("Myanmar.brk");
   
+  // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
+  private final boolean cjkAsWords;
+  
   /** 
    * Creates a new config. This object is lightweight, but the first
    * time the class is referenced, breakiterators will be initialized.
+   * @param cjkAsWords true if cjk text should undergo dictionary-based segmentation, 
+   *                   otherwise text will be segmented according to UAX#29 defaults.
+   *                   If this is true, all Han+Hiragana+Katakana words will be tagged as
+   *                   IDEOGRAPHIC.
    */
-  public DefaultICUTokenizerConfig() {}
+  public DefaultICUTokenizerConfig(boolean cjkAsWords) { 
+    this.cjkAsWords = cjkAsWords;
+  }
+  
+  @Override
+  public boolean combineCJ() {
+    return cjkAsWords;
+  }
 
   @Override
   public BreakIterator getBreakIterator(int script) {
     switch(script) {
-      case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
-      case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
       case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
-      case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
       case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
-      default: return (BreakIterator)rootBreakIterator.clone();
+      case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
+      default: return (BreakIterator)defaultBreakIterator.clone();
     }
   }
 

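For context, a minimal sketch (not part of this commit) of how the new boolean flag is wired up, matching the ICUTokenizer and test changes below:

    import java.io.StringReader;
    import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;

    // the no-arg tokenizer now defaults to DefaultICUTokenizerConfig(true):
    // Chinese/Japanese runs go through ICU's dictionary-based word segmentation
    ICUTokenizer cjkAsWords = new ICUTokenizer(new StringReader("..."));

    // pass false to keep plain UAX#29 segmentation for CJK, as the updated
    // tests in TestICUTokenizer and TestWithCJKBigramFilter do
    ICUTokenizer uax29Only = new ICUTokenizer(new StringReader("..."),
                                              new DefaultICUTokenizerConfig(false));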
Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java Tue Dec  3 20:11:35 2013
@@ -68,7 +68,7 @@ public final class ICUTokenizer extends 
    * @see DefaultICUTokenizerConfig
    */
   public ICUTokenizer(Reader input) {
-    this(input, new DefaultICUTokenizerConfig());
+    this(input, new DefaultICUTokenizerConfig(true));
   }
 
   /**

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java Tue Dec  3 20:11:35 2013
@@ -36,4 +36,6 @@ public abstract class ICUTokenizerConfig
   /** Return a token type value for a given script and BreakIterator
    *  rule status. */
   public abstract String getType(int script, int ruleStatus);
+  /** true if Han, Hiragana, and Katakana scripts should all be returned as Japanese */
+  public abstract boolean combineCJ();
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java Tue Dec  3 20:11:35 2013
@@ -70,7 +70,7 @@ import com.ibm.icu.text.RuleBasedBreakIt
  * <pre class="prettyprint" >
  * &lt;fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
- *     &lt;tokenizer class="solr.ICUTokenizerFactory"
+ *     &lt;tokenizer class="solr.ICUTokenizerFactory" cjkAsWords="true"
  *                rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
@@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends
   static final String RULEFILES = "rulefiles";
   private final Map<Integer,String> tailored;
   private ICUTokenizerConfig config;
+  private final boolean cjkAsWords;
   
   /** Creates a new ICUTokenizerFactory */
   public ICUTokenizerFactory(Map<String,String> args) {
@@ -94,6 +95,7 @@ public class ICUTokenizerFactory extends
         tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
       }
     }
+    cjkAsWords = getBoolean(args, "cjkAsWords", true);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -103,7 +105,7 @@ public class ICUTokenizerFactory extends
   public void inform(ResourceLoader loader) throws IOException {
     assert tailored != null : "init must be called first!";
     if (tailored.isEmpty()) {
-      config = new DefaultICUTokenizerConfig();
+      config = new DefaultICUTokenizerConfig(cjkAsWords);
     } else {
       final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
       for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
@@ -111,7 +113,7 @@ public class ICUTokenizerFactory extends
         String resourcePath = entry.getValue();
         breakers[code] = parseRules(resourcePath, loader);
       }
-      config = new DefaultICUTokenizerConfig() {
+      config = new DefaultICUTokenizerConfig(cjkAsWords) {
         
         @Override
         public BreakIterator getBreakIterator(int script) {

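A hedged sketch (not from this commit) of the factory side: the new cjkAsWords attribute is consumed in the constructor and passed to DefaultICUTokenizerConfig in inform(). The luceneMatchVersion value and the resource loader below are illustrative assumptions, not part of the change:

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory;
    import org.apache.lucene.analysis.util.ClasspathResourceLoader;

    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", "4.6");   // illustrative; use whatever your setup requires
    args.put("cjkAsWords", "false");         // omitted -> defaults to true (dictionary-based CJK)
    ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
    factory.inform(new ClasspathResourceLoader());   // builds DefaultICUTokenizerConfig(cjkAsWords)
    Tokenizer tok = factory.create(new StringReader("..."));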
Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java Tue Dec  3 20:11:35 2013
@@ -59,6 +59,15 @@ final class ScriptIterator {
   private int scriptStart;
   private int scriptLimit;
   private int scriptCode;
+  
+  private final boolean combineCJ;
+  
+  /**
+   * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
+   */
+  ScriptIterator(boolean combineCJ) {
+    this.combineCJ = combineCJ;
+  }
 
   /**
    * Get the start of this script run
@@ -162,10 +171,24 @@ final class ScriptIterator {
   }
 
   /** fast version of UScript.getScript(). Basic Latin is an array lookup */
-  private static int getScript(int codepoint) {
-    if (0 <= codepoint && codepoint < basicLatin.length)
+  private int getScript(int codepoint) {
+    if (0 <= codepoint && codepoint < basicLatin.length) {
       return basicLatin[codepoint];
-    else
-      return UScript.getScript(codepoint);
+    } else {
+      int script = UScript.getScript(codepoint);
+      if (combineCJ) {
+        if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
+          return UScript.JAPANESE;
+        } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
+          // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
+          // they are treated as punctuation. we currently have no cleaner way to fix this!
+          return UScript.LATIN; 
+        } else {
+          return script;
+        }
+      } else {
+        return script;
+      }
+    }
   }
 }

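The combineCJ branch above, restated as a hedged standalone sketch for readability (combinedScript is an illustrative name, not part of the commit):

    import com.ibm.icu.lang.UScript;

    static int combinedScript(int codepoint) {
      int script = UScript.getScript(codepoint);
      if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
        // collapse Chinese/Japanese runs into one script so a single
        // dictionary-based breaker sees the whole run
        return UScript.JAPANESE;
      } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
        // keep fullwidth digits out of the dictionary path, which would
        // otherwise treat them as punctuation
        return UScript.LATIN;
      }
      return script;
    }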
Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java Tue Dec  3 20:11:35 2013
@@ -84,6 +84,10 @@ public class ScriptAttributeImpl extends
 
   @Override
   public void reflectWith(AttributeReflector reflector) {
-    reflector.reflect(ScriptAttribute.class, "script", getName());
+    // when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to 
+    // mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset), 
+    // but this is just to help prevent confusion.
+    String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
+    reflector.reflect(ScriptAttribute.class, "script", name);
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/overview.html?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/overview.html (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/overview.html Tue Dec  3 20:11:35 2013
@@ -353,7 +353,7 @@ and 
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 6.1). However, some users who wish
+the most recent version of Unicode (currently 6.3). However, some users who wish
 for stronger backwards compatibility can restrict
 {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
 a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java Tue Dec  3 20:11:35 2013
@@ -43,7 +43,7 @@ public class TestICUTokenizer extends Ba
     sb.append(whitespace);
     sb.append("testing 1234");
     String input = sb.toString();
-    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
     assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
   }
   
@@ -53,7 +53,7 @@ public class TestICUTokenizer extends Ba
       sb.append('a');
     }
     String input = sb.toString();
-    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
+    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
     char token[] = new char[4096];
     Arrays.fill(token, 'a');
     String expectedToken = new String(token);
@@ -70,7 +70,7 @@ public class TestICUTokenizer extends Ba
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer tokenizer = new ICUTokenizer(reader);
+      Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
       TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
       return new TokenStreamComponents(tokenizer, filter);
     }
@@ -119,6 +119,7 @@ public class TestICUTokenizer extends Ba
   
   public void testLao() throws Exception {
     assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
+    assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
   }
   
   public void testThai() throws Exception {
@@ -139,6 +140,13 @@ public class TestICUTokenizer extends Ba
         new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
   }
   
+  public void testHebrew() throws Exception {
+    assertAnalyzesTo(a, "דנקנר תקף את הדו\"ח",
+        new String[] { "דנקנר", "תקף", "את", "הדו\"ח" });
+    assertAnalyzesTo(a, "חברת בת של מודי'ס",
+        new String[] { "חברת", "בת", "של", "מודי'ס" });
+  }
+  
   public void testEmpty() throws Exception {
     assertAnalyzesTo(a, "", new String[] {});
     assertAnalyzesTo(a, ".", new String[] {});

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java Tue Dec  3 20:11:35 2013
@@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter ext
   private Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer source = new ICUTokenizer(reader);
+      Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
       TokenStream result = new CJKBigramFilter(source);
       return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
     }
@@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter ext
   private Analyzer analyzer2 = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer source = new ICUTokenizer(reader);
+      Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
       // we put this before the CJKBigramFilter, because the normalization might combine
       // some halfwidth katakana forms, which will affect the bigramming.
       TokenStream result = new ICUNormalizer2Filter(source);

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java Tue Dec  3 20:11:35 2013
@@ -62,7 +62,7 @@ import java.util.regex.Pattern;
 public class GenerateUTR30DataFiles {
   private static final String ICU_SVN_TAG_URL
       = "http://source.icu-project.org/repos/icu/icu/tags";
-  private static final String ICU_RELEASE_TAG = "release-49-1-2";
+  private static final String ICU_RELEASE_TAG = "release-52-1";
   private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
   private static final String NFC_TXT = "nfc.txt";
   private static final String NFKC_TXT = "nfkc.txt";

Modified: lucene/dev/branches/branch_4x/lucene/ivy-versions.properties
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/ivy-versions.properties?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/ivy-versions.properties (original)
+++ lucene/dev/branches/branch_4x/lucene/ivy-versions.properties Tue Dec  3 20:11:35 2013
@@ -16,7 +16,7 @@ com.carrotsearch.randomizedtesting.versi
 /com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
 /com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
 /com.googlecode.mp4parser/isoparser = 1.0-RC-1
-/com.ibm.icu/icu4j = 49.1
+/com.ibm.icu/icu4j = 52.1
 /com.spatial4j/spatial4j = 0.3
 /com.sun.jersey/jersey-core = 1.16
 /commons-beanutils/commons-beanutils = 1.7.0

Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1547561&r1=1547560&r2=1547561&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Tue Dec  3 20:11:35 2013
@@ -635,7 +635,7 @@ public abstract class BaseTokenStreamTes
     int charUpto = 0;
     final StringBuilder sb = new StringBuilder();
     while (charUpto < s.length()) {
-      final int c = s.codePointAt(charUpto);
+      final int c = s.charAt(charUpto);
       if (c == 0xa) {
         // Strangely, you cannot put \ u000A into Java
         // sources (not in a comment nor a string
@@ -655,7 +655,7 @@ public abstract class BaseTokenStreamTes
         // don't escape...
         sb.append(String.format(Locale.ROOT, "\\u%04x", c));
       }
-      charUpto += Character.charCount(c);
+      charUpto++;
     }
     return sb.toString();
   }