You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2016/02/18 02:10:13 UTC

lucene-solr git commit: LUCENE-7035: Upgrade icu4j to 56.1/unicode 8.

Repository: lucene-solr
Updated Branches:
  refs/heads/master 31437c9b4 -> b0a43aa1b


LUCENE-7035: Upgrade icu4j to 56.1/unicode 8.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b0a43aa1
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b0a43aa1
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b0a43aa1

Branch: refs/heads/master
Commit: b0a43aa1b2819133ec2ee69545a62358baf440b3
Parents: 31437c9
Author: Robert Muir <rm...@apache.org>
Authored: Wed Feb 17 20:01:27 2016 -0500
Committer: Robert Muir <rm...@apache.org>
Committed: Wed Feb 17 20:10:02 2016 -0500

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   4 ++
 lucene/analysis/icu/src/data/uax29/Khmer.rbbi   |  61 -------------------
 .../icu/src/data/utr30/DiacriticFolding.txt     |   6 +-
 .../icu/src/data/utr30/NativeDigitFolding.txt   |  10 +++
 .../segmentation/DefaultICUTokenizerConfig.java |   7 +--
 lucene/analysis/icu/src/java/overview.html      |   2 +-
 .../analysis/icu/segmentation/Default.brk       | Bin 34320 -> 35264 bytes
 .../lucene/analysis/icu/segmentation/Khmer.brk  | Bin 17296 -> 0 bytes
 .../org/apache/lucene/analysis/icu/utr30.nrm    | Bin 53728 -> 53840 bytes
 .../icu/segmentation/TestICUTokenizer.java      |   3 +
 lucene/ivy-versions.properties                  |   2 +-
 lucene/licenses/icu4j-54.1.jar.sha1             |   1 -
 lucene/licenses/icu4j-56.1.jar.sha1             |   1 +
 solr/licenses/icu4j-54.1.jar.sha1               |   1 -
 solr/licenses/icu4j-56.1.jar.sha1               |   1 +
 15 files changed, 26 insertions(+), 73 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 24632de..eea3a1b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -127,6 +127,10 @@ Tests
   expression to encapsulate a statement that is expected to throw an exception.
   (Ryan Ernst)
 
+Other
+
+* LUCENE-7035: Upgrade icu4j to 56.1/unicode 8. (Robert Muir)
+
 ======================= Lucene 5.6.0 =======================
 (No Changes)
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/data/uax29/Khmer.rbbi
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/uax29/Khmer.rbbi b/lucene/analysis/icu/src/data/uax29/Khmer.rbbi
deleted file mode 100644
index 43be268..0000000
--- a/lucene/analysis/icu/src/data/uax29/Khmer.rbbi
+++ /dev/null
@@ -1,61 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# 
-# Parses Khmer text, with orthographic syllable as token.
-#
-# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
-#
-# B = base character (consonant, independent vowel, etc)
-$KhmerBase = [\u1780-\u17B3];
-# R = robat
-$KhmerRobat = [\u17CC];
-# C = consonant shifter
-$KhmerShifter = [\u17C9\u17CA];
-# S = subscript consonant or independent vowel sign
-$KhmerSub = ([\u17D2] $KhmerBase);
-# V = dependent vowel sign
-$KhmerVowel = [\u17B4-\u17C5];
-# Z = zero-width joiner or non-joiner
-$KhmerZWC = [\u200C\u200D];
-# O = any other sign
-$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; 
-
-$WordJoin = [:Line_Break=Word_Joiner:];
-
-$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
-
-$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
-
-#
-# default numerical definitions
-#
-$Extend       = [\p{Word_Break = Extend}];
-$Format       = [\p{Word_Break = Format}];
-$MidNumLet    = [\p{Word_Break = MidNumLet}];
-$MidNum       = [\p{Word_Break = MidNum}];
-$Numeric      = [\p{Word_Break = Numeric}];
-$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                                                          
-$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
-$MidNumEx       = $MidNum       ($Extend |  $Format)*;
-$NumericEx      = $Numeric      ($Extend |  $Format)*;
-$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
-
-!!forward;
-$KhmerJoinedSyllableEx {200};
-
-# default numeric rules
-$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)*  {100};

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
index 9830754..3772daf 100644
--- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
@@ -62,7 +62,7 @@
 07A6..07B0>
 07EB..07F5>
 0818..0819>
-08E4..08FE>
+08E3..08FE>
 093C>
 094D>
 0951..0954>
@@ -149,7 +149,7 @@ AAF6>
 AB5B..AB5F>
 ABEC..ABED>
 FB1E>
-FE20..FE2D>
+FE20..FE2F>
 FF3E>
 FF40>
 FF70>
@@ -161,6 +161,7 @@ FFE3>
 11133..11134>
 11173>
 111C0>
+111CA..111CC>
 11235..11236>
 112E9..112EA>
 1133C>
@@ -171,6 +172,7 @@ FFE3>
 115BF..115C0>
 1163F>
 116B6..116B7>
+1172B>
 16AF0..16AF4>
 16F8F..16F9F>
 1D167..1D169>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
index 54cd114..62e6aef 100644
--- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
@@ -540,6 +540,16 @@ ABF9>0039   # MEETEI MAYEK DIGIT NINE
 116C7>0037   # TAKRI DIGIT SEVEN
 116C8>0038   # TAKRI DIGIT EIGHT
 116C9>0039   # TAKRI DIGIT NINE
+11730>0030   # AHOM DIGIT ZERO
+11731>0031   # AHOM DIGIT ONE
+11732>0032   # AHOM DIGIT TWO
+11733>0033   # AHOM DIGIT THREE
+11734>0034   # AHOM DIGIT FOUR
+11735>0035   # AHOM DIGIT FIVE
+11736>0036   # AHOM DIGIT SIX
+11737>0037   # AHOM DIGIT SEVEN
+11738>0038   # AHOM DIGIT EIGHT
+11739>0039   # AHOM DIGIT NINE
 118E0>0030   # WARANG CITI DIGIT ZERO
 118E1>0031   # WARANG CITI DIGIT ONE
 118E2>0032   # WARANG CITI DIGIT TWO

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
index dbf9b2e..b33663d 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
@@ -35,9 +35,7 @@ import com.ibm.icu.util.ULocale;
  * ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}), 
  * but with the following tailorings:
  * <ul>
- *   <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary. 
- *   <li>Khmer text is broken into syllables
- *   based on custom BreakIterator rules.
+ *   <li>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary. 
  * </ul>
  * @lucene.experimental
  */
@@ -65,8 +63,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   // the same as ROOT, except no dictionary segmentation for cjk
   private static final BreakIterator defaultBreakIterator = 
     readBreakIterator("Default.brk");
-  private static final BreakIterator khmerBreakIterator = 
-    readBreakIterator("Khmer.brk");
   
   // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
   private final boolean cjkAsWords;
@@ -91,7 +87,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   @Override
   public BreakIterator getBreakIterator(int script) {
     switch(script) {
-      case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
       case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
       default: return (BreakIterator)defaultBreakIterator.clone();
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/overview.html b/lucene/analysis/icu/src/java/overview.html
index 5a836d9..abb2e2a 100644
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@@ -353,7 +353,7 @@ and
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 6.3). However, some users who wish
+the most recent version of Unicode (currently 8.0). However, some users who wish
 for stronger backwards compatibility can restrict
 {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
 a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
index af2727c..5b84797 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
deleted file mode 100644
index dc19835..0000000
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk and /dev/null differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
index cb0d934..2680264 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index f60954f..6398b2c 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -129,6 +129,9 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
         "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
   }
   
+  public void testKhmer() throws Exception {
+    assertAnalyzesTo(a, "ផ្ទះស្កឹមស្កៃបីបួនខ្នងនេះ", new String[] { "ផ្ទះ", "ស្កឹមស្កៃ", "បី", "បួន", "ខ្នង", "នេះ" });
+  }
   public void testLao() throws Exception {
     assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
     assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/ivy-versions.properties
----------------------------------------------------------------------
diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties
index 5bb6d04..1f832e9 100644
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@@ -39,7 +39,7 @@ com.google.inject.guice.version = 3.0
 /com.google.protobuf/protobuf-java = 2.5.0
 /com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
 /com.googlecode.mp4parser/isoparser = 1.0.2
-/com.ibm.icu/icu4j = 54.1
+/com.ibm.icu/icu4j = 56.1
 /com.pff/java-libpst = 0.8.1
 /com.spatial4j/spatial4j = 0.5
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/licenses/icu4j-54.1.jar.sha1
----------------------------------------------------------------------
diff --git a/lucene/licenses/icu4j-54.1.jar.sha1 b/lucene/licenses/icu4j-54.1.jar.sha1
deleted file mode 100644
index 25d6eb3..0000000
--- a/lucene/licenses/icu4j-54.1.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-3f66ecd5871467598bc81662817b80612a0a907f

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/licenses/icu4j-56.1.jar.sha1
----------------------------------------------------------------------
diff --git a/lucene/licenses/icu4j-56.1.jar.sha1 b/lucene/licenses/icu4j-56.1.jar.sha1
new file mode 100644
index 0000000..5f8e046
--- /dev/null
+++ b/lucene/licenses/icu4j-56.1.jar.sha1
@@ -0,0 +1 @@
+8dd6671f52165a0419e6de5e1016400875a90fa9

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/solr/licenses/icu4j-54.1.jar.sha1
----------------------------------------------------------------------
diff --git a/solr/licenses/icu4j-54.1.jar.sha1 b/solr/licenses/icu4j-54.1.jar.sha1
deleted file mode 100644
index 25d6eb3..0000000
--- a/solr/licenses/icu4j-54.1.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-3f66ecd5871467598bc81662817b80612a0a907f

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/solr/licenses/icu4j-56.1.jar.sha1
----------------------------------------------------------------------
diff --git a/solr/licenses/icu4j-56.1.jar.sha1 b/solr/licenses/icu4j-56.1.jar.sha1
new file mode 100644
index 0000000..5f8e046
--- /dev/null
+++ b/solr/licenses/icu4j-56.1.jar.sha1
@@ -0,0 +1 @@
+8dd6671f52165a0419e6de5e1016400875a90fa9