You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2016/02/18 02:10:13 UTC
lucene-solr git commit: LUCENE-7035: Upgrade icu4j to 56.1/unicode 8.
Repository: lucene-solr
Updated Branches:
refs/heads/master 31437c9b4 -> b0a43aa1b
LUCENE-7035: Upgrade icu4j to 56.1/unicode 8.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b0a43aa1
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b0a43aa1
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b0a43aa1
Branch: refs/heads/master
Commit: b0a43aa1b2819133ec2ee69545a62358baf440b3
Parents: 31437c9
Author: Robert Muir <rm...@apache.org>
Authored: Wed Feb 17 20:01:27 2016 -0500
Committer: Robert Muir <rm...@apache.org>
Committed: Wed Feb 17 20:10:02 2016 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 ++
lucene/analysis/icu/src/data/uax29/Khmer.rbbi | 61 -------------------
.../icu/src/data/utr30/DiacriticFolding.txt | 6 +-
.../icu/src/data/utr30/NativeDigitFolding.txt | 10 +++
.../segmentation/DefaultICUTokenizerConfig.java | 7 +--
lucene/analysis/icu/src/java/overview.html | 2 +-
.../analysis/icu/segmentation/Default.brk | Bin 34320 -> 35264 bytes
.../lucene/analysis/icu/segmentation/Khmer.brk | Bin 17296 -> 0 bytes
.../org/apache/lucene/analysis/icu/utr30.nrm | Bin 53728 -> 53840 bytes
.../icu/segmentation/TestICUTokenizer.java | 3 +
lucene/ivy-versions.properties | 2 +-
lucene/licenses/icu4j-54.1.jar.sha1 | 1 -
lucene/licenses/icu4j-56.1.jar.sha1 | 1 +
solr/licenses/icu4j-54.1.jar.sha1 | 1 -
solr/licenses/icu4j-56.1.jar.sha1 | 1 +
15 files changed, 26 insertions(+), 73 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 24632de..eea3a1b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -127,6 +127,10 @@ Tests
expression to encapsulate a statement that is expected to throw an exception.
(Ryan Ernst)
+Other
+
+* LUCENE-7035: Upgrade icu4j to 56.1/unicode 8. (Robert Muir)
+
======================= Lucene 5.6.0 =======================
(No Changes)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/data/uax29/Khmer.rbbi
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/uax29/Khmer.rbbi b/lucene/analysis/icu/src/data/uax29/Khmer.rbbi
deleted file mode 100644
index 43be268..0000000
--- a/lucene/analysis/icu/src/data/uax29/Khmer.rbbi
+++ /dev/null
@@ -1,61 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-#
-# Parses Khmer text, with orthographic syllable as token.
-#
-# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
-#
-# B = base character (consonant, independent vowel, etc)
-$KhmerBase = [\u1780-\u17B3];
-# R = robat
-$KhmerRobat = [\u17CC];
-# C = consonant shifter
-$KhmerShifter = [\u17C9\u17CA];
-# S = subscript consonant or independent vowel sign
-$KhmerSub = ([\u17D2] $KhmerBase);
-# V = dependent vowel sign
-$KhmerVowel = [\u17B4-\u17C5];
-# Z = zero-width joiner or non-joiner
-$KhmerZWC = [\u200C\u200D];
-# O = any other sign
-$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
-
-$WordJoin = [:Line_Break=Word_Joiner:];
-
-$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
-
-$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
-
-#
-# default numerical definitions
-#
-$Extend = [\p{Word_Break = Extend}];
-$Format = [\p{Word_Break = Format}];
-$MidNumLet = [\p{Word_Break = MidNumLet}];
-$MidNum = [\p{Word_Break = MidNum}];
-$Numeric = [\p{Word_Break = Numeric}];
-$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
-$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
-$MidNumEx = $MidNum ($Extend | $Format)*;
-$NumericEx = $Numeric ($Extend | $Format)*;
-$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
-
-!!forward;
-$KhmerJoinedSyllableEx {200};
-
-# default numeric rules
-$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
index 9830754..3772daf 100644
--- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
@@ -62,7 +62,7 @@
07A6..07B0>
07EB..07F5>
0818..0819>
-08E4..08FE>
+08E3..08FE>
093C>
094D>
0951..0954>
@@ -149,7 +149,7 @@ AAF6>
AB5B..AB5F>
ABEC..ABED>
FB1E>
-FE20..FE2D>
+FE20..FE2F>
FF3E>
FF40>
FF70>
@@ -161,6 +161,7 @@ FFE3>
11133..11134>
11173>
111C0>
+111CA..111CC>
11235..11236>
112E9..112EA>
1133C>
@@ -171,6 +172,7 @@ FFE3>
115BF..115C0>
1163F>
116B6..116B7>
+1172B>
16AF0..16AF4>
16F8F..16F9F>
1D167..1D169>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
index 54cd114..62e6aef 100644
--- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
@@ -540,6 +540,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
116C7>0037 # TAKRI DIGIT SEVEN
116C8>0038 # TAKRI DIGIT EIGHT
116C9>0039 # TAKRI DIGIT NINE
+11730>0030 # AHOM DIGIT ZERO
+11731>0031 # AHOM DIGIT ONE
+11732>0032 # AHOM DIGIT TWO
+11733>0033 # AHOM DIGIT THREE
+11734>0034 # AHOM DIGIT FOUR
+11735>0035 # AHOM DIGIT FIVE
+11736>0036 # AHOM DIGIT SIX
+11737>0037 # AHOM DIGIT SEVEN
+11738>0038 # AHOM DIGIT EIGHT
+11739>0039 # AHOM DIGIT NINE
118E0>0030 # WARANG CITI DIGIT ZERO
118E1>0031 # WARANG CITI DIGIT ONE
118E2>0032 # WARANG CITI DIGIT TWO
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
index dbf9b2e..b33663d 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
@@ -35,9 +35,7 @@ import com.ibm.icu.util.ULocale;
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
* but with the following tailorings:
* <ul>
- * <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary.
- * <li>Khmer text is broken into syllables
- * based on custom BreakIterator rules.
+ * <li>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary.
* </ul>
* @lucene.experimental
*/
@@ -65,8 +63,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
// the same as ROOT, except no dictionary segmentation for cjk
private static final BreakIterator defaultBreakIterator =
readBreakIterator("Default.brk");
- private static final BreakIterator khmerBreakIterator =
- readBreakIterator("Khmer.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
private final boolean cjkAsWords;
@@ -91,7 +87,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
@Override
public BreakIterator getBreakIterator(int script) {
switch(script) {
- case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
default: return (BreakIterator)defaultBreakIterator.clone();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/overview.html b/lucene/analysis/icu/src/java/overview.html
index 5a836d9..abb2e2a 100644
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@@ -353,7 +353,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 6.3). However, some users who wish
+the most recent version of Unicode (currently 8.0). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
index af2727c..5b84797 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
deleted file mode 100644
index dc19835..0000000
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk and /dev/null differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
index cb0d934..2680264 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index f60954f..6398b2c 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -129,6 +129,9 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
}
+ public void testKhmer() throws Exception {
+ assertAnalyzesTo(a, "ផ្ទះស្កឹមស្កៃបីបួនខ្នងនេះ", new String[] { "ផ្ទះ", "ស្កឹមស្កៃ", "បី", "បួន", "ខ្នង", "នេះ" });
+ }
public void testLao() throws Exception {
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/ivy-versions.properties
----------------------------------------------------------------------
diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties
index 5bb6d04..1f832e9 100644
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@@ -39,7 +39,7 @@ com.google.inject.guice.version = 3.0
/com.google.protobuf/protobuf-java = 2.5.0
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
/com.googlecode.mp4parser/isoparser = 1.0.2
-/com.ibm.icu/icu4j = 54.1
+/com.ibm.icu/icu4j = 56.1
/com.pff/java-libpst = 0.8.1
/com.spatial4j/spatial4j = 0.5
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/licenses/icu4j-54.1.jar.sha1
----------------------------------------------------------------------
diff --git a/lucene/licenses/icu4j-54.1.jar.sha1 b/lucene/licenses/icu4j-54.1.jar.sha1
deleted file mode 100644
index 25d6eb3..0000000
--- a/lucene/licenses/icu4j-54.1.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-3f66ecd5871467598bc81662817b80612a0a907f
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/lucene/licenses/icu4j-56.1.jar.sha1
----------------------------------------------------------------------
diff --git a/lucene/licenses/icu4j-56.1.jar.sha1 b/lucene/licenses/icu4j-56.1.jar.sha1
new file mode 100644
index 0000000..5f8e046
--- /dev/null
+++ b/lucene/licenses/icu4j-56.1.jar.sha1
@@ -0,0 +1 @@
+8dd6671f52165a0419e6de5e1016400875a90fa9
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/solr/licenses/icu4j-54.1.jar.sha1
----------------------------------------------------------------------
diff --git a/solr/licenses/icu4j-54.1.jar.sha1 b/solr/licenses/icu4j-54.1.jar.sha1
deleted file mode 100644
index 25d6eb3..0000000
--- a/solr/licenses/icu4j-54.1.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-3f66ecd5871467598bc81662817b80612a0a907f
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b0a43aa1/solr/licenses/icu4j-56.1.jar.sha1
----------------------------------------------------------------------
diff --git a/solr/licenses/icu4j-56.1.jar.sha1 b/solr/licenses/icu4j-56.1.jar.sha1
new file mode 100644
index 0000000..5f8e046
--- /dev/null
+++ b/solr/licenses/icu4j-56.1.jar.sha1
@@ -0,0 +1 @@
+8dd6671f52165a0419e6de5e1016400875a90fa9