You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2019/04/13 12:09:10 UTC
[lucene-solr] branch master updated: LUCENE-8752: Add Japanese new imperial era '令和' (Reiwa) to the dictionary used in JapaneseTokenizer
This is an automated email from the ASF dual-hosted git repository.
tomoko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 7619c07 LUCENE-8752: Add Japanese new imperial era '令和' (Reiwa) to the dictionary used in JapaneseTokenizer
7619c07 is described below
commit 7619c07d3a80bb781f688c2cbbff33024142670a
Author: Tomoko Uchida <to...@apache.org>
AuthorDate: Sat Apr 13 21:07:27 2019 +0900
LUCENE-8752: Add Japanese new imperial era '令和' (Reiwa) to the dictionary used in JapaneseTokenizer
---
lucene/CHANGES.txt | 6 +++++-
lucene/analysis/kuromoji/build.xml | 7 ++++++-
.../ja/dict/TokenInfoDictionary$buffer.dat | Bin 4337216 -> 4337224 bytes
.../analysis/ja/dict/TokenInfoDictionary$fst.dat | Bin 1698563 -> 1698570 bytes
.../ja/dict/TokenInfoDictionary$targetMap.dat | Bin 392165 -> 392166 bytes
.../lucene/analysis/ja/TestJapaneseTokenizer.java | 12 ++++++++++++
.../src/tools/patches/Noun.proper.csv.patch | 7 +++++++
7 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 63adfcb..1ecde59 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -110,7 +110,7 @@ Improvements
* LUCENE-8750: Implements setMissingValue() on sort fields produced from
DoubleValuesSource and LongValuesSource (Mike Sokolov via Alan Woodward)
-
+
* LUCENE-8701: ToParentBlockJoinQuery now creates a child scorer that disallows skipping over
non-competitive documents if the score of a parent depends on the score of multiple
children (avg, max, min). Additionally the score mode `none` that assigns a constant score to
@@ -119,6 +119,10 @@ Improvements
* LUCENE-8751: Weight#matches now use the ScorerSupplier to build scorers with a lead cost of 1
(single document). (Jim Ferenczi)
+* LUCENE-8752: Japanese new era name '令和' (Reiwa) is added to the dictionary used in
+ JapaneseTokenizer so that the analyzer handles the era name correctly.
+ Reiwa is set to replace the Heisei Era on May 1, 2019. (Tomoko Uchida)
+
Changes in Runtime Behavior
* LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
diff --git a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml
index 0bce4b4..decfa7a 100644
--- a/lucene/analysis/kuromoji/build.xml
+++ b/lucene/analysis/kuromoji/build.xml
@@ -64,6 +64,11 @@
<untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
</target>
+ <target name="patch-dict" depends="download-dict">
+ <patch patchfile="src/tools/patches/Noun.proper.csv.patch"
+ originalfile="${dict.src.dir}/Noun.proper.csv"/>
+ </target>
+
<path id="tools.dependencies">
<fileset dir="../icu/lib"/>
</path>
@@ -81,7 +86,7 @@
<pathelement location="${build.dir}/classes/tools-test"/>
</path>
- <target name="build-dict" depends="compile-tools, download-dict">
+ <target name="build-dict" depends="compile-tools, patch-dict">
<sequential>
<delete verbose="true">
<fileset dir="${resources.dir}/org/apache/lucene/analysis/ja/dict" includes="**/*"/>
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat
index dcf430a..09f1e46 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
index 6cfad72..c06fd4a 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat
index 0e27345..13d09bc 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat differ
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
index cc69840..1a478db 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -836,4 +836,16 @@ public class
tokenizer.reset();
while (tokenizer.incrementToken());
}
+
+ public void testPatchedSystemDict() throws Exception {
+ assertAnalyzesTo(analyzer, "令和元年",
+ new String[]{"令和", "元年"},
+ new int[]{0, 2},
+ new int[]{2, 4});
+
+ assertAnalyzesTo(analyzerNormal, "令和元年",
+ new String[]{"令和", "元年"},
+ new int[]{0, 2},
+ new int[]{2, 4});
+ }
}
diff --git a/lucene/analysis/kuromoji/src/tools/patches/Noun.proper.csv.patch b/lucene/analysis/kuromoji/src/tools/patches/Noun.proper.csv.patch
new file mode 100644
index 0000000..ee845ab
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/tools/patches/Noun.proper.csv.patch
@@ -0,0 +1,7 @@
+--- Noun.proper.csv 2007-07-31 23:50:07.000000000 +0900
++++ Noun.proper.csv.20190403 2019-04-03 15:52:43.793191818 +0900
+@@ -27325,3 +27325,4 @@
+ �����ɡ,1288,1288,8538,̾��,��ͭ̾��,����,*,*,*,�����ɡ,���Υ��ϥ�,���Υ��ϥ�
+ �ɥ���,1288,1288,3765,̾��,��ͭ̾��,����,*,*,*,�ɥ���,�ɥ�����,�ɡ�����
+ �;뻳,1288,1288,8538,̾��,��ͭ̾��,����,*,*,*,�;뻳,�ȥ�����,�ȥ�����
++����,1288,1288,5904,̾��,��ͭ̾��,����,*,*,*,����,�쥤��,�쥤��