You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2019/04/13 12:09:10 UTC
[lucene-solr] branch master updated: LUCENE-8752: Add Japanese new imperial era '令和' (Reiwa) to the dictionary used in JapaneseTokenizer

This is an automated email from the ASF dual-hosted git repository.

tomoko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 7619c07  LUCENE-8752: Add Japanese new imperial era '令和' (Reiwa) to the dictionary used in JapaneseTokenizer
7619c07 is described below

commit 7619c07d3a80bb781f688c2cbbff33024142670a
Author: Tomoko Uchida <to...@apache.org>
AuthorDate: Sat Apr 13 21:07:27 2019 +0900

    LUCENE-8752: Add Japanese new imperial era '令和' (Reiwa) to the dictionary used in JapaneseTokenizer
---
 lucene/CHANGES.txt                                 |   6 +++++-
 lucene/analysis/kuromoji/build.xml                 |   7 ++++++-
 .../ja/dict/TokenInfoDictionary$buffer.dat         | Bin 4337216 -> 4337224 bytes
 .../analysis/ja/dict/TokenInfoDictionary$fst.dat   | Bin 1698563 -> 1698570 bytes
 .../ja/dict/TokenInfoDictionary$targetMap.dat      | Bin 392165 -> 392166 bytes
 .../lucene/analysis/ja/TestJapaneseTokenizer.java  |  12 ++++++++++++
 .../src/tools/patches/Noun.proper.csv.patch        |   7 +++++++
 7 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 63adfcb..1ecde59 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -110,7 +110,7 @@ Improvements
 
 * LUCENE-8750: Implements setMissingValue() on sort fields produced from 
   DoubleValuesSource and LongValuesSource (Mike Sokolov via Alan Woodward)
-
+  
 * LUCENE-8701: ToParentBlockJoinQuery now creates a child scorer that disallows skipping over
   non-competitive documents if the score of a parent depends on the score of multiple
   children (avg, max, min). Additionally the score mode `none` that assigns a constant score to
@@ -119,6 +119,10 @@ Improvements
 * LUCENE-8751: Weight#matches now use the ScorerSupplier to build scorers with a lead cost of 1
   (single document). (Jim Ferenczi)
 
+* LUCENE-8752: Japanese new era name '令和' (Reiwa) is added to the dictionary used in
+  JapaneseTokenizer so that the analyzer handles the era name correctly.
+  Reiwa is set to replace the Heisei Era on May 1, 2019. (Tomoko Uchida)
+
 Changes in Runtime Behavior
 
 * LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
diff --git a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml
index 0bce4b4..decfa7a 100644
--- a/lucene/analysis/kuromoji/build.xml
+++ b/lucene/analysis/kuromoji/build.xml
@@ -64,6 +64,11 @@
      <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
   </target>
 
+  <target name="patch-dict" depends="download-dict">
+    <patch patchfile="src/tools/patches/Noun.proper.csv.patch"
+           originalfile="${dict.src.dir}/Noun.proper.csv"/>
+  </target>
+
   <path id="tools.dependencies">
     <fileset dir="../icu/lib"/>
   </path>
@@ -81,7 +86,7 @@
     <pathelement location="${build.dir}/classes/tools-test"/>
   </path>
 
-  <target name="build-dict" depends="compile-tools, download-dict">
+  <target name="build-dict" depends="compile-tools, patch-dict">
     <sequential>
       <delete verbose="true">
         <fileset dir="${resources.dir}/org/apache/lucene/analysis/ja/dict" includes="**/*"/>
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat
index dcf430a..09f1e46 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
index 6cfad72..c06fd4a 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat
index 0e27345..13d09bc 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat differ
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
index cc69840..1a478db 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -836,4 +836,16 @@ public class
     tokenizer.reset();
     while (tokenizer.incrementToken());
   }
+
+  public void testPatchedSystemDict() throws Exception {
+    assertAnalyzesTo(analyzer, "令和元年",
+        new String[]{"令和", "元年"},
+        new int[]{0, 2},
+        new int[]{2, 4});
+
+    assertAnalyzesTo(analyzerNormal, "令和元年",
+        new String[]{"令和", "元年"},
+        new int[]{0, 2},
+        new int[]{2, 4});
+  }
 }
diff --git a/lucene/analysis/kuromoji/src/tools/patches/Noun.proper.csv.patch b/lucene/analysis/kuromoji/src/tools/patches/Noun.proper.csv.patch
new file mode 100644
index 0000000..ee845ab
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/tools/patches/Noun.proper.csv.patch
@@ -0,0 +1,7 @@
+--- Noun.proper.csv	2007-07-31 23:50:07.000000000 +0900
++++ Noun.proper.csv.20190403	2019-04-03 15:52:43.793191818 +0900
+@@ -27325,3 +27325,4 @@
+ �����ɡ,1288,1288,8538,̾��,��ͭ̾��,����,*,*,*,�����ɡ,���Υ��ϥ�,���Υ��ϥ�
+ �ɥ���,1288,1288,3765,̾��,��ͭ̾��,����,*,*,*,�ɥ���,�ɥ�����,�ɡ�����
+ �;뻳,1288,1288,8538,̾��,��ͭ̾��,����,*,*,*,�;뻳,�ȥ�����,�ȥ�����
++����,1288,1288,5904,̾��,��ͭ̾��,����,*,*,*,����,�쥤��,�쥤��