You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2020/02/20 18:01:07 UTC

[lucene-solr] branch master updated: LUCENE-9155: Port Kuromoji dictionary compilation (regenerate).

This is an automated email from the ASF dual-hosted git repository.

dweiss pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 62662e4  LUCENE-9155: Port Kuromoji dictionary compilation (regenerate).
62662e4 is described below

commit 62662e477afb8a01ff3061115add660d465ae62f
Author: Dawid Weiss <dw...@apache.org>
AuthorDate: Thu Feb 20 19:00:56 2020 +0100

    LUCENE-9155: Port Kuromoji dictionary compilation (regenerate).
---
 build.gradle                      |   1 +
 gradle/generation/kuromoji.gradle | 107 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

diff --git a/build.gradle b/build.gradle
index cff33c7..1bbf1a2 100644
--- a/build.gradle
+++ b/build.gradle
@@ -93,6 +93,7 @@ apply from: file('gradle/generation/jflex.gradle')
 apply from: file('gradle/generation/javacc.gradle')
 apply from: file('gradle/generation/util.gradle')
 apply from: file('gradle/generation/snowball.gradle')
+apply from: file('gradle/generation/kuromoji.gradle')
 
 // Additional development aids.
 apply from: file('gradle/maven/maven-local.gradle')
diff --git a/gradle/generation/kuromoji.gradle b/gradle/generation/kuromoji.gradle
new file mode 100644
index 0000000..ad1aba7
--- /dev/null
+++ b/gradle/generation/kuromoji.gradle
@@ -0,0 +1,107 @@
+
+// This downloads and compiles Kuromoji dictionaries.
+
+configure(project(":lucene:analysis:kuromoji")) {
+  apply plugin: 'java-library'
+  apply plugin: "de.undercouch.download"
+
+  ext {
+    targetDir = file("src/resources") // passed to DictionaryBuilder as its third argument (presumably the output dir -- confirm)
+  }
+
+  task compileMecabIpadic(type: Download) {
+    description "Recompile mecab dictionaries."
+    group "generation"
+    // Steps: download the archive, unpack it, patch it with git, then run DictionaryBuilder.
+    dependsOn sourceSets.main.runtimeClasspath
+
+    def dictionaryName = "mecab-ipadic-2.7.0-20070801"
+    def dictionarySource = "https://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/${dictionaryName}.tar.gz"
+    def dictionaryFile = file("${buildDir}/generate/${dictionaryName}.tar.gz")
+    def unpackedDir = file("${buildDir}/generate/${dictionaryName}")
+
+    src dictionarySource
+    dest dictionaryFile
+    onlyIfModified true // skip the download when the remote archive has not changed
+
+    doLast {
+      // Unpack the downloaded archive into a clean directory, stripping its top-level folder.
+      delete unpackedDir
+      ant.untar(src: dictionaryFile, dest: unpackedDir, compression: "gzip") {
+        ant.cutdirsmapper(dirs: "1")
+      }
+
+      // Apply the local patch inside the unpacked tree; needs a "git" executable on the PATH.
+      project.exec {
+        workingDir = unpackedDir
+        executable "git"
+        args += [
+            "apply",
+            file("src/tools/patches/Noun.proper.csv.patch").absolutePath
+        ]
+      }
+
+      // Compile the dictionary by running DictionaryBuilder from the main runtime classpath.
+      project.javaexec {
+        main = "org.apache.lucene.analysis.ja.util.DictionaryBuilder"
+        classpath = sourceSets.main.runtimeClasspath
+
+        jvmArgs '-Xmx1G'
+        args += [
+            "ipadic",
+            unpackedDir,
+            targetDir,
+            "euc-jp",
+            false
+        ]
+      }
+
+      // Log outside the javaexec closure so the message appears only after the builder has finished.
+      logger.lifecycle("Automaton regenerated from dictionary: ${dictionaryName}")
+    }
+  }
+
+  /*
+  TODO: this task is disabled because DictionaryBuilder no longer seems to support the "naist" dictionary type; confirm, then restore or remove it.
+
+  task compileNaist(type: Download) {
+    description "Recompile naist dictionaries."
+    group "generation"
+
+    dependsOn sourceSets.main.runtimeClasspath
+
+    def dictionaryName = "mecab-naist-jdic-0.6.3b-20111013"
+    def dictionarySource = "https://rwthaachen.dl.osdn.jp/naist-jdic/53500/${dictionaryName}.tar.gz"
+    def dictionaryFile = file("${buildDir}/generate/${dictionaryName}.tar.gz")
+    def unpackedDir = file("${buildDir}/generate/${dictionaryName}")
+
+    src dictionarySource
+    dest dictionaryFile
+    onlyIfModified true
+
+    doLast {
+      // Unpack the downloaded archive.
+      delete unpackedDir
+      ant.untar(src: dictionaryFile, dest: unpackedDir, compression: "gzip") {
+        ant.cutdirsmapper(dirs: "1")
+      }
+
+      // Compile the dictionary
+      project.javaexec {
+        main = "org.apache.lucene.analysis.ja.util.DictionaryBuilder"
+        classpath = sourceSets.main.runtimeClasspath
+
+        jvmArgs '-Xmx1G'
+
+        args += [
+            "naist",
+            unpackedDir,
+            targetDir,
+            "euc-jp",
+            false
+        ]
+      }
+    }
+  }
+   */
+}