You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2022/04/07 12:29:48 UTC

[lucene] branch main updated: LUCENE-10493: Unify TokenInfoFST in kuromoji and nori (#795)

This is an automated email from the ASF dual-hosted git repository.

tomoko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 9aa8ec9d06a LUCENE-10493: Unify TokenInfoFST in kuromoji and nori (#795)
9aa8ec9d06a is described below

commit 9aa8ec9d06a2b271559ec0a93e1405239bbb6af2
Author: Tomoko Uchida <to...@gmail.com>
AuthorDate: Thu Apr 7 21:29:44 2022 +0900

    LUCENE-10493: Unify TokenInfoFST in kuromoji and nori (#795)
---
 .../lucene/analysis/morph}/TokenInfoFST.java       | 57 ++++++++++++----------
 .../lucene/analysis/ja/dict/TokenInfoFST.java      | 56 +--------------------
 .../lucene/analysis/ko/dict/TokenInfoFST.java      | 55 +--------------------
 3 files changed, 36 insertions(+), 132 deletions(-)

diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/TokenInfoFST.java
similarity index 53%
copy from lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
copy to lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/TokenInfoFST.java
index 6c05d01e224..460118f712a 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/TokenInfoFST.java
@@ -14,50 +14,62 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.analysis.ko.dict;
+package org.apache.lucene.analysis.morph;
 
 import java.io.IOException;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FST.Arc;
 
-/** Thin wrapper around an FST with root-arc caching for Hangul syllables (11,172 arcs). */
-public final class TokenInfoFST {
-  private final FST<Long> fst;
-
+/**
+ * Thin wrapper around an FST with root-arc caching.
+ *
+ * <p>Root arcs between <code>cacheFloor</code> and <code>cacheCeiling</code> are cached.
+ */
+public abstract class TokenInfoFST {
+  protected final FST<Long> fst;
   private final int cacheCeiling;
-  private final FST.Arc<Long>[] rootCache;
+  private final int cacheFloor;
+  private final Arc<Long>[] rootCache;
 
   public final Long NO_OUTPUT;
 
-  public TokenInfoFST(FST<Long> fst) throws IOException {
+  protected TokenInfoFST(FST<Long> fst, int cacheCeiling, int cacheFloor) throws IOException {
+    if (cacheCeiling < cacheFloor) {
+      throw new IllegalArgumentException(
+          "cacheCeiling must be larger than cacheFloor; cacheCeiling="
+              + cacheCeiling
+              + ", cacheFloor="
+              + cacheFloor);
+    }
     this.fst = fst;
-    this.cacheCeiling = 0xD7A3;
+    this.cacheCeiling = cacheCeiling;
+    this.cacheFloor = cacheFloor;
     NO_OUTPUT = fst.outputs.getNoOutput();
     rootCache = cacheRootArcs();
   }
 
   @SuppressWarnings({"rawtypes", "unchecked"})
-  private FST.Arc<Long>[] cacheRootArcs() throws IOException {
-    FST.Arc<Long>[] rootCache = new FST.Arc[1 + (cacheCeiling - 0xAC00)];
-    FST.Arc<Long> firstArc = new FST.Arc<>();
+  private Arc<Long>[] cacheRootArcs() throws IOException {
+    Arc<Long>[] rootCache = new Arc[1 + (cacheCeiling - cacheFloor)];
+    Arc<Long> firstArc = new Arc<>();
     fst.getFirstArc(firstArc);
-    FST.Arc<Long> arc = new FST.Arc<>();
+    Arc<Long> arc = new Arc<>();
     final FST.BytesReader fstReader = fst.getBytesReader();
-    // TODO: jump to AC00, readNextRealArc to ceiling? (just be careful we don't add bugs)
+    // TODO: jump to cacheFloor, readNextRealArc to ceiling? (just be careful we don't add bugs)
     for (int i = 0; i < rootCache.length; i++) {
-      if (fst.findTargetArc(0xAC00 + i, firstArc, arc, fstReader) != null) {
-        rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
+      if (fst.findTargetArc(cacheFloor + i, firstArc, arc, fstReader) != null) {
+        rootCache[i] = new Arc<Long>().copyFrom(arc);
       }
     }
     return rootCache;
   }
 
-  public FST.Arc<Long> findTargetArc(
-      int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader)
+  public Arc<Long> findTargetArc(
+      int ch, Arc<Long> follow, Arc<Long> arc, boolean useCache, FST.BytesReader fstReader)
       throws IOException {
-    if (useCache && ch >= 0xAC00 && ch <= cacheCeiling) {
+    if (useCache && ch >= cacheFloor && ch <= cacheCeiling) {
       assert ch != FST.END_LABEL;
-      final Arc<Long> result = rootCache[ch - 0xAC00];
+      final Arc<Long> result = rootCache[ch - cacheFloor];
       if (result == null) {
         return null;
       } else {
@@ -69,16 +81,11 @@ public final class TokenInfoFST {
     }
   }
 
-  public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
+  public Arc<Long> getFirstArc(Arc<Long> arc) {
     return fst.getFirstArc(arc);
   }
 
   public FST.BytesReader getBytesReader() {
     return fst.getBytesReader();
   }
-
-  /** @lucene.internal for testing only */
-  FST<Long> getInternalFST() {
-    return fst;
-  }
 }
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java
index 2c6a5c6ecad..23861585f4e 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.ja.dict;
 
 import java.io.IOException;
 import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.FST.Arc;
 
 /**
  * Thin wrapper around an FST with root-arc caching for Japanese.
@@ -26,64 +25,13 @@ import org.apache.lucene.util.fst.FST.Arc;
  * <p>Depending upon fasterButMoreRam, either just kana (191 arcs), or kana and han (28,607 arcs)
  * are cached. The latter offers additional performance at the cost of more RAM.
  */
-public final class TokenInfoFST {
-  private final FST<Long> fst;
-
+public final class TokenInfoFST extends org.apache.lucene.analysis.morph.TokenInfoFST {
   // depending upon fasterButMoreRam, we cache root arcs for either
   // kana (0x3040-0x30FF) or kana + han (0x3040-0x9FFF)
   // false: 191 arcs
   // true:  28,607 arcs (costs ~1.5MB)
-  private final int cacheCeiling;
-  private final FST.Arc<Long>[] rootCache;
-
-  public final Long NO_OUTPUT;
-
   public TokenInfoFST(FST<Long> fst, boolean fasterButMoreRam) throws IOException {
-    this.fst = fst;
-    this.cacheCeiling = fasterButMoreRam ? 0x9FFF : 0x30FF;
-    NO_OUTPUT = fst.outputs.getNoOutput();
-    rootCache = cacheRootArcs();
-  }
-
-  @SuppressWarnings({"rawtypes", "unchecked"})
-  private FST.Arc<Long>[] cacheRootArcs() throws IOException {
-    FST.Arc<Long>[] rootCache = new FST.Arc[1 + (cacheCeiling - 0x3040)];
-    FST.Arc<Long> firstArc = new FST.Arc<>();
-    fst.getFirstArc(firstArc);
-    FST.Arc<Long> arc = new FST.Arc<>();
-    final FST.BytesReader fstReader = fst.getBytesReader();
-    // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
-    for (int i = 0; i < rootCache.length; i++) {
-      if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) {
-        rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
-      }
-    }
-    return rootCache;
-  }
-
-  public FST.Arc<Long> findTargetArc(
-      int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader)
-      throws IOException {
-    if (useCache && ch >= 0x3040 && ch <= cacheCeiling) {
-      assert ch != FST.END_LABEL;
-      final Arc<Long> result = rootCache[ch - 0x3040];
-      if (result == null) {
-        return null;
-      } else {
-        arc.copyFrom(result);
-        return arc;
-      }
-    } else {
-      return fst.findTargetArc(ch, follow, arc, fstReader);
-    }
-  }
-
-  public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
-    return fst.getFirstArc(arc);
-  }
-
-  public FST.BytesReader getBytesReader() {
-    return fst.getBytesReader();
+    super(fst, fasterButMoreRam ? 0x9FFF : 0x30FF, 0x3040);
   }
 
   /** @lucene.internal for testing only */
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
index 6c05d01e224..806f1209772 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
@@ -18,63 +18,12 @@ package org.apache.lucene.analysis.ko.dict;
 
 import java.io.IOException;
 import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.FST.Arc;
 
 /** Thin wrapper around an FST with root-arc caching for Hangul syllables (11,172 arcs). */
-public final class TokenInfoFST {
-  private final FST<Long> fst;
-
-  private final int cacheCeiling;
-  private final FST.Arc<Long>[] rootCache;
-
-  public final Long NO_OUTPUT;
+public final class TokenInfoFST extends org.apache.lucene.analysis.morph.TokenInfoFST {
 
   public TokenInfoFST(FST<Long> fst) throws IOException {
-    this.fst = fst;
-    this.cacheCeiling = 0xD7A3;
-    NO_OUTPUT = fst.outputs.getNoOutput();
-    rootCache = cacheRootArcs();
-  }
-
-  @SuppressWarnings({"rawtypes", "unchecked"})
-  private FST.Arc<Long>[] cacheRootArcs() throws IOException {
-    FST.Arc<Long>[] rootCache = new FST.Arc[1 + (cacheCeiling - 0xAC00)];
-    FST.Arc<Long> firstArc = new FST.Arc<>();
-    fst.getFirstArc(firstArc);
-    FST.Arc<Long> arc = new FST.Arc<>();
-    final FST.BytesReader fstReader = fst.getBytesReader();
-    // TODO: jump to AC00, readNextRealArc to ceiling? (just be careful we don't add bugs)
-    for (int i = 0; i < rootCache.length; i++) {
-      if (fst.findTargetArc(0xAC00 + i, firstArc, arc, fstReader) != null) {
-        rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
-      }
-    }
-    return rootCache;
-  }
-
-  public FST.Arc<Long> findTargetArc(
-      int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader)
-      throws IOException {
-    if (useCache && ch >= 0xAC00 && ch <= cacheCeiling) {
-      assert ch != FST.END_LABEL;
-      final Arc<Long> result = rootCache[ch - 0xAC00];
-      if (result == null) {
-        return null;
-      } else {
-        arc.copyFrom(result);
-        return arc;
-      }
-    } else {
-      return fst.findTargetArc(ch, follow, arc, fstReader);
-    }
-  }
-
-  public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
-    return fst.getFirstArc(arc);
-  }
-
-  public FST.BytesReader getBytesReader() {
-    return fst.getBytesReader();
+    super(fst, 0xD7A3, 0xAC00);
   }
 
   /** @lucene.internal for testing only */