You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/07/14 01:26:48 UTC

[lucene] branch main updated: LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new c3482c9  LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)
c3482c9 is described below

commit c3482c99ffd9b30acb423e63760ebc7baab9dd26
Author: Michael Gibney <mi...@michaelgibney.net>
AuthorDate: Tue Jul 13 21:26:40 2021 -0400

    LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)
    
    Normalization-inert characters need not be required as boundaries
    for incremental processing. It is sufficient to check `hasBoundaryAfter`
    and `hasBoundaryBefore`, substantially improving worst-case performance.
---
 lucene/CHANGES.txt                                 |  4 +
 .../analysis/icu/ICUNormalizer2CharFilter.java     | 48 ++++++-----
 .../analysis/icu/TestICUNormalizer2CharFilter.java | 98 ++++++++++++++++++++++
 3 files changed, 130 insertions(+), 20 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 59f5a72..41fe5f0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -378,6 +378,10 @@ Improvements
 * LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring
   them to provide an ExecutorService. (Greg Miller)
 
+* LUCENE-9177: ICUNormalizer2CharFilter no longer requires normalization-inert
+  characters as boundaries for incremental processing, vastly improving worst-case
+  performance. (Michael Gibney)
+
 Optimizations
 ---------------------
 * LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
index 2992a99..dfb4989 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
@@ -93,27 +93,14 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   private final CharacterUtils.CharacterBuffer tmpBuffer;
 
   private void readInputToBuffer() throws IOException {
-    while (true) {
-      // CharacterUtils.fill is supplementary char aware
-      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
-
-      assert tmpBuffer.getOffset() == 0;
-      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
-
-      if (hasRemainingChars == false) {
-        inputFinished = true;
-        break;
-      }
-
-      final int lastCodePoint =
-          Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
-      if (normalizer.isInert(lastCodePoint)) {
-        // we require an inert char so that we can normalize content before and
-        // after this character independently
-        break;
-      }
+    // CharacterUtils.fill is supplementary char aware
+    if (!CharacterUtils.fill(tmpBuffer, input)) {
+      inputFinished = true;
     }
 
+    assert tmpBuffer.getOffset() == 0;
+    inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
+
     // if checkedInputBoundary was at the end of a buffer, we need to check that char again
     checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
   }
@@ -125,7 +112,6 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
     }
     if (!afterQuickCheckYes) {
       int resLen = readFromInputWhileSpanQuickCheckYes();
-      afterQuickCheckYes = true;
       if (resLen > 0) return resLen;
     }
     int resLen = readFromIoNormalizeUptoBoundary();
@@ -136,8 +122,30 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   }
 
   private int readFromInputWhileSpanQuickCheckYes() {
+    afterQuickCheckYes = true;
     int end = normalizer.spanQuickCheckYes(inputBuffer);
     if (end > 0) {
+      int cp;
+      if (end == inputBuffer.length()
+          && !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) {
+        /*
+        our quickCheckYes result is valid thru the end of current buffer, but we need to back off
+        because we're not at a normalization boundary. At a minimum this is relevant wrt imposing
+        canonical ordering of combining characters across the buffer boundary.
+         */
+        afterQuickCheckYes = false;
+        end -= Character.charCount(cp);
+        // NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for
+        // `Normalizer2.hasBoundaryAfter()`:
+        // "Note that this operation may be significantly slower than hasBoundaryBefore()"
+        while (end > 0 && !normalizer.hasBoundaryBefore(cp)) {
+          cp = inputBuffer.codePointBefore(end);
+          end -= Character.charCount(cp);
+        }
+        if (end == 0) {
+          return 0;
+        }
+      }
       resultBuffer.append(inputBuffer.subSequence(0, end));
       inputBuffer.delete(0, end);
       checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
index bb1240c..79b1dfe 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
@@ -94,6 +94,104 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         input.length());
   }
 
+  /**
+   * A wrapping reader that provides transparency wrt how much of the underlying stream has been
+   * consumed. This may be used to validate incremental behavior in upstream reads.
+   */
+  private static class TransparentReader extends Reader {
+
+    private final int expectSize;
+    private final Reader backing;
+    private int cursorPosition = -1;
+    private boolean finished = false;
+
+    private TransparentReader(Reader backing, int expectSize) {
+      this.expectSize = expectSize;
+      this.backing = backing;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      final int ret = backing.read(cbuf, off, len);
+      if (cursorPosition == -1) {
+        if (ret != -1) {
+          cursorPosition = ret;
+        } else {
+          cursorPosition = 0;
+        }
+      } else if (ret == -1) {
+        assert finished;
+      } else {
+        cursorPosition += ret;
+      }
+      if (!finished && cursorPosition >= expectSize) {
+        finished = true;
+      }
+      return ret;
+    }
+
+    @Override
+    public void close() throws IOException {
+      assert finished && cursorPosition != -1;
+      backing.close();
+    }
+  }
+
+  public void testIncrementalNoInertChars() throws Exception {
+    final String input = "℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa";
+    final int inputLength = input.length();
+    final String expectOutput = "°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa";
+    Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
+
+    // Sanity check/demonstrate that the input doesn't have any normalization-inert codepoints
+    for (int i = inputLength - 1; i >= 0; i--) {
+      assertFalse(norm.isInert(Character.codePointAt(input, i)));
+    }
+
+    final int outerBufferSize = random().nextInt(8) + 1; // buffer size between 1 <= n <= 8
+    char[] tempBuff = new char[outerBufferSize];
+
+    /*
+    Sanity check that with the default inner buffer size, the upstream input is read in its
+    entirety after the first external read.
+     */
+    TransparentReader tr = new TransparentReader(new StringReader(input), inputLength);
+    CharFilter reader = new ICUNormalizer2CharFilter(tr, norm);
+    assertTrue(reader.read(tempBuff) > 0);
+    assertEquals(inputLength, tr.cursorPosition);
+    assertTrue(tr.finished);
+
+    /*
+    By way of contrast with the "control" above, we now check the actual desired behavior, ensuring that
+    even for input with no inert chars, we're able to read input incrementally. To support incremental
+    upstream reads with reasonable-sized input, we artificially reduce the size of the internal buffer
+     */
+    final int innerBufferSize = random().nextInt(7) + 2; // buffer size between 2 <= n <= 8
+    tr = new TransparentReader(new StringReader(input), inputLength);
+    reader = new ICUNormalizer2CharFilter(tr, norm, innerBufferSize);
+    StringBuilder result = new StringBuilder();
+    int incrementalReads = 0;
+    int finishedReads = 0;
+    while (true) {
+      int length = reader.read(tempBuff);
+      if (length == -1) {
+        assertTrue(tr.finished);
+        break;
+      }
+      result.append(tempBuff, 0, length);
+      if (length > 0) {
+        if (!tr.finished) {
+          incrementalReads++;
+          assertTrue(tr.cursorPosition < inputLength);
+        } else {
+          finishedReads++;
+        }
+      }
+    }
+    assertTrue(incrementalReads > finishedReads);
+    assertEquals(expectOutput, result.toString());
+  }
+
   public void testMassiveLigature() throws IOException {
     String input = "\uFDFA";