You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/07/14 01:26:48 UTC
[lucene] branch main updated: LUCENE-9177: ICUNormalizer2CharFilter
streaming no longer depends on presence of normalization-inert characters
(#199)
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new c3482c9 LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)
c3482c9 is described below
commit c3482c99ffd9b30acb423e63760ebc7baab9dd26
Author: Michael Gibney <mi...@michaelgibney.net>
AuthorDate: Tue Jul 13 21:26:40 2021 -0400
LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)
Normalization-inert characters need not be required as boundaries
for incremental processing. It is sufficient to check `hasBoundaryAfter`
and `hasBoundaryBefore`, substantially improving worst-case performance.
---
lucene/CHANGES.txt | 4 +
.../analysis/icu/ICUNormalizer2CharFilter.java | 48 ++++++-----
.../analysis/icu/TestICUNormalizer2CharFilter.java | 98 ++++++++++++++++++++++
3 files changed, 130 insertions(+), 20 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 59f5a72..41fe5f0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -378,6 +378,10 @@ Improvements
* LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring
them to provide an ExecutorService. (Greg Miller)
+* LUCENE-9177: ICUNormalizer2CharFilter no longer requires normalization-inert
+ characters as boundaries for incremental processing, vastly improving worst-case
+ performance. (Michael Gibney)
+
Optimizations
---------------------
* LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
index 2992a99..dfb4989 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
@@ -93,27 +93,14 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
private final CharacterUtils.CharacterBuffer tmpBuffer;
private void readInputToBuffer() throws IOException {
- while (true) {
- // CharacterUtils.fill is supplementary char aware
- final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
-
- assert tmpBuffer.getOffset() == 0;
- inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
-
- if (hasRemainingChars == false) {
- inputFinished = true;
- break;
- }
-
- final int lastCodePoint =
- Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
- if (normalizer.isInert(lastCodePoint)) {
- // we require an inert char so that we can normalize content before and
- // after this character independently
- break;
- }
+ // CharacterUtils.fill is supplementary char aware
+ if (!CharacterUtils.fill(tmpBuffer, input)) {
+ inputFinished = true;
}
+ assert tmpBuffer.getOffset() == 0;
+ inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
+
// if checkedInputBoundary was at the end of a buffer, we need to check that char again
checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
}
@@ -125,7 +112,6 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
}
if (!afterQuickCheckYes) {
int resLen = readFromInputWhileSpanQuickCheckYes();
- afterQuickCheckYes = true;
if (resLen > 0) return resLen;
}
int resLen = readFromIoNormalizeUptoBoundary();
@@ -136,8 +122,30 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
}
private int readFromInputWhileSpanQuickCheckYes() {
+ afterQuickCheckYes = true;
int end = normalizer.spanQuickCheckYes(inputBuffer);
if (end > 0) {
+ int cp;
+ if (end == inputBuffer.length()
+ && !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) {
+ /*
+ our quickCheckYes result is valid thru the end of current buffer, but we need to back off
+ because we're not at a normalization boundary. At a minimum this is relevant wrt imposing
+ canonical ordering of combining characters across the buffer boundary.
+ */
+ afterQuickCheckYes = false;
+ end -= Character.charCount(cp);
+ // NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for
+ // `Normalizer2.hasBoundaryAfter()`:
+ // "Note that this operation may be significantly slower than hasBoundaryBefore()"
+ while (end > 0 && !normalizer.hasBoundaryBefore(cp)) {
+ cp = inputBuffer.codePointBefore(end);
+ end -= Character.charCount(cp);
+ }
+ if (end == 0) {
+ return 0;
+ }
+ }
resultBuffer.append(inputBuffer.subSequence(0, end));
inputBuffer.delete(0, end);
checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
index bb1240c..79b1dfe 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
@@ -94,6 +94,104 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
input.length());
}
+ /**
+ * A wrapping reader that provides transparency wrt how much of the underlying stream has been
+ * consumed. This may be used to validate incremental behavior in upstream reads.
+ */
+ private static class TransparentReader extends Reader {
+
+ private final int expectSize;
+ private final Reader backing;
+ private int cursorPosition = -1;
+ private boolean finished = false;
+
+ private TransparentReader(Reader backing, int expectSize) {
+ this.expectSize = expectSize;
+ this.backing = backing;
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ final int ret = backing.read(cbuf, off, len);
+ if (cursorPosition == -1) {
+ if (ret != -1) {
+ cursorPosition = ret;
+ } else {
+ cursorPosition = 0;
+ }
+ } else if (ret == -1) {
+ assert finished;
+ } else {
+ cursorPosition += ret;
+ }
+ if (!finished && cursorPosition >= expectSize) {
+ finished = true;
+ }
+ return ret;
+ }
+
+ @Override
+ public void close() throws IOException {
+ assert finished && cursorPosition != -1;
+ backing.close();
+ }
+ }
+
+ public void testIncrementalNoInertChars() throws Exception {
+ final String input = "℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa";
+ final int inputLength = input.length();
+ final String expectOutput = "°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa";
+ Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
+
+ // Sanity check/demonstrate that the input doesn't have any normalization-inert codepoints
+ for (int i = inputLength - 1; i >= 0; i--) {
+ assertFalse(norm.isInert(Character.codePointAt(input, i)));
+ }
+
+ final int outerBufferSize = random().nextInt(8) + 1; // buffer size between 1 <= n <= 8
+ char[] tempBuff = new char[outerBufferSize];
+
+ /*
+ Sanity check that with the default inner buffer size, the upstream input is read in its
+ entirety after the first external read.
+ */
+ TransparentReader tr = new TransparentReader(new StringReader(input), inputLength);
+ CharFilter reader = new ICUNormalizer2CharFilter(tr, norm);
+ assertTrue(reader.read(tempBuff) > 0);
+ assertEquals(inputLength, tr.cursorPosition);
+ assertTrue(tr.finished);
+
+ /*
+ By way of contrast with the "control" above, we now check the actual desired behavior, ensuring that
+ even for input with no inert chars, we're able to read input incrementally. To support incremental
+ upstream reads with reasonable-sized input, we artificially reduce the size of the internal buffer
+ */
+ final int innerBufferSize = random().nextInt(7) + 2; // buffer size between 2 <= n <= 8
+ tr = new TransparentReader(new StringReader(input), inputLength);
+ reader = new ICUNormalizer2CharFilter(tr, norm, innerBufferSize);
+ StringBuilder result = new StringBuilder();
+ int incrementalReads = 0;
+ int finishedReads = 0;
+ while (true) {
+ int length = reader.read(tempBuff);
+ if (length == -1) {
+ assertTrue(tr.finished);
+ break;
+ }
+ result.append(tempBuff, 0, length);
+ if (length > 0) {
+ if (!tr.finished) {
+ incrementalReads++;
+ assertTrue(tr.cursorPosition < inputLength);
+ } else {
+ finishedReads++;
+ }
+ }
+ }
+ assertTrue(incrementalReads > finishedReads);
+ assertEquals(expectOutput, result.toString());
+ }
+
public void testMassiveLigature() throws IOException {
String input = "\uFDFA";