You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2020/05/04 17:32:01 UTC

[lucene-solr] branch branch_8x updated: LUCENE-9191: ensure LineFileDocs random seeking effort does not seek into the middle of a multi-byte UTF-8 encoded Unicode character

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new eec79e0  LUCENE-9191: ensure LineFileDocs random seeking effort does not seek into the middle of a multi-byte UTF-8 encoded Unicode character
eec79e0 is described below

commit eec79e0b2be7f1198d20c5d24e5a99d456d7b05c
Author: Mike McCandless <mi...@apache.org>
AuthorDate: Mon May 4 13:29:00 2020 -0400

    LUCENE-9191: ensure LineFileDocs random seeking effort does not seek into the middle of a multi-byte UTF-8 encoded Unicode character
---
 .../src/java/org/apache/lucene/util/LineFileDocs.java   | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java b/lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java
index fa409d1..6d0c4bf 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java
@@ -88,7 +88,6 @@ public class LineFileDocs implements Closeable {
 
     // true if the InputStream is not already randomly seek'd after the if/else block below:
     boolean needSkip;
-    boolean skipFirstLineFragment = false;
     
     long size = 0L, seekTo = 0L;
     if (is == null) {
@@ -109,8 +108,15 @@ public class LineFileDocs implements Closeable {
         channel.position(seekTo);
         is = Channels.newInputStream(channel);
 
-        // we (likely) seeked to the middle of a line:
-        skipFirstLineFragment = true;
+        // read until newline char, otherwise we may hit "java.nio.charset.MalformedInputException: Input length = 1"
+        // exception in readLine() below, because we seeked part way through a multi-byte (in UTF-8) encoded
+        // Unicode character:
+        if (seekTo > 0L) {
+          int b;
+          do {
+            b = is.read();
+          } while (b >= 0 && b != 13 && b != 10);
+        }
 
         needSkip = false;
       }
@@ -169,11 +175,6 @@ public class LineFileDocs implements Closeable {
         .onMalformedInput(CodingErrorAction.REPORT)
         .onUnmappableCharacter(CodingErrorAction.REPORT);
     reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
-    
-    if (skipFirstLineFragment) {
-      // read until end of line:
-      reader.readLine();
-    }
   }
 
   public synchronized void reset() throws IOException {