You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2020/05/04 17:29:23 UTC

[lucene-solr] branch master updated: LUCENE-9191: ensure LineFileDocs random seeking effort does not seek into the middle of a multi-byte UTF-8 encoded Unicode character

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 1783c4a  LUCENE-9191: ensure LineFileDocs random seeking effort does not seek into the middle of a multi-byte UTF-8 encoded Unicode character
1783c4a is described below

commit 1783c4ad47990d1a88ac3bb44b2da2c2d2abcc79
Author: Mike McCandless <mi...@apache.org>
AuthorDate: Mon May 4 13:29:00 2020 -0400

    LUCENE-9191: ensure LineFileDocs random seeking effort does not seek into the middle of a multi-byte UTF-8 encoded Unicode character
---
 .../src/java/org/apache/lucene/util/LineFileDocs.java   | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java b/lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java
index fa409d1..6d0c4bf 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java
@@ -88,7 +88,6 @@ public class LineFileDocs implements Closeable {
 
     // true if the InputStream is not already randomly seek'd after the if/else block below:
     boolean needSkip;
-    boolean skipFirstLineFragment = false;
     
     long size = 0L, seekTo = 0L;
     if (is == null) {
@@ -109,8 +108,15 @@ public class LineFileDocs implements Closeable {
         channel.position(seekTo);
         is = Channels.newInputStream(channel);
 
-        // we (likely) seeked to the middle of a line:
-        skipFirstLineFragment = true;
+        // read until newline char, otherwise we may hit "java.nio.charset.MalformedInputException: Input length = 1"
+        // exception in readLine() below, because we seeked part way through a multi-byte (in UTF-8) encoded
+        // Unicode character:
+        if (seekTo > 0L) {
+          int b;
+          do {
+            b = is.read();
+          } while (b >= 0 && b != 13 && b != 10);
+        }
 
         needSkip = false;
       }
@@ -169,11 +175,6 @@ public class LineFileDocs implements Closeable {
         .onMalformedInput(CodingErrorAction.REPORT)
         .onUnmappableCharacter(CodingErrorAction.REPORT);
     reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
-    
-    if (skipFirstLineFragment) {
-      // read until end of line:
-      reader.readLine();
-    }
   }
 
   public synchronized void reset() throws IOException {