You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/03/26 08:45:33 UTC

lucene-solr:master: LUCENE-7141: switch OfflineSorter's ByteSequencesReader to BytesRefIterator

Repository: lucene-solr
Updated Branches:
  refs/heads/master c93c88dfb -> 78d5cfefe


LUCENE-7141: switch OfflineSorter's ByteSequencesReader to BytesRefIterator


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/78d5cfef
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/78d5cfef
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/78d5cfef

Branch: refs/heads/master
Commit: 78d5cfefe2453345c498984bf0e405d254a9d5bc
Parents: c93c88d
Author: Mike McCandless <mi...@apache.org>
Authored: Sat Mar 26 03:47:06 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Sat Mar 26 03:47:06 2016 -0400

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  5 +++
 .../lucene/analysis/hunspell/Dictionary.java    | 13 ++++---
 .../org/apache/lucene/util/OfflineSorter.java   | 40 ++++++++++----------
 .../org/apache/lucene/util/bkd/BKDWriter.java   | 13 +++----
 .../search/suggest/SortedInputIterator.java     |  6 +--
 .../suggest/analyzing/AnalyzingSuggester.java   | 16 +++++---
 .../search/suggest/fst/ExternalRefSorter.java   | 18 +++------
 .../search/suggest/fst/FSTCompletionLookup.java | 18 +++++----
 .../search/suggest/fst/BytesRefSortersTest.java | 11 +++---
 9 files changed, 72 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bc272e4..a3368ff 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -16,6 +16,11 @@ New Features
 * LUCENE-7140: Add PlanetModel.bisection to spatial3d (Karl Wright via
   Mike McCandless)
 
+API Changes
+
+* LUCENE-7141: Switch OfflineSorter's ByteSequencesReader to
+  BytesRefIterator (Mike McCandless)
+
 Optimizations
 
 * LUCENE-7071: Reduce bytes copying in OfflineSorter, giving ~10%

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 562e5cb..7885daa 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -888,17 +888,20 @@ public class Dictionary {
     boolean success2 = false;
     
     try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
-      BytesRefBuilder scratchLine = new BytesRefBuilder();
     
       // TODO: the flags themselves can be double-chars (long) or also numeric
       // either way the trick is to encode them as char... but they must be parsed differently
     
       String currentEntry = null;
       IntsRefBuilder currentOrds = new IntsRefBuilder();
-    
-      String line;
-      while (reader.read(scratchLine)) {
-        line = scratchLine.get().utf8ToString();
+
+      while (true) {
+        BytesRef scratch = reader.next();
+        if (scratch == null) {
+          break;
+        }
+        
+        String line = scratch.utf8ToString();
         String entry;
         char wordForm[];
         int end;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java b/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java
index 7549cbc..386cd0c 100644
--- a/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java
@@ -346,7 +346,7 @@ public class OfflineSorter {
     PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(segmentsToMerge.size()) {
       @Override
       protected boolean lessThan(FileAndTop a, FileAndTop b) {
-        return comparator.compare(a.current.get(), b.current.get()) < 0;
+        return comparator.compare(a.current, b.current) < 0;
       }
     };
 
@@ -361,15 +361,14 @@ public class OfflineSorter {
       // Open streams and read the top for each file
       for (int i = 0; i < segmentsToMerge.size(); i++) {
         streams[i] = getReader(dir.openChecksumInput(segmentsToMerge.get(i), IOContext.READONCE), segmentsToMerge.get(i));
-        BytesRefBuilder bytes = new BytesRefBuilder();
-        boolean result = false;
+        BytesRef item = null;
         try {
-          result = streams[i].read(bytes);
+          item = streams[i].next();
         } catch (Throwable t) {
           verifyChecksum(t, streams[i]);
         }
-        assert result;
-        queue.insertWithOverflow(new FileAndTop(i, bytes));
+        assert item != null;
+        queue.insertWithOverflow(new FileAndTop(i, item));
       }
   
       // Unix utility sort() uses ordered array of files to pick the next line from, updating
@@ -378,15 +377,14 @@ public class OfflineSorter {
       // so it shouldn't make much of a difference (didn't check).
       FileAndTop top;
       while ((top = queue.top()) != null) {
-        writer.write(top.current.bytes(), 0, top.current.length());
-        boolean result = false;
+        writer.write(top.current);
         try {
-          result = streams[top.fd].read(top.current);
+          top.current = streams[top.fd].next();
         } catch (Throwable t) {
           verifyChecksum(t, streams[top.fd]);
         }
 
-        if (result) {
+        if (top.current != null) {
           queue.updateTop();
         } else {
           queue.pop();
@@ -416,18 +414,17 @@ public class OfflineSorter {
   /** Read in a single partition of data */
   int readPartition(ByteSequencesReader reader) throws IOException {
     long start = System.currentTimeMillis();
-    final BytesRefBuilder scratch = new BytesRefBuilder();
     while (true) {
-      boolean result = false;
+      BytesRef item = null;
       try {
-        result = reader.read(scratch);
+        item = reader.next();
       } catch (Throwable t) {
         verifyChecksum(t, reader);
       }
-      if (result == false) {
+      if (item == null) {
         break;
       }
-      buffer.append(scratch.get());
+      buffer.append(item);
       // Account for the created objects.
       // (buffer slots do not account to buffer size.) 
       if (bufferBytesUsed.get() > ramBufferSize.bytes) {
@@ -440,9 +437,9 @@ public class OfflineSorter {
 
   static class FileAndTop {
     final int fd;
-    final BytesRefBuilder current;
+    BytesRef current;
 
-    FileAndTop(int fd, BytesRefBuilder firstLine) {
+    FileAndTop(int fd, BytesRef firstLine) {
       this.fd = fd;
       this.current = firstLine;
     }
@@ -518,10 +515,11 @@ public class OfflineSorter {
    * Utility class to read length-prefixed byte[] entries from an input.
    * Complementary to {@link ByteSequencesWriter}.
    */
-  public static class ByteSequencesReader implements Closeable {
+  public static class ByteSequencesReader implements BytesRefIterator, Closeable {
     protected final String name;
     protected final ChecksumIndexInput in;
     protected final long end;
+    private final BytesRefBuilder ref = new BytesRefBuilder();
 
     /** Constructs a ByteSequencesReader from the provided IndexInput */
     public ByteSequencesReader(ChecksumIndexInput in, String name) {
@@ -538,16 +536,16 @@ public class OfflineSorter {
      * the header of the next sequence. Returns <code>true</code> otherwise.
      * @throws EOFException if the file ends before the full sequence is read.
      */
-    public boolean read(BytesRefBuilder ref) throws IOException {
+    public BytesRef next() throws IOException {
       if (in.getFilePointer() >= end) {
-        return false;
+        return null;
       }
 
       short length = in.readShort();
       ref.grow(length);
       ref.setLength(length);
       in.readBytes(ref.bytes(), 0, length);
-      return true;
+      return ref.get();
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 5002e50..e075ced 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -33,7 +33,6 @@ import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.TrackingDirectoryWrapper;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntroSorter;
@@ -736,6 +735,8 @@ public class BKDWriter implements Closeable {
       // TODO: this is sort of sneaky way to get the final OfflinePointWriter from OfflineSorter:
       IndexOutput[] lastWriter = new IndexOutput[1];
 
+      final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]);
+
       OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, OfflineSorter.BufferSize.megabytes(Math.max(1, (long) maxMBSortInHeap)), OfflineSorter.MAX_TEMPFILES) {
 
           /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
@@ -756,14 +757,12 @@ public class BKDWriter implements Closeable {
           protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
             return new ByteSequencesReader(in, name) {
               @Override
-              public boolean read(BytesRefBuilder ref) throws IOException {
+              public BytesRef next() throws IOException {
                 if (in.getFilePointer() >= end) {
-                  return false;
+                  return null;
                 }
-                ref.grow(bytesPerDoc);
-                in.readBytes(ref.bytes(), 0, bytesPerDoc);
-                ref.setLength(bytesPerDoc);
-                return true;
+                in.readBytes(scratch.bytes, 0, bytesPerDoc);
+                return scratch;
               }
             };
           }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java
index 38fbbab..977df37 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java
@@ -29,7 +29,6 @@ import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
@@ -53,7 +52,6 @@ public class SortedInputIterator implements InputIterator {
   private boolean done = false;
   
   private long weight;
-  private final BytesRefBuilder scratch = new BytesRefBuilder();
   private BytesRef payload = new BytesRef();
   private Set<BytesRef> contexts = null;
   
@@ -86,8 +84,8 @@ public class SortedInputIterator implements InputIterator {
     }
     try {
       ByteArrayDataInput input = new ByteArrayDataInput();
-      if (reader.read(scratch)) {
-        final BytesRef bytes = scratch.get();
+      BytesRef bytes = reader.next();
+      if (bytes != null) {
         weight = decode(bytes, input);
         if (hasPayloads) {
           payload = decodePayload(bytes, input);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
index d7f17a0..19982a5 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@@ -509,8 +509,12 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
       Set<BytesRef> seenSurfaceForms = new HashSet<>();
 
       int dedup = 0;
-      while (reader.read(scratch)) {
-        input.reset(scratch.bytes(), 0, scratch.length());
+      while (true) {
+        BytesRef bytes = reader.next();
+        if (bytes == null) {
+          break;
+        }
+        input.reset(bytes.bytes, bytes.offset, bytes.length);
         short analyzedLength = input.readShort();
         analyzed.grow(analyzedLength+2);
         input.readBytes(analyzed.bytes(), 0, analyzedLength);
@@ -518,13 +522,13 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
 
         long cost = input.readInt();
 
-        surface.bytes = scratch.bytes();
+        surface.bytes = bytes.bytes;
         if (hasPayloads) {
           surface.length = input.readShort();
           surface.offset = input.getPosition();
         } else {
           surface.offset = input.getPosition();
-          surface.length = scratch.length() - surface.offset;
+          surface.length = bytes.length - surface.offset;
         }
         
         if (previousAnalyzed == null) {
@@ -566,11 +570,11 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
           builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
         } else {
           int payloadOffset = input.getPosition() + surface.length;
-          int payloadLength = scratch.length() - payloadOffset;
+          int payloadLength = bytes.length - payloadOffset;
           BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
           System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
           br.bytes[surface.length] = PAYLOAD_SEP;
-          System.arraycopy(scratch.bytes(), payloadOffset, br.bytes, surface.length+1, payloadLength);
+          System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
           br.length = br.bytes.length;
           builder.add(scratchInts.get(), outputs.newPair(cost, br));
         }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
index 9852eec..fb876d2 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java
@@ -24,7 +24,6 @@ import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.OfflineSorter;
@@ -105,9 +104,10 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
   /**
    * Iterate over byte refs in a file.
    */
+  // TODO: this class is a bit silly ... sole purpose is to "remove" Closeable from what #iterator returns:
   class ByteSequenceIterator implements BytesRefIterator {
     private final OfflineSorter.ByteSequencesReader reader;
-    private BytesRefBuilder scratch = new BytesRefBuilder();
+    private BytesRef scratch;
     
     public ByteSequenceIterator(OfflineSorter.ByteSequencesReader reader) {
       this.reader = reader;
@@ -115,20 +115,14 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
     
     @Override
     public BytesRef next() throws IOException {
-      if (scratch == null) {
-        return null;
-      }
       boolean success = false;
       try {
-        if (reader.read(scratch) == false) {
-          IOUtils.close(reader);
-          scratch = null;
-        }
-        success = true;
+        scratch = reader.next();
         if (scratch == null) {
-          return null;
+          reader.close();
         }
-        return scratch.get();
+        success = true;
+        return scratch;
       } finally {
         if (!success) {
           IOUtils.closeWhileHandlingException(reader);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
index df6b1c5..7db97a8 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
@@ -39,7 +39,6 @@ import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Accountables;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.OfflineSorter;
@@ -215,10 +214,13 @@ public class FSTCompletionLookup extends Lookup implements Accountable {
       int previousBucket = 0;
       int previousScore = 0;
       ByteArrayDataInput input = new ByteArrayDataInput();
-      BytesRefBuilder tmp1 = new BytesRefBuilder();
       BytesRef tmp2 = new BytesRef();
-      while (reader.read(tmp1)) {
-        input.reset(tmp1.bytes());
+      while (true) {
+        BytesRef scratch = reader.next();
+        if (scratch == null) {
+          break;
+        }
+        input.reset(scratch.bytes, scratch.offset, scratch.length);
         int currentScore = input.readInt();
 
         int bucket;
@@ -231,9 +233,9 @@ public class FSTCompletionLookup extends Lookup implements Accountable {
         previousBucket = bucket;
 
         // Only append the input, discard the weight.
-        tmp2.bytes = tmp1.bytes();
-        tmp2.offset = input.getPosition();
-        tmp2.length = tmp1.length() - input.getPosition();
+        tmp2.bytes = scratch.bytes;
+        tmp2.offset = scratch.offset + input.getPosition();
+        tmp2.length = scratch.length - input.getPosition();
         builder.add(tmp2, bucket);
 
         line++;
@@ -293,7 +295,7 @@ public class FSTCompletionLookup extends Lookup implements Accountable {
   @Override
   public synchronized boolean store(DataOutput output) throws IOException {
     output.writeVLong(count);
-    if (this.normalCompletion == null || normalCompletion.getFST() == null) {
+    if (normalCompletion == null || normalCompletion.getFST() == null) {
       return false;
     }
     normalCompletion.getFST().save(output);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78d5cfef/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
index b64d283..b26a2ad 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java
@@ -57,12 +57,13 @@ public class BytesRefSortersTest extends LuceneTestCase {
       sorter.add(new BytesRef(new byte [1]));
     });
 
-    BytesRef spare1;
-    BytesRef spare2;
-    while ((spare1 = i1.next()) != null && (spare2 = i2.next()) != null) {
+    while (true) {
+      BytesRef spare1 = i1.next();
+      BytesRef spare2 = i2.next();
       assertEquals(spare1, spare2);
+      if (spare1 == null) {
+        break;
+      }
     }
-    assertNull(i1.next());
-    assertNull(i2.next());
   }  
 }