You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/29 15:31:38 UTC

arrow git commit: ARROW-1619: [Java] Set lastSet in JsonFileReader

Repository: arrow
Updated Branches:
  refs/heads/master 732367766 -> f8cf91d25


ARROW-1619: [Java] Set lastSet in JsonFileReader

When reading a vector in JsonFileReader, lastSet should be set in VariableWidthVectors after reading inner vectors or else subsequent operations could corrupt the offsets.  This also allows to simplify some of the related code.  Additionally, ListVector.lastSet should be explicitly initialized to 0, which is it's starting offset.

Author: Bryan Cutler <cu...@gmail.com>

Closes #1140 from BryanCutler/java-JsonReader-setLast-ARROW-1619 and squashes the following commits:

8f97a3db [Bryan Cutler] added test for VarBinaryVector that checks lastSet after reading
70df0cc4 [Bryan Cutler] set lastSet in JsonFileReader and initialize lastSet for ListVector


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/f8cf91d2
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/f8cf91d2
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/f8cf91d2

Branch: refs/heads/master
Commit: f8cf91d25eed5aebebb4aaec61fb9b2af9e97bb1
Parents: 7323677
Author: Bryan Cutler <cu...@gmail.com>
Authored: Fri Sep 29 10:31:33 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Sep 29 10:31:33 2017 -0500

----------------------------------------------------------------------
 .../apache/arrow/vector/complex/ListVector.java |  2 +-
 .../arrow/vector/file/json/JsonFileReader.java  | 46 +++++++++++-------
 .../apache/arrow/vector/file/BaseFileTest.java  | 49 ++++++++++++++++++++
 .../apache/arrow/vector/file/TestArrowFile.java | 41 ++++++++++++++++
 .../arrow/vector/file/json/TestJSONFile.java    | 29 ++++++++++++
 5 files changed, 149 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/f8cf91d2/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 470317f..6511efc 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -368,7 +368,7 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector,
     return vector;
   }
 
-  private int lastSet;
+  private int lastSet = 0;
 
   public class Accessor extends BaseRepeatedAccessor {
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/f8cf91d2/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
index 8bb0f26..c6ebd61 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
@@ -44,6 +44,8 @@ import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.Float4Vector;
 import org.apache.arrow.vector.Float8Vector;
 import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.NullableVarBinaryVector;
+import org.apache.arrow.vector.NullableVarCharVector;
 import org.apache.arrow.vector.SmallIntVector;
 import org.apache.arrow.vector.TimeMicroVector;
 import org.apache.arrow.vector.TimeMilliVector;
@@ -63,10 +65,10 @@ import org.apache.arrow.vector.UInt2Vector;
 import org.apache.arrow.vector.UInt4Vector;
 import org.apache.arrow.vector.UInt8Vector;
 import org.apache.arrow.vector.ValueVector;
-import org.apache.arrow.vector.ValueVector.Mutator;
 import org.apache.arrow.vector.VarBinaryVector;
 import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.complex.ListVector;
 import org.apache.arrow.vector.dictionary.Dictionary;
 import org.apache.arrow.vector.dictionary.DictionaryProvider;
 import org.apache.arrow.vector.schema.ArrowVectorType;
@@ -84,7 +86,6 @@ import com.fasterxml.jackson.databind.MappingJsonFactory;
 import com.google.common.base.Objects;
 
 public class JsonFileReader implements AutoCloseable, DictionaryProvider {
-  private final File inputFile;
   private final JsonParser parser;
   private final BufferAllocator allocator;
   private Schema schema;
@@ -93,7 +94,6 @@ public class JsonFileReader implements AutoCloseable, DictionaryProvider {
 
   public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException {
     super();
-    this.inputFile = inputFile;
     this.allocator = allocator;
     MappingJsonFactory jsonFactory = new MappingJsonFactory();
     this.parser = jsonFactory.createParser(inputFile);
@@ -216,10 +216,9 @@ public class JsonFileReader implements AutoCloseable, DictionaryProvider {
     }
   }
 
-  /*
-   * TODO: This method doesn't load some vectors correctly. For instance, it doesn't initialize
-   * `lastSet` in ListVector, VarCharVector, NullableVarBinaryVector A better way of implementing
-   * this function is to use `loadFieldBuffers` methods in FieldVector.
+  /**
+   * TODO: A better way of implementing this function is to use `loadFieldBuffers` methods in
+   * FieldVector to set the inner-vector data as done in `ArrowFileReader`.
    */
   private void readVector(Field field, FieldVector vector) throws JsonParseException, IOException {
     List<ArrowVectorType> vectorTypes = field.getTypeLayout().getVectorTypes();
@@ -234,29 +233,42 @@ public class JsonFileReader implements AutoCloseable, DictionaryProvider {
       if (started && !Objects.equal(field.getName(), name)) {
         throw new IllegalArgumentException("Expected field " + field.getName() + " but got " + name);
       }
+
+      // Initialize the vector with required capacity
       int count = readNextField("count", Integer.class);
+      vector.setInitialCapacity(count);
       vector.allocateNew();
-      vector.getMutator().setValueCount(count);
+
+      // Read inner vectors
       for (int v = 0; v < vectorTypes.size(); v++) {
         ArrowVectorType vectorType = vectorTypes.get(v);
-        BufferBacked innerVector = fieldInnerVectors.get(v);
+        ValueVector valueVector = (ValueVector) fieldInnerVectors.get(v);
         nextFieldIs(vectorType.getName());
         readToken(START_ARRAY);
-        ValueVector valueVector = (ValueVector) innerVector;
-
         int innerVectorCount = vectorType.equals(OFFSET) ? count + 1 : count;
-        valueVector.setInitialCapacity(innerVectorCount);
-        valueVector.allocateNew();
-
         for (int i = 0; i < innerVectorCount; i++) {
           parser.nextToken();
           setValueFromParser(valueVector, i);
         }
-        Mutator mutator = valueVector.getMutator();
-        mutator.setValueCount(innerVectorCount);
         readToken(END_ARRAY);
       }
-      // if children
+
+      // Set lastSet before valueCount to prevent setValueCount from filling empty values
+      switch (vector.getMinorType()) {
+        case LIST:
+          // ListVector starts lastSet from index 0, so lastSet value is always last index written + 1
+          ((ListVector) vector).getMutator().setLastSet(count);
+          break;
+        case VARBINARY:
+          ((NullableVarBinaryVector) vector).getMutator().setLastSet(count - 1);
+          break;
+        case VARCHAR:
+          ((NullableVarCharVector) vector).getMutator().setLastSet(count - 1);
+          break;
+      }
+      vector.getMutator().setValueCount(count);
+
+      // read child vectors, if any
       List<Field> fields = field.getChildren();
       if (!fields.isEmpty()) {
         List<FieldVector> vectorChildren = vector.getChildrenFromFields();

http://git-wip-us.apache.org/repos/asf/arrow/blob/f8cf91d2/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java
----------------------------------------------------------------------
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java
index c05d590..ba62de0 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java
@@ -32,6 +32,7 @@ import org.apache.arrow.vector.NullableDateMilliVector;
 import org.apache.arrow.vector.NullableDecimalVector;
 import org.apache.arrow.vector.NullableIntVector;
 import org.apache.arrow.vector.NullableTimeMilliVector;
+import org.apache.arrow.vector.NullableVarBinaryVector;
 import org.apache.arrow.vector.NullableVarCharVector;
 import org.apache.arrow.vector.ValueVector.Accessor;
 import org.apache.arrow.vector.VectorSchemaRoot;
@@ -541,4 +542,52 @@ public class BaseFileTest {
     writer.setValueCount(count);
     varchar.release();
   }
+
+  protected void writeVarBinaryData(int count, NullableMapVector parent) {
+    Assert.assertTrue(count < 100);
+    ComplexWriter writer = new ComplexWriterImpl("root", parent);
+    MapWriter rootWriter = writer.rootAsMap();
+    ListWriter listWriter = rootWriter.list("list");
+    ArrowBuf varbin = allocator.buffer(count);
+    for (int i = 0; i < count; i++) {
+      varbin.setByte(i, i);
+      listWriter.setPosition(i);
+      listWriter.startList();
+      for (int j = 0; j < i % 3; j++) {
+        listWriter.varBinary().writeVarBinary(0, i + 1, varbin);
+      }
+      listWriter.endList();
+    }
+    writer.setValueCount(count);
+    varbin.release();
+  }
+
+  protected void validateVarBinary(int count, VectorSchemaRoot root) {
+    Assert.assertEquals(count, root.getRowCount());
+    ListVector listVector = (ListVector) root.getVector("list");
+    byte[] expectedArray = new byte[count];
+    int numVarBinaryValues = 0;
+    for (int i = 0; i < count; i++) {
+      expectedArray[i] = (byte) i;
+      Object obj = listVector.getAccessor().getObject(i);
+      List<?> objList = (List) obj;
+      if (i % 3 == 0) {
+        Assert.assertTrue(objList.isEmpty());
+      } else {
+        byte[] expected = Arrays.copyOfRange(expectedArray, 0, i + 1);
+        for (int j = 0; j < i % 3; j++) {
+          byte[] result = (byte[]) objList.get(j);
+          Assert.assertArrayEquals(result, expected);
+          numVarBinaryValues++;
+        }
+      }
+    }
+
+    // ListVector lastSet should be the index of last value + 1
+    Assert.assertEquals(listVector.getMutator().getLastSet(), count);
+
+    // NullableVarBinaryVector lastSet should be the index of last value
+    NullableVarBinaryVector binaryVector = (NullableVarBinaryVector) listVector.getChildrenFromFields().get(0);
+    Assert.assertEquals(binaryVector.getMutator().getLastSet(), numVarBinaryValues - 1);
+  }
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/f8cf91d2/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java
----------------------------------------------------------------------
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java
index c483ba7..81e5898 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java
@@ -622,6 +622,47 @@ public class TestArrowFile extends BaseFileTest {
     }
   }
 
+  @Test
+  public void testWriteReadVarBin() throws IOException {
+    File file = new File("target/mytest_varbin.arrow");
+    ByteArrayOutputStream stream = new ByteArrayOutputStream();
+    int count = COUNT;
+
+    // write
+    try (
+        BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE);
+        NullableMapVector parent = NullableMapVector.empty("parent", vectorAllocator)) {
+      writeVarBinaryData(count, parent);
+      VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root"));
+      validateVarBinary(count, root);
+      write(parent.getChild("root"), file, stream);
+    }
+
+    // read from file
+    try (
+        BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE);
+        FileInputStream fileInputStream = new FileInputStream(file);
+        ArrowFileReader arrowReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator)) {
+      VectorSchemaRoot root = arrowReader.getVectorSchemaRoot();
+      Schema schema = root.getSchema();
+      LOGGER.debug("reading schema: " + schema);
+      Assert.assertTrue(arrowReader.loadNextBatch());
+      validateVarBinary(count, root);
+    }
+
+    // read from stream
+    try (
+        BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE);
+        ByteArrayInputStream input = new ByteArrayInputStream(stream.toByteArray());
+        ArrowStreamReader arrowReader = new ArrowStreamReader(input, readerAllocator)) {
+      VectorSchemaRoot root = arrowReader.getVectorSchemaRoot();
+      Schema schema = root.getSchema();
+      LOGGER.debug("reading schema: " + schema);
+      Assert.assertTrue(arrowReader.loadNextBatch());
+      validateVarBinary(count, root);
+    }
+  }
+
 
   /**
    * Writes the contents of parents to file. If outStream is non-null, also writes it

http://git-wip-us.apache.org/repos/asf/arrow/blob/f8cf91d2/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java
----------------------------------------------------------------------
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java
index 960567f..ee90d34 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java
@@ -285,4 +285,33 @@ public class TestJSONFile extends BaseFileTest {
     }
   }
 
+  @Test
+  public void testWriteReadVarBinJSON() throws IOException {
+    File file = new File("target/mytest_varbin.json");
+    int count = COUNT;
+
+    // write
+    try (
+        BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE);
+        NullableMapVector parent = NullableMapVector.empty("parent", vectorAllocator)) {
+      writeVarBinaryData(count, parent);
+      VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root"));
+      validateVarBinary(count, root);
+      writeJSON(file, new VectorSchemaRoot(parent.getChild("root")), null);
+    }
+
+    // read
+    try (
+        BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE)) {
+      JsonFileReader reader = new JsonFileReader(file, readerAllocator);
+      Schema schema = reader.start();
+      LOGGER.debug("reading schema: " + schema);
+
+      // initialize vectors
+      try (VectorSchemaRoot root = reader.read();) {
+        validateVarBinary(count, root);
+      }
+      reader.close();
+    }
+  }
 }