You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by ko...@apache.org on 2019/10/28 19:36:47 UTC

[avro] branch master updated: AVRO-2432: Handle Empty Datafiles (#691)

This is an automated email from the ASF dual-hosted git repository.

kojiromike pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git


The following commit(s) were added to refs/heads/master by this push:
     new 7d87cde  AVRO-2432: Handle Empty Datafiles (#691)
7d87cde is described below

commit 7d87cde06253913e9ef23c671e9ce092bc35dfe3
Author: Michael A. Smith <mi...@smith-li.com>
AuthorDate: Mon Oct 28 15:35:53 2019 -0400

    AVRO-2432: Handle Empty Datafiles (#691)
---
 lang/py/src/avro/datafile.py         | 15 ++++-----------
 lang/py/test/test_datafile.py        | 14 ++++++++++++++
 lang/py3/avro/datafile.py            | 19 ++++---------------
 lang/py3/avro/tests/test_datafile.py | 15 +++++++++++++++
 4 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/lang/py/src/avro/datafile.py b/lang/py/src/avro/datafile.py
index 75a8e4a..0d29a6a 100644
--- a/lang/py/src/avro/datafile.py
+++ b/lang/py/src/avro/datafile.py
@@ -373,21 +373,14 @@ class DataFileReader(object):
     if proposed_sync_marker != self.sync_marker:
       self.reader.seek(-SYNC_SIZE, 1)
       return False
-    else:
-      return True
+    return True
 
-  # TODO(hammer): handle block of length zero
-  # TODO(hammer): clean this up with recursion
   def next(self):
     """Return the next datum in the file."""
-    if self.block_count == 0:
-      if self.is_EOF():
+    while self.block_count == 0:
+      if self.is_EOF() or (self._skip_sync() and self.is_EOF()):
         raise StopIteration
-      elif self._skip_sync():
-        if self.is_EOF(): raise StopIteration
-        self._read_block_header()
-      else:
-        self._read_block_header()
+      self._read_block_header()
 
     datum = self.datum_reader.read(self.datum_decoder)
     self.block_count -= 1
diff --git a/lang/py/test/test_datafile.py b/lang/py/test/test_datafile.py
index bceb071..b020222 100644
--- a/lang/py/test/test_datafile.py
+++ b/lang/py/test/test_datafile.py
@@ -203,5 +203,19 @@ class TestDataFile(unittest.TestCase):
         datums.append(datum)
     self.assertTrue(reader.closed)
 
+  def test_empty_datafile(self):
+    """A reader should not fail to read a file consisting of a single empty block."""
+    sample_schema = schema.parse(SCHEMAS_TO_VALIDATE[1][0])
+    with datafile.DataFileWriter(open(FILENAME, 'wb'), io.DatumWriter(),
+        sample_schema) as dfw:
+      dfw.flush()
+      # Write an empty block
+      dfw.encoder.write_long(0)
+      dfw.encoder.write_long(0)
+      dfw.writer.write(dfw.sync_marker)
+
+    with datafile.DataFileReader(open(FILENAME, 'rb'), io.DatumReader()) as dfr:
+      self.assertEqual([], list(dfr))
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/lang/py3/avro/datafile.py b/lang/py3/avro/datafile.py
index 6a3d46e..9a671f1 100644
--- a/lang/py3/avro/datafile.py
+++ b/lang/py3/avro/datafile.py
@@ -403,10 +403,6 @@ class DataFileReader(object):
   def __iter__(self):
     return self
 
-  def __next__(self):
-    """Implements the iterator interface."""
-    return next(self)
-
   # read-only properties
   @property
   def reader(self):
@@ -555,21 +551,14 @@ class DataFileReader(object):
     if proposed_sync_marker != self.sync_marker:
       self.reader.seek(-SYNC_SIZE, 1)
       return False
-    else:
-      return True
+    return True
 
-  # TODO: handle block of length zero
-  # TODO: clean this up with recursion
   def __next__(self):
     """Return the next datum in the file."""
-    if self.block_count == 0:
-      if self.is_EOF():
+    while self.block_count == 0:
+      if self.is_EOF() or (self._skip_sync() and self.is_EOF()):
         raise StopIteration
-      elif self._skip_sync():
-        if self.is_EOF(): raise StopIteration
-        self._read_block_header()
-      else:
-        self._read_block_header()
+      self._read_block_header()
 
     datum = self.datum_reader.read(self.datum_decoder)
     self._block_count -= 1
diff --git a/lang/py3/avro/tests/test_datafile.py b/lang/py3/avro/tests/test_datafile.py
index 3342098..9ad6a32 100644
--- a/lang/py3/avro/tests/test_datafile.py
+++ b/lang/py3/avro/tests/test_datafile.py
@@ -287,6 +287,21 @@ class TestDataFile(unittest.TestCase):
           datums.append(datum)
       self.assertTrue(reader.closed)
 
+  def test_empty_datafile(self):
+    """A reader should not fail to read a file consisting of a single empty block."""
+    file_path = self.NewTempFile()
+    sample_schema = schema.parse(SCHEMAS_TO_VALIDATE[1][0])
+    with datafile.DataFileWriter(open(file_path, 'wb'), io.DatumWriter(),
+        sample_schema) as dfw:
+      dfw.flush()
+      # Write an empty block
+      dfw.encoder.write_long(0)
+      dfw.encoder.write_long(0)
+      dfw.writer.write(dfw.sync_marker)
+
+    with datafile.DataFileReader(open(file_path, 'rb'), io.DatumReader()) as dfr:
+      self.assertEqual([], list(dfr))
+
 
 # ------------------------------------------------------------------------------