You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by bo...@apache.org on 2020/01/01 17:11:36 UTC
[commons-compress] 04/06: Merge branch 'COMPRESS-124' of https://github.com/PeterAlfreadLee/commons-compress into PeterAlfreadLee-COMPRESS-124

This is an automated email from the ASF dual-hosted git repository.

bodewig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-compress.git

commit 03b7d1ecec4811fe7e6ae63eacb1e63291eaaf6e
Merge: f7f7c53 50569e5
Author: Stefan Bodewig <bo...@apache.org>
AuthorDate: Wed Jan 1 17:26:07 2020 +0100

    Merge branch 'COMPRESS-124' of https://github.com/PeterAlfreadLee/commons-compress into PeterAlfreadLee-COMPRESS-124

 .../compress/archivers/tar/TarArchiveEntry.java    |  53 ++-
 .../archivers/tar/TarArchiveInputStream.java       | 417 +++++++++++++++++++--
 .../archivers/tar/TarArchiveSparseEntry.java       |  19 +
 .../archivers/tar/TarArchiveStructSparse.java      |  81 ++++
 .../compress/archivers/tar/TarConstants.java       |  24 ++
 .../commons/compress/archivers/tar/TarUtils.java   |  15 +-
 .../commons/compress/utils/BoundedInputStream.java |   9 +
 .../compress/archivers/tar/SparseFilesTest.java    | 205 +++++++++-
 .../archivers/tar/TarArchiveInputStreamTest.java   |  10 +-
 .../compress/archivers/tar/TarUtilsTest.java       |  12 +
 src/test/resources/oldgnu_extended_sparse.tar      | Bin 0 -> 10240 bytes
 11 files changed, 806 insertions(+), 39 deletions(-)

diff --cc src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
index 65b7e32,72b6653..0f6b70f
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
@@@ -66,8 -66,16 +66,16 @@@ public class TarArchiveInputStream exte
      private long entryOffset;
  
      /** An input stream to read from */
 -    private final InputStream is;
 +    private final InputStream inputStream;
  
+     /** Input streams for reading sparse entries **/
+     private List<InputStream> sparseInputStreams;
+ 
+     /** the index of current input stream being read when reading sparse entries */
+     private int currentSparseInputStreamIndex;
+ 
+     private InputStream sparseInputStream;
+ 
      /** The meta-data about the current entry */
      private TarArchiveEntry currEntry;
  
@@@ -185,7 -196,14 +196,14 @@@
       */
      @Override
      public void close() throws IOException {
+         // Close all the input streams in sparseInputStreams
+         if(sparseInputStreams != null) {
+             for (InputStream inputStream : sparseInputStreams) {
+                 inputStream.close();
+             }
+         }
+ 
 -        is.close();
 +        inputStream.close();
      }
  
      /**
@@@ -243,14 -262,48 +262,48 @@@
              return 0;
          }
  
-         final long available = entrySize - entryOffset;
-         final long skipped = IOUtils.skip(inputStream, Math.min(n, available));
 -        long available = currEntry.getRealSize() - entryOffset;
 -        long skipped;
++        final long available = currEntry.getRealSize() - entryOffset;
++        final long skipped;
+         if(!currEntry.isSparse()) {
 -            skipped = IOUtils.skip(is, Math.min(n, available));
++            skipped = IOUtils.skip(inputStream, Math.min(n, available));
+         } else {
+             skipped = skipSparse(n);
+         }
          count(skipped);
          entryOffset += skipped;
          return skipped;
      }
  
      /**
+      * Skips up to n bytes of a sparse entry by walking the sparse input streams in order: whenever the
+      * current stream skips fewer bytes than are still requested, the index advances to the next stream,
+      * until n bytes have been skipped in total or no further sparse input stream is left.
+      * If the list of sparse input streams is empty, the skip is delegated to the underlying input stream.
+      *
+      * NOTE(review): a short skip is taken to mean that the current stream is exhausted and the index is
+      * advanced; InputStream.skip is allowed to skip less without being at EOF - confirm the streams
+      * stored in sparseInputStreams never do that.
+      *
+      * @param n bytes of data to skip
+      * @return actual bytes of data skipped
+      * @throws IOException if skipping on one of the streams fails
+      */
+     private long skipSparse(final long n) throws IOException {
+         if (sparseInputStreams.size() == 0) {
 -            return is.skip(n);
++            return inputStream.skip(n);
+         }
+ 
+         long bytesSkipped = 0;
+         InputStream currentInputStream;
+ 
+         while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) {
+             currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
+             bytesSkipped += currentInputStream.skip(n - bytesSkipped);
+ 
+             if (bytesSkipped < n) {
+                 currentSparseInputStreamIndex++;
+             }
+         }
+ 
+         return bytesSkipped;
+     }
+ 
+     /**
       * Since we do not support marking just yet, we return false.
       *
       * @return False.
@@@ -470,20 -523,164 +523,164 @@@
          getNextEntry(); // Get the actual file entry
      }
  
+     /**
+      * Reads a PAX extended header, advances to the entry that follows it, applies the headers to that
+      * entry and finally builds the sparse input streams for it. The sparse headers come from one of
+      * three places, depending on the PAX sparse format in use:
+      *
+      * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes)
+      * may appear multiple times, and they look like:
+      *
+      * GNU.sparse.size=size
+      * GNU.sparse.numblocks=numblocks
+      * repeat numblocks times
+      *   GNU.sparse.offset=offset
+      *   GNU.sparse.numbytes=numbytes
+      * end repeat
+      *
+      *
+      * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
+      *
+      * GNU.sparse.map
+      *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+      *
+      *
+      * For PAX Format 1.X:
+      * The sparse map itself is stored in the file data block, preceding the actual file data.
+      * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
+      * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
+      * giving the offset and size of the data block it describes.
+      *
+      * NOTE(review): currEntry is used right after getNextEntry() without a null check - confirm that
+      * a PAX header is always followed by an actual entry, or that a missing entry is handled elsewhere.
+      *
+      * @throws IOException if reading the headers, the next entry or the 1.X sparse map fails
+      */
      private void paxHeaders() throws IOException{
-         final Map<String, String> headers = parsePaxHeaders(this);
+         List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
+         final Map<String, String> headers = parsePaxHeaders(this, sparseHeaders);
+ 
+         // for PAX Format 0.1 the whole sparse map is held in the single GNU.sparse.map header
+         if (headers.containsKey("GNU.sparse.map")) {
+             sparseHeaders = parsePAX01SparseHeaders(headers.get("GNU.sparse.map"));
+         }
          getNextEntry(); // Get the actual file entry
-         applyPaxHeadersToCurrentEntry(headers);
+         applyPaxHeadersToCurrentEntry(headers, sparseHeaders);
+ 
+         // for PAX Format 1.X the sparse map is stored in the file data block of the entry itself
+         if(currEntry.isPaxGNU1XSparse()) {
+             sparseHeaders = parsePAX1XSparseHeaders();
+             currEntry.setSparseHeaders(sparseHeaders);
+         }
+ 
+         // all sparse headers have been read by now, so the sparse
+         // input streams can be built from them
+         buildSparseInputStreams();
      }
  
-     // NOTE, using a Map here makes it impossible to ever support GNU
-     // sparse files using the PAX Format 0.0, see
-     // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188
-     Map<String, String> parsePaxHeaders(final InputStream inputStream)
+     /**
+      * Parses the value of the PAX Format 0.1 GNU.sparse.map header into a list of sparse headers.
+      *
+      * GNU.sparse.map
+      *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+      *
+      * The values are consumed in pairs: the first of each pair is the offset, the second the number of bytes.
+      *
+      * NOTE(review): the number of values is not checked to be even, so a map with an odd number of values
+      * indexes past the end of the split array; a non-numeric value makes Long.parseLong throw
+      * NumberFormatException. Neither case is reported as an IOException - confirm callers can cope with that.
+      *
+      * @param sparseMap the sparse map string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+      * @return sparse headers parsed from sparse map, in the order they appear in the string
+      * @throws IOException declared by the signature; NOTE(review): no statement in this method appears to throw it
+      */
+     private List<TarArchiveStructSparse> parsePAX01SparseHeaders(String sparseMap) throws IOException {
+         List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
+         String[] sparseHeaderStrings = sparseMap.split(",");
+ 
+         for (int i = 0; i < sparseHeaderStrings.length;i += 2) {
+             long sparseOffset = Long.parseLong(sparseHeaderStrings[i]);
+             long sparseNumbytes = Long.parseLong(sparseHeaderStrings[i + 1]);
+             sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes));
+         }
+ 
+         return sparseHeaders;
+     }
+ 
+     /**
+      * Reads the sparse map of a PAX Format 1.X entry directly from the underlying input stream.
+      *
+      * For PAX Format 1.X:
+      * The sparse map itself is stored in the file data block, preceding the actual file data.
+      * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
+      * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
+      * giving the offset and size of the data block it describes.
+      *
+      * After the last map entry the rest of the current record is skipped, computed from the number of bytes consumed.
+      *
+      * NOTE(review): bytesToSkip is recordSize - bytesRead % recordSize, which is a full recordSize when bytesRead
+      * is an exact multiple of recordSize - confirm that a map ending exactly on a record boundary does not
+      * cause one record of file data to be skipped.
+      * NOTE(review): the value returned by IOUtils.skip is not checked, and the entry count read from the
+      * stream is used as the loop bound without any upper limit.
+      *
+      * @return sparse headers, in the order they appear in the map
+      * @throws IOException if reading from the underlying stream fails or the stream ends inside the map
+      */
+     private List<TarArchiveStructSparse> parsePAX1XSparseHeaders() throws IOException {
+         // for 1.X PAX Headers
+         List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
+         long bytesRead = 0;
+         long[] readResult;
+         long sparseHeadersCount;
+ 
 -        readResult = readLineOfNumberForPax1X(is);
++        readResult = readLineOfNumberForPax1X(inputStream);
+         sparseHeadersCount = readResult[0];
+         bytesRead += readResult[1];
+         while (sparseHeadersCount-- > 0) {
 -            readResult = readLineOfNumberForPax1X(is);
++            readResult = readLineOfNumberForPax1X(inputStream);
+             long sparseOffset = readResult[0];
+             bytesRead += readResult[1];
+ 
 -            readResult = readLineOfNumberForPax1X(is);
++            readResult = readLineOfNumberForPax1X(inputStream);
+             long sparseNumbytes = readResult[0];
+             bytesRead += readResult[1];
+             sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes));
+         }
+ 
+         // skip the padding between the end of the map and the end of the current record
+         long bytesToSkip = recordSize - bytesRead % recordSize;
 -        IOUtils.skip(is, bytesToSkip);
++        IOUtils.skip(inputStream, bytesToSkip);
+         return sparseHeaders;
+     }
+ 
+     /**
+      * Reads a single newline-terminated decimal number from the stream, one byte at a time.
+      *
+      * For 1.X PAX Format, the sparse headers are stored in the file data block, preceding the actual file data.
+      * It consists of a series of decimal numbers delimited by newlines.
+      *
+      * NOTE(review): the bytes are not checked to be ASCII digits - any byte other than the newline is folded
+      * into the result as its distance from the digit zero - and the accumulated value is not checked for overflow.
+      *
+      * @param inputStream the input stream of the tar file
+      * @return a two element array: index 0 holds the decimal number that was read, index 1 the number of
+      *         bytes consumed from the input stream, including the terminating '\n'
+      * @throws IOException if the stream ends before a '\n' has been read
+      */
+     private long[] readLineOfNumberForPax1X(InputStream inputStream) throws IOException {
+         int number;
+         long result = 0;
+         long bytesRead = 0;
+ 
+         while((number = inputStream.read()) != '\n') {
+             bytesRead += 1;
+             if(number == -1) {
+                 throw new IOException("Unexpected EOF when reading parse information of 1.X PAX format");
+             }
+             result = result * 10 + (number - '0');
+         }
+         bytesRead += 1;
+ 
+         return new long[] {result, bytesRead};
+     }
+ 
+     /**
+      * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes)
+      * may appear multiple times, and they look like:
+      *
+      * GNU.sparse.size=size
+      * GNU.sparse.numblocks=numblocks
+      * repeat numblocks times
+      *   GNU.sparse.offset=offset
+      *   GNU.sparse.numbytes=numbytes
+      * end repeat
+      *
+      * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
+      *
+      * GNU.sparse.map
+      *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+      *
+      * @param inputStream   input stream to read keys and values from
+      * @param sparseHeaders used in PAX Format 0.0 & 0.1, as it may appear multi times,
+      *                      the sparse headers need to be stored in an array, not a map
+      * @return
+      * @throws IOException
+      */
 -    Map<String, String> parsePaxHeaders(final InputStream i, List<TarArchiveStructSparse> sparseHeaders)
++    Map<String, String> parsePaxHeaders(final InputStream inputStream, List<TarArchiveStructSparse> sparseHeaders)
          throws IOException {
          final Map<String, String> headers = new HashMap<>(globalPaxHeaders);
+         TarArchiveStructSparse sparseHeader = null;
          // Format is "length keyword=value\n";
--        while(true){ // get length
++        while(true) { // get length
              int ch;
              int len = 0;
              int read = 0;
@@@ -637,9 -846,25 +846,25 @@@
              throw new IllegalStateException("No current tar entry");
          }
  
+         if (!currEntry.isSparse()) {
+             if (entryOffset >= entrySize) {
+                 return -1;
+             }
+         } else {
+             // for sparse entries, there are actually currEntry.getRealSize() bytes to read
+             if (entryOffset >= currEntry.getRealSize()) {
+                 return -1;
+             }
+         }
+ 
          numToRead = Math.min(numToRead, available());
  
-         totalRead = inputStream.read(buf, offset, numToRead);
+         if (currEntry.isSparse()) {
+             // for sparse entries, we need to read them in another way
+             totalRead = readSparse(buf, offset, numToRead);
+         } else {
 -            totalRead = is.read(buf, offset, numToRead);
++            totalRead = inputStream.read(buf, offset, numToRead);
+         }
  
          if (totalRead == -1) {
              if (numToRead > 0) {
@@@ -655,6 -880,61 +880,61 @@@
      }
  
      /**
+      * Reads the content of a sparse entry, stitching "holes" and stored data together.
+      *
+      * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only the non-zero data is
+      * stored in tar files, and they are stored separately. The structure of non-zero data is introduced by the
+      * sparse headers using the offset, where a block of non-zero data starts, and numbytes, the length of the
+      * non-zero data block.
+      * When reading sparse entries, the actual data is read out with "holes" and non-zero data combined together
+      * according to the sparse headers.
+      *
+      * The read is served from the stream at currentSparseInputStreamIndex; when that stream reports EOF or
+      * returns fewer bytes than requested, the index is advanced and the remainder is read recursively from
+      * the following stream. If the list of sparse input streams is empty, the underlying stream is read instead.
+      *
+      * NOTE(review): a short read is taken to mean that the current stream is exhausted; InputStream.read may
+      * return fewer bytes without being at EOF - confirm the streams stored in sparseInputStreams never do that.
+      *
+      * @param buf The buffer into which to place bytes read.
+      * @param offset The offset at which to place bytes read.
+      * @param numToRead The number of bytes to read.
+      * @return The number of bytes read, or -1 at EOF.
+      * @throws IOException on error
+      */
+     private int readSparse(final byte[] buf, final int offset, int numToRead) throws IOException {
+         // if there are no actual input streams, just read from the original input stream
+         if (sparseInputStreams.size() == 0) {
 -            return is.read(buf, offset, numToRead);
++            return inputStream.read(buf, offset, numToRead);
+         }
+ 
+         if(currentSparseInputStreamIndex >= sparseInputStreams.size()) {
+             return -1;
+         }
+ 
+         InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
+         int readLen = currentInputStream.read(buf, offset, numToRead);
+ 
+         // if the current input stream is the last input stream,
+         // just return the number of bytes read from current input stream
+         if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
+             return readLen;
+         }
+ 
+         // if EOF of the current input stream is met, move on to the next input stream and recursively call read
+         if (readLen == -1) {
+             currentSparseInputStreamIndex++;
+             return readSparse(buf, offset, numToRead);
+         }
+ 
+         // if the remaining data of the current input stream is not long enough, move on to the next
+         // input stream and recursively call read for the bytes still missing
+         if (readLen < numToRead) {
+             currentSparseInputStreamIndex++;
+             int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
+             if (readLenOfNext == -1) {
+                 return readLen;
+             }
+ 
+             return readLen + readLenOfNext;
+         }
+ 
+         // if the remaining data of the current input stream is enough (which means readLen == numToRead), just return readLen
+         return readLen;
+     }
+ 
+     /**
       * Whether this class is able to read the given entry.
       *
       * <p>May return false if the current entry is a sparse file.</p>
@@@ -745,4 -1025,85 +1025,85 @@@
                          signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
      }
  
+     /**
+      * Build the input streams consisting of all-zero input streams and non-zero input streams.
+      * When reading from the non-zero input streams, the data is actually read from the original input stream.
+      * The size of each input stream is introduced by the sparse headers.
+      *
+      * The sparse headers of the current entry are sorted by offset (in place) and then walked once: the gap
+      * before each header becomes a bounded all-zero stream, the data of the header a bounded view of the
+      * original input stream. currentSparseInputStreamIndex is 0 afterwards if at least one stream was
+      * created, and -1 otherwise.
+      *
+      * NOTE : Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the
+      *        0 size input streams because they are meaningless.
+      *
+      * NOTE(review): only the sorting is guarded by the null check; the loop over sparseHeaders runs
+      * unconditionally - confirm getSparseHeaders() never returns null for entries that reach this method.
+      * NOTE(review): the loop stops at the first header whose offset and numbytes are both 0. Sorting by
+      * offset places such a header ahead of every header with an offset greater than 0, so the headers
+      * after it are never turned into streams - confirm this is intended.
+      * NOTE(review): no all-zero stream is created for a hole after the last data block - confirm trailing
+      * holes are handled elsewhere.
+      *
+      * @throws IOException if a header starts before the end of the data of the preceding header
+      */
+     private void buildSparseInputStreams() throws IOException {
+         currentSparseInputStreamIndex = -1;
+         sparseInputStreams = new ArrayList<>();
+         InputStream zeroInputStream = new TarArchiveSparseZeroInputStream();
+ 
+         long offset = 0;
+         List<TarArchiveStructSparse> sparseHeaders = currEntry.getSparseHeaders();
+         // sort the sparse headers in case they are written in wrong order
+         if (sparseHeaders != null && sparseHeaders.size() > 1) {
+             final Comparator<TarArchiveStructSparse> sparseHeaderComparator = new Comparator<TarArchiveStructSparse>() {
+                 @Override
+                 public int compare(final TarArchiveStructSparse p, final TarArchiveStructSparse q) {
+                     Long pOffset = p.getOffset();
+                     Long qOffset = q.getOffset();
+                     return pOffset.compareTo(qOffset);
+                 }
+             };
+             Collections.sort(sparseHeaders, sparseHeaderComparator);
+         }
+ 
+         for (TarArchiveStructSparse sparseHeader : sparseHeaders) {
+             if (sparseHeader.getOffset() == 0 && sparseHeader.getNumbytes() == 0) {
+                 break;
+             }
+ 
+             if ((sparseHeader.getOffset() - offset) < 0) {
+                 throw new IOException("Corrupted struct sparse detected");
+             }
+ 
+             // the hole before this header: only stored if it has a non-zero size
+             if ((sparseHeader.getOffset() - offset) > 0) {
+                 sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset() - offset));
+             }
+ 
+             // the data of this header: only stored if it has a non-zero size
+             if (sparseHeader.getNumbytes() > 0) {
 -                sparseInputStreams.add(new BoundedInputStream(is, sparseHeader.getNumbytes()));
++                sparseInputStreams.add(new BoundedInputStream(inputStream, sparseHeader.getNumbytes()));
+             }
+ 
+             offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
+         }
+ 
+         if (sparseInputStreams.size() > 0) {
+             currentSparseInputStreamIndex = 0;
+         }
+     }
+ 
+     /**
+      * An input stream that returns 0 for every byte read and never signals EOF by itself;
+      * it is used when reading the "holes" of a sparse file.
+      *
+      * NOTE(review): this is a public, non-static inner class although it uses no state of the
+      * enclosing instance - consider whether it needs to be public or an inner class at all.
+      */
+     public class TarArchiveSparseZeroInputStream extends InputStream {
+         /**
+          * Returns a zero byte on every call.
+          * @return always 0
+          * @throws IOException declared by the signature, never thrown by this implementation
+          */
+         @Override
+         public int read() throws IOException {
+             return 0;
+         }
+ 
+         /**
+          * There is nothing to do when skipping, the requested number of bytes is reported as skipped.
+          *
+          * @param n bytes to skip
+          * @return bytes actually skipped, which is always n
+          */
+         @Override
+         public long skip(final long n) {
+             return n;
+         }
+     }
  }