You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by bo...@apache.org on 2020/01/01 17:11:36 UTC
[commons-compress] 04/06: Merge branch 'COMPRESS-124' of
https://github.com/PeterAlfreadLee/commons-compress into
PeterAlfreadLee-COMPRESS-124
This is an automated email from the ASF dual-hosted git repository.
bodewig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-compress.git
commit 03b7d1ecec4811fe7e6ae63eacb1e63291eaaf6e
Merge: f7f7c53 50569e5
Author: Stefan Bodewig <bo...@apache.org>
AuthorDate: Wed Jan 1 17:26:07 2020 +0100
Merge branch 'COMPRESS-124' of https://github.com/PeterAlfreadLee/commons-compress into PeterAlfreadLee-COMPRESS-124
.../compress/archivers/tar/TarArchiveEntry.java | 53 ++-
.../archivers/tar/TarArchiveInputStream.java | 417 +++++++++++++++++++--
.../archivers/tar/TarArchiveSparseEntry.java | 19 +
.../archivers/tar/TarArchiveStructSparse.java | 81 ++++
.../compress/archivers/tar/TarConstants.java | 24 ++
.../commons/compress/archivers/tar/TarUtils.java | 15 +-
.../commons/compress/utils/BoundedInputStream.java | 9 +
.../compress/archivers/tar/SparseFilesTest.java | 205 +++++++++-
.../archivers/tar/TarArchiveInputStreamTest.java | 10 +-
.../compress/archivers/tar/TarUtilsTest.java | 12 +
src/test/resources/oldgnu_extended_sparse.tar | Bin 0 -> 10240 bytes
11 files changed, 806 insertions(+), 39 deletions(-)
diff --cc src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
index 65b7e32,72b6653..0f6b70f
--- a/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/tar/TarArchiveInputStream.java
@@@ -66,8 -66,16 +66,16 @@@ public class TarArchiveInputStream exte
private long entryOffset;
/** An input stream to read from */
- private final InputStream is;
+ private final InputStream inputStream;
+ /** Input streams for reading sparse entries **/
+ private List<InputStream> sparseInputStreams;
+
+ /** the index of current input stream being read when reading sparse entries */
+ private int currentSparseInputStreamIndex;
+
+ private InputStream sparseInputStream;
+
/** The meta-data about the current entry */
private TarArchiveEntry currEntry;
@@@ -185,7 -196,14 +196,14 @@@
*/
@Override
public void close() throws IOException {
+ // Close all the input streams in sparseInputStreams
+ if(sparseInputStreams != null) {
+ for (InputStream inputStream : sparseInputStreams) {
+ inputStream.close();
+ }
+ }
+
- is.close();
+ inputStream.close();
}
/**
@@@ -243,14 -262,48 +262,48 @@@
return 0;
}
- final long available = entrySize - entryOffset;
- final long skipped = IOUtils.skip(inputStream, Math.min(n, available));
- long available = currEntry.getRealSize() - entryOffset;
- long skipped;
++ final long available = currEntry.getRealSize() - entryOffset;
++ final long skipped;
+ if(!currEntry.isSparse()) {
- skipped = IOUtils.skip(is, Math.min(n, available));
++ skipped = IOUtils.skip(inputStream, Math.min(n, available));
+ } else {
+ skipped = skipSparse(n);
+ }
count(skipped);
entryOffset += skipped;
return skipped;
}
/**
+ * Skip n bytes from the current input stream. If the current input stream doesn't have enough data to skip,
+ * jump to the next input stream and skip the remaining bytes; keep doing this until a total of n bytes are skipped
+ * or all of the input streams have been skipped.
+ *
+ * @param n bytes of data to skip
+ * @return actual bytes of data skipped
+ * @throws IOException
+ */
+ private long skipSparse(final long n) throws IOException {
+ if (sparseInputStreams.size() == 0) {
- return is.skip(n);
++ return inputStream.skip(n);
+ }
+
+ long bytesSkipped = 0;
+ InputStream currentInputStream;
+
+ while (bytesSkipped < n && currentSparseInputStreamIndex < sparseInputStreams.size()) {
+ currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
+ bytesSkipped += currentInputStream.skip(n - bytesSkipped);
+
+ if (bytesSkipped < n) {
+ currentSparseInputStreamIndex++;
+ }
+ }
+
+ return bytesSkipped;
+ }
+
+ /**
* Since we do not support marking just yet, we return false.
*
* @return False.
@@@ -470,20 -523,164 +523,164 @@@
getNextEntry(); // Get the actual file entry
}
+ /**
+ * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes)
+ * may appear multiple times, and they look like:
+ *
+ * GNU.sparse.size=size
+ * GNU.sparse.numblocks=numblocks
+ * repeat numblocks times
+ * GNU.sparse.offset=offset
+ * GNU.sparse.numbytes=numbytes
+ * end repeat
+ *
+ *
+ * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
+ *
+ * GNU.sparse.map
+ * Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+ *
+ *
+ * For PAX Format 1.X:
+ * The sparse map itself is stored in the file data block, preceding the actual file data.
+ * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
+ * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
+ * giving the offset and size of the data block it describes.
+ * @throws IOException
+ */
private void paxHeaders() throws IOException{
- final Map<String, String> headers = parsePaxHeaders(this);
+ List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
+ final Map<String, String> headers = parsePaxHeaders(this, sparseHeaders);
+
+ // for 0.1 PAX Headers
+ if (headers.containsKey("GNU.sparse.map")) {
+ sparseHeaders = parsePAX01SparseHeaders(headers.get("GNU.sparse.map"));
+ }
getNextEntry(); // Get the actual file entry
- applyPaxHeadersToCurrentEntry(headers);
+ applyPaxHeadersToCurrentEntry(headers, sparseHeaders);
+
+ // for 1.0 PAX Format, the sparse map is stored in the file data block
+ if(currEntry.isPaxGNU1XSparse()) {
+ sparseHeaders = parsePAX1XSparseHeaders();
+ currEntry.setSparseHeaders(sparseHeaders);
+ }
+
+ // sparse headers are all done reading, we need to build
+ // sparse input streams using these sparse headers
+ buildSparseInputStreams();
}
- // NOTE, using a Map here makes it impossible to ever support GNU
- // sparse files using the PAX Format 0.0, see
- // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188
- Map<String, String> parsePaxHeaders(final InputStream inputStream)
+ /**
+ * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
+ * GNU.sparse.map
+ * Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+ *
+ * @param sparseMap the sparse map string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+ * @return sparse headers parsed from sparse map
+ * @throws IOException
+ */
+ private List<TarArchiveStructSparse> parsePAX01SparseHeaders(String sparseMap) throws IOException {
+ List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
+ String[] sparseHeaderStrings = sparseMap.split(",");
+
+ for (int i = 0; i < sparseHeaderStrings.length;i += 2) {
+ long sparseOffset = Long.parseLong(sparseHeaderStrings[i]);
+ long sparseNumbytes = Long.parseLong(sparseHeaderStrings[i + 1]);
+ sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes));
+ }
+
+ return sparseHeaders;
+ }
+
+ /**
+ * For PAX Format 1.X:
+ * The sparse map itself is stored in the file data block, preceding the actual file data.
+ * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
+ * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
+ * giving the offset and size of the data block it describes.
+ * @return sparse headers
+ * @throws IOException
+ */
+ private List<TarArchiveStructSparse> parsePAX1XSparseHeaders() throws IOException {
+ // for 1.X PAX Headers
+ List<TarArchiveStructSparse> sparseHeaders = new ArrayList<>();
+ long bytesRead = 0;
+ long[] readResult;
+ long sparseHeadersCount;
+
- readResult = readLineOfNumberForPax1X(is);
++ readResult = readLineOfNumberForPax1X(inputStream);
+ sparseHeadersCount = readResult[0];
+ bytesRead += readResult[1];
+ while (sparseHeadersCount-- > 0) {
- readResult = readLineOfNumberForPax1X(is);
++ readResult = readLineOfNumberForPax1X(inputStream);
+ long sparseOffset = readResult[0];
+ bytesRead += readResult[1];
+
- readResult = readLineOfNumberForPax1X(is);
++ readResult = readLineOfNumberForPax1X(inputStream);
+ long sparseNumbytes = readResult[0];
+ bytesRead += readResult[1];
+ sparseHeaders.add(new TarArchiveStructSparse(sparseOffset, sparseNumbytes));
+ }
+
+ // skip the rest of this record data
+ long bytesToSkip = recordSize - bytesRead % recordSize;
- IOUtils.skip(is, bytesToSkip);
++ IOUtils.skip(inputStream, bytesToSkip);
+ return sparseHeaders;
+ }
+
+ /**
+ * For 1.X PAX Format, the sparse headers are stored in the file data block, preceding the actual file data.
+ * It consists of a series of decimal numbers delimited by newlines.
+ *
+ * @param inputStream the input stream of the tar file
+ * @return the decimal number delimited by '\n', and the bytes read from input stream
+ * @throws IOException
+ */
+ private long[] readLineOfNumberForPax1X(InputStream inputStream) throws IOException {
+ int number;
+ long result = 0;
+ long bytesRead = 0;
+
+ while((number = inputStream.read()) != '\n') {
+ bytesRead += 1;
+ if(number == -1) {
+ throw new IOException("Unexpected EOF when reading parse information of 1.X PAX format");
+ }
+ result = result * 10 + (number - '0');
+ }
+ bytesRead += 1;
+
+ return new long[] {result, bytesRead};
+ }
+
+ /**
+ * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes)
+ * may appear multiple times, and they look like:
+ *
+ * GNU.sparse.size=size
+ * GNU.sparse.numblocks=numblocks
+ * repeat numblocks times
+ * GNU.sparse.offset=offset
+ * GNU.sparse.numbytes=numbytes
+ * end repeat
+ *
+ * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
+ *
+ * GNU.sparse.map
+ * Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+ *
+ * @param inputStream input stream to read keys and values
+ * @param sparseHeaders used in PAX Format 0.0 & 0.1; as the sparse headers may appear multiple times,
+ * they need to be stored in a list, not a map
+ * @return
+ * @throws IOException
+ */
- Map<String, String> parsePaxHeaders(final InputStream i, List<TarArchiveStructSparse> sparseHeaders)
++ Map<String, String> parsePaxHeaders(final InputStream inputStream, List<TarArchiveStructSparse> sparseHeaders)
throws IOException {
final Map<String, String> headers = new HashMap<>(globalPaxHeaders);
+ TarArchiveStructSparse sparseHeader = null;
// Format is "length keyword=value\n";
-- while(true){ // get length
++ while(true) { // get length
int ch;
int len = 0;
int read = 0;
@@@ -637,9 -846,25 +846,25 @@@
throw new IllegalStateException("No current tar entry");
}
+ if (!currEntry.isSparse()) {
+ if (entryOffset >= entrySize) {
+ return -1;
+ }
+ } else {
+ // for sparse entries, there are actually currEntry.getRealSize() bytes to read
+ if (entryOffset >= currEntry.getRealSize()) {
+ return -1;
+ }
+ }
+
numToRead = Math.min(numToRead, available());
- totalRead = inputStream.read(buf, offset, numToRead);
+ if (currEntry.isSparse()) {
+ // for sparse entries, we need to read them in another way
+ totalRead = readSparse(buf, offset, numToRead);
+ } else {
- totalRead = is.read(buf, offset, numToRead);
++ totalRead = inputStream.read(buf, offset, numToRead);
+ }
if (totalRead == -1) {
if (numToRead > 0) {
@@@ -655,6 -880,61 +880,61 @@@
}
/**
+ * For sparse tar entries, there are many "holes"(consisting of all 0) in the file. Only the non-zero data is
+ * stored in tar files, and they are stored separately. The structure of non-zero data is introduced by the
+ * sparse headers using the offset, where a block of non-zero data starts, and numbytes, the length of the
+ * non-zero data block.
+ * When reading sparse entries, the actual data is read out with "holes" and non-zero data combined together
+ * according to the sparse headers.
+ *
+ * @param buf The buffer into which to place bytes read.
+ * @param offset The offset at which to place bytes read.
+ * @param numToRead The number of bytes to read.
+ * @return The number of bytes read, or -1 at EOF.
+ * @throws IOException on error
+ */
+ private int readSparse(final byte[] buf, final int offset, int numToRead) throws IOException {
+ // if there are no actual input streams, just read from the original input stream
+ if (sparseInputStreams.size() == 0) {
- return is.read(buf, offset, numToRead);
++ return inputStream.read(buf, offset, numToRead);
+ }
+
+ if(currentSparseInputStreamIndex >= sparseInputStreams.size()) {
+ return -1;
+ }
+
+ InputStream currentInputStream = sparseInputStreams.get(currentSparseInputStreamIndex);
+ int readLen = currentInputStream.read(buf, offset, numToRead);
+
+ // if the current input stream is the last input stream,
+ // just return the number of bytes read from current input stream
+ if (currentSparseInputStreamIndex == sparseInputStreams.size() - 1) {
+ return readLen;
+ }
+
+ // if EOF of the current input stream is met, move on to the next input stream and recursively call readSparse
+ if (readLen == -1) {
+ currentSparseInputStreamIndex++;
+ return readSparse(buf, offset, numToRead);
+ }
+
+ // if the remaining data of the current input stream is not long enough, move on to the next input stream
+ // and recursively call readSparse
+ if (readLen < numToRead) {
+ currentSparseInputStreamIndex++;
+ int readLenOfNext = readSparse(buf, offset + readLen, numToRead - readLen);
+ if (readLenOfNext == -1) {
+ return readLen;
+ }
+
+ return readLen + readLenOfNext;
+ }
+
+ // if the remaining data of the current input stream is enough (which means readLen == numToRead), just return readLen
+ return readLen;
+ }
+
+ /**
* Whether this class is able to read the given entry.
*
* <p>May return false if the current entry is a sparse file.</p>
@@@ -745,4 -1025,85 +1025,85 @@@
signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
}
+ /**
+ * Build the input streams consisting of all-zero input streams and non-zero input streams.
+ * When reading from the non-zero input streams, the data is actually read from the original input stream.
+ * The size of each input stream is introduced by the sparse headers.
+ *
+ * NOTE : Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the
+ * 0 size input streams because they are meaningless.
+ */
+ private void buildSparseInputStreams() throws IOException {
+ currentSparseInputStreamIndex = -1;
+ sparseInputStreams = new ArrayList<>();
+ InputStream zeroInputStream = new TarArchiveSparseZeroInputStream();
+
+ long offset = 0;
+ List<TarArchiveStructSparse> sparseHeaders = currEntry.getSparseHeaders();
+ // sort the sparse headers in case they are written in wrong order
+ if (sparseHeaders != null && sparseHeaders.size() > 1) {
+ final Comparator<TarArchiveStructSparse> sparseHeaderComparator = new Comparator<TarArchiveStructSparse>() {
+ @Override
+ public int compare(final TarArchiveStructSparse p, final TarArchiveStructSparse q) {
+ Long pOffset = p.getOffset();
+ Long qOffset = q.getOffset();
+ return pOffset.compareTo(qOffset);
+ }
+ };
+ Collections.sort(sparseHeaders, sparseHeaderComparator);
+ }
+
+ for (TarArchiveStructSparse sparseHeader : sparseHeaders) {
+ if (sparseHeader.getOffset() == 0 && sparseHeader.getNumbytes() == 0) {
+ break;
+ }
+
+ if ((sparseHeader.getOffset() - offset) < 0) {
+ throw new IOException("Corrupted struct sparse detected");
+ }
+
+ // only store the input streams with non-zero size
+ if ((sparseHeader.getOffset() - offset) > 0) {
+ sparseInputStreams.add(new BoundedInputStream(zeroInputStream, sparseHeader.getOffset() - offset));
+ }
+
+ // only store the input streams with non-zero size
+ if (sparseHeader.getNumbytes() > 0) {
- sparseInputStreams.add(new BoundedInputStream(is, sparseHeader.getNumbytes()));
++ sparseInputStreams.add(new BoundedInputStream(inputStream, sparseHeader.getNumbytes()));
+ }
+
+ offset = sparseHeader.getOffset() + sparseHeader.getNumbytes();
+ }
+
+ if (sparseInputStreams.size() > 0) {
+ currentSparseInputStreamIndex = 0;
+ }
+ }
+
+ /**
+ * This is an input stream that always returns 0;
+ * it is used when reading the "holes" of a sparse file.
+ */
+ public class TarArchiveSparseZeroInputStream extends InputStream {
+ /**
+ * Just return 0
+ * @return
+ * @throws IOException
+ */
+ @Override
+ public int read() throws IOException {
+ return 0;
+ }
+
+ /**
+ * there's nothing to do when skipping
+ *
+ * @param n bytes to skip
+ * @return bytes actually skipped
+ */
+ @Override
+ public long skip(final long n) {
+ return n;
+ }
+ }
}