You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@commons.apache.org by GitBox <gi...@apache.org> on 2020/12/25 20:38:06 UTC

[GitHub] [commons-compress] theobisproject commented on a change in pull request #113: COMPRESS-540: Implement TarFile to allow random access to tar files

theobisproject commented on a change in pull request #113:
URL: https://github.com/apache/commons-compress/pull/113#discussion_r548909848



##########
File path: src/main/java/org/apache/commons/compress/archivers/tar/TarFile.java
##########
@@ -0,0 +1,732 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one or more
+ *  contributor license agreements.  See the NOTICE file distributed with
+ *  this work for additional information regarding copyright ownership.
+ *  The ASF licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+package org.apache.commons.compress.archivers.tar;
+
+import java.io.ByteArrayOutputStream;
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.compress.archivers.zip.ZipEncoding;
+import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
+import org.apache.commons.compress.utils.ArchiveUtils;
+import org.apache.commons.compress.utils.BoundedInputStream;
+import org.apache.commons.compress.utils.BoundedNIOInputStream;
+import org.apache.commons.compress.utils.BoundedSeekableByteChannelInputStream;
+import org.apache.commons.compress.utils.SeekableInMemoryByteChannel;
+
+/**
+ * The TarFile provides random access to UNIX archives.
+ * @since 1.21
+ */
+public class TarFile implements Closeable {
+
+    private static final int SMALL_BUFFER_SIZE = 256;
+
+    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];
+
+    private final SeekableByteChannel archive;
+
+    /**
+     * The encoding of the tar file. It is handed to every parsed entry header and is
+     * used to decode GNU long names and GNU long link names.
+     */
+    private final ZipEncoding zipEncoding;
+
+    private final LinkedList<TarArchiveEntry> entries = new LinkedList<>();
+
+    private final int blockSize;
+
+    private final boolean lenient;
+
+    private final int recordSize;
+
+    private final ByteBuffer recordBuffer;
+
+    // The global sparse headers (only used in PAX Format 0.X); applied to entries together with the global PAX headers
+    private final List<TarArchiveStructSparse> globalSparseHeaders = new ArrayList<>();
+
+    private boolean hasHitEOF;
+
+    /**
+     * The meta-data about the current entry, i.e. the entry whose header was read most
+     * recently. It is reset to null when no further header record can be read.
+     */
+    private TarArchiveEntry currEntry;
+
+    // The global PAX headers; when non-empty they are applied to each entry that is not itself a PAX header
+    private Map<String, String> globalPaxHeaders = new HashMap<>();
+
+    private final Map<String, List<InputStream>> sparseInputStreams = new HashMap<>();
+
+    /**
+     * Creates a TarFile for an archive that is held completely in memory.
+     * The default block size and record size are used, a {@code null} encoding
+     * name is passed on and lenient parsing is switched off.
+     *
+     * @param content the bytes of the tar archive
+     * @throws IOException when reading the tar archive fails
+     */
+    public TarFile(final byte[] content) throws IOException {
+        this(
+            new SeekableInMemoryByteChannel(content),
+            TarConstants.DEFAULT_BLKSIZE,
+            TarConstants.DEFAULT_RCDSIZE,
+            null,
+            false);
+    }
+
+    /**
+     * Creates a TarFile for an archive that is held completely in memory and
+     * whose names use the given encoding. The default block size and record
+     * size are used and lenient parsing is switched off.
+     *
+     * @param content  the bytes of the tar archive
+     * @param encoding the name of the encoding to use
+     * @throws IOException when reading the tar archive fails
+     */
+    public TarFile(final byte[] content, final String encoding) throws IOException {
+        this(
+            new SeekableInMemoryByteChannel(content),
+            TarConstants.DEFAULT_BLKSIZE,
+            TarConstants.DEFAULT_RCDSIZE,
+            encoding,
+            false);
+    }
+
+    /**
+     * Creates a TarFile for an archive that is held completely in memory, with
+     * a caller-selected leniency. The default block size and record size are
+     * used and a {@code null} encoding name is passed on.
+     *
+     * @param content the bytes of the tar archive
+     * @param lenient when {@code true}, illegal values for group/userid, mode, device numbers and timestamp
+     *                are ignored and the affected fields are set to {@link TarArchiveEntry#UNKNOWN};
+     *                when {@code false}, such illegal fields cause an exception instead
+     * @throws IOException when reading the tar archive fails
+     */
+    public TarFile(final byte[] content, final boolean lenient) throws IOException {
+        this(
+            new SeekableInMemoryByteChannel(content),
+            TarConstants.DEFAULT_BLKSIZE,
+            TarConstants.DEFAULT_RCDSIZE,
+            null,
+            lenient);
+    }
+
+    /**
+     * Creates a TarFile for an archive stored in the given file.
+     * The file is converted to a {@link Path} and the work is delegated to the
+     * path-based constructor.
+     *
+     * @param archive the file containing the tar archive
+     * @throws IOException when reading the tar archive fails
+     */
+    public TarFile(final File archive) throws IOException {
+        this(archive.toPath());
+    }
+
+    /**
+     * Creates a TarFile for an archive stored in the given file, using the
+     * given encoding. The file is converted to a {@link Path} and the work is
+     * delegated to the path-based constructor.
+     *
+     * @param archive  the file containing the tar archive
+     * @param encoding the name of the encoding to use
+     * @throws IOException when reading the tar archive fails
+     */
+    public TarFile(final File archive, final String encoding) throws IOException {
+        this(archive.toPath(), encoding);
+    }
+
+    /**
+     * Creates a TarFile for an archive stored in the given file, with a
+     * caller-selected leniency. The file is converted to a {@link Path} and the
+     * work is delegated to the path-based constructor.
+     *
+     * @param archive the file containing the tar archive
+     * @param lenient when {@code true}, illegal values for group/userid, mode, device numbers and timestamp
+     *                are ignored and the affected fields are set to {@link TarArchiveEntry#UNKNOWN};
+     *                when {@code false}, such illegal fields cause an exception instead
+     * @throws IOException when reading the tar archive fails
+     */
+    public TarFile(final File archive, final boolean lenient) throws IOException {
+        this(archive.toPath(), lenient);
+    }
+
+    /**
+     * Creates a TarFile for the archive found at the given path.
+     * A byte channel is opened on the path; the default block size and record
+     * size are used, a {@code null} encoding name is passed on and lenient
+     * parsing is switched off.
+     *
+     * <p>NOTE(review): the channel opened here is handed straight to the delegating
+     * constructor; nothing in the constructors closes it when reading the archive
+     * fails - confirm whether that is acceptable.</p>
+     *
+     * @param archivePath the path of the tar archive
+     * @throws IOException when opening or reading the tar archive fails
+     */
+    public TarFile(final Path archivePath) throws IOException {
+        this(
+            Files.newByteChannel(archivePath),
+            TarConstants.DEFAULT_BLKSIZE,
+            TarConstants.DEFAULT_RCDSIZE,
+            null,
+            false);
+    }
+
+    /**
+     * Creates a TarFile for the archive found at the given path, using the
+     * given encoding. A byte channel is opened on the path; the default block
+     * size and record size are used and lenient parsing is switched off.
+     *
+     * <p>NOTE(review): the channel opened here is handed straight to the delegating
+     * constructor; nothing in the constructors closes it when reading the archive
+     * fails - confirm whether that is acceptable.</p>
+     *
+     * @param archivePath the path of the tar archive
+     * @param encoding    the name of the encoding to use
+     * @throws IOException when opening or reading the tar archive fails
+     */
+    public TarFile(final Path archivePath, final String encoding) throws IOException {
+        this(
+            Files.newByteChannel(archivePath),
+            TarConstants.DEFAULT_BLKSIZE,
+            TarConstants.DEFAULT_RCDSIZE,
+            encoding,
+            false);
+    }
+
+    /**
+     * Creates a TarFile for the archive found at the given path, with a
+     * caller-selected leniency. A byte channel is opened on the path; the
+     * default block size and record size are used and a {@code null} encoding
+     * name is passed on.
+     *
+     * <p>NOTE(review): the channel opened here is handed straight to the delegating
+     * constructor; nothing in the constructors closes it when reading the archive
+     * fails - confirm whether that is acceptable.</p>
+     *
+     * @param archivePath the path of the tar archive
+     * @param lenient     when {@code true}, illegal values for group/userid, mode, device numbers and timestamp
+     *                    are ignored and the affected fields are set to {@link TarArchiveEntry#UNKNOWN};
+     *                    when {@code false}, such illegal fields cause an exception instead
+     * @throws IOException when opening or reading the tar archive fails
+     */
+    public TarFile(final Path archivePath, final boolean lenient) throws IOException {
+        this(
+            Files.newByteChannel(archivePath),
+            TarConstants.DEFAULT_BLKSIZE,
+            TarConstants.DEFAULT_RCDSIZE,
+            null,
+            lenient);
+    }
+
+    /**
+     * Creates a TarFile on top of the given channel and eagerly reads the
+     * headers of all entries, collecting them in the order in which they are
+     * encountered.
+     *
+     * @param archive    the seekable byte channel to read the archive from
+     * @param blockSize  the block size to use
+     * @param recordSize the record size to use; a buffer of this size is allocated for reading records
+     * @param encoding   the name of the encoding to use
+     * @param lenient    when {@code true}, illegal values for group/userid, mode, device numbers and timestamp
+     *                   are ignored and the affected fields are set to {@link TarArchiveEntry#UNKNOWN};
+     *                   when {@code false}, such illegal fields cause an exception instead
+     * @throws IOException when reading the tar archive fails
+     */
+    public TarFile(final SeekableByteChannel archive, final int blockSize, final int recordSize, final String encoding, final boolean lenient) throws IOException {
+        this.lenient = lenient;
+        this.blockSize = blockSize;
+        this.archive = archive;
+        this.hasHitEOF = false;
+        // the encoding lookup stays ahead of the buffer allocation, as both may reject their argument
+        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
+        this.recordSize = recordSize;
+        this.recordBuffer = ByteBuffer.allocate(recordSize);
+
+        // index the whole archive up front
+        for (TarArchiveEntry next = getNextTarEntry(); next != null; next = getNextTarEntry()) {
+            entries.add(next);
+        }
+    }
+
+    /**
+     * Reads the header of the next entry in this tar archive.
+     *
+     * <p>If an entry has been read before, the channel is first positioned
+     * behind that entry's data and the record padding is skipped. The next
+     * record is then parsed as an entry header. GNU long link/long name
+     * entries, global and local PAX headers and old GNU sparse entries are
+     * processed before the entry is returned.</p>
+     *
+     * @return the next entry of the archive, or {@code null} when the end of the
+     *         archive has been reached or a long name/long link entry is not
+     *         followed by the data it announces
+     * @throws IOException when reading or parsing the next entry fails
+     */
+    private TarArchiveEntry getNextTarEntry() throws IOException {
+        if (isAtEOF()) {
+            return null;
+        }
+
+        final TarArchiveEntry previous = currEntry;
+        if (previous != null) {
+            // move behind the data of the entry read last, then behind its padding
+            final long endOfData = previous.getDataOffset() + previous.getSize();
+            archive.position(endOfData);
+            throwExceptionIfPositionIsNotInArchive();
+
+            skipRecordPadding();
+        }
+
+        final ByteBuffer header = getRecord();
+        if (header == null) {
+            // no further record: hit EOF
+            currEntry = null;
+            return null;
+        }
+
+        try {
+            final byte[] rawHeader = header.array();
+            final long dataOffset = archive.position();
+            currEntry = new TarArchiveEntry(rawHeader, zipEncoding, lenient, dataOffset);
+        } catch (final IllegalArgumentException ex) {
+            throw new IOException("Error detected parsing the header", ex);
+        }
+
+        if (currEntry.isGNULongLinkEntry()) {
+            final byte[] linkBytes = getLongNameData();
+            if (linkBytes == null) {
+                // Bugzilla: 40334
+                // Malformed tar file - the long link entry name is not
+                // followed by an entry
+                return null;
+            }
+            final String linkName = zipEncoding.decode(linkBytes);
+            currEntry.setLinkName(linkName);
+        }
+
+        if (currEntry.isGNULongNameEntry()) {
+            final byte[] nameBytes = getLongNameData();
+            if (nameBytes == null) {
+                // Bugzilla: 40334
+                // Malformed tar file - the long entry name is not followed
+                // by an entry
+                return null;
+            }
+
+            // COMPRESS-509 : the name of directories should end with '/'
+            final String longName = zipEncoding.decode(nameBytes);
+            currEntry.setName(longName);
+            final boolean slashMissing = currEntry.isDirectory() && !longName.endsWith("/");
+            if (slashMissing) {
+                currEntry.setName(longName + "/");
+            }
+        }
+
+        // Global PAX headers
+        if (currEntry.isGlobalPaxHeader()) {
+            readGlobalPaxHeaders();
+        }
+
+        try {
+            if (!currEntry.isPaxHeader()) {
+                // not a PAX header itself: the global PAX headers, if any, apply
+                if (!globalPaxHeaders.isEmpty()) {
+                    applyPaxHeadersToCurrentEntry(globalPaxHeaders, globalSparseHeaders);
+                }
+            } else {
+                // PAX headers
+                paxHeaders();
+            }
+        } catch (NumberFormatException ex) {
+            throw new IOException("Error detected parsing the pax header", ex);
+        }
+
+        // sparse files
+        if (currEntry.isOldGNUSparse()) {
+            readOldGNUSparse();
+        }
+
+        return currEntry;
+    }
+
+    /**
+     * Adds the sparse chunks from the current entry to the sparse chunks,
+     * including any additional sparse entries following the current entry.
+     *
+     * <p>As long as the current entry, or the extension record read last, is
+     * flagged as extended, another record is read and parsed as a sparse
+     * extension header. Its sparse headers are appended to the current entry,
+     * and the entry's data offset is moved forward by one record because the
+     * extension record sits between the entry header and the entry data.
+     * Afterwards the sparse input streams of the current entry are built.</p>
+     *
+     * @throws IOException when reading the sparse entry fails, in particular when
+     *                     the archive ends before an announced extension record
+     */
+    private void readOldGNUSparse() throws IOException {
+        if (currEntry.isExtended()) {
+            TarArchiveSparseEntry entry;
+            do {
+                final ByteBuffer headerBuf = getRecord();
+                if (headerBuf == null) {
+                    // The archive is truncated. Fail with a descriptive error instead of
+                    // clearing currEntry and dereferencing it while building the streams.
+                    throw new IOException("premature end of tar archive. Didn't find extended_header after header with extended flag.");
+                }
+                entry = new TarArchiveSparseEntry(headerBuf.array());
+                currEntry.getSparseHeaders().addAll(entry.getSparseHeaders());
+                currEntry.setDataOffset(currEntry.getDataOffset() + recordSize);
+            } while (entry.isExtended());
+        }
+
+        // sparse headers are all done reading, we need to build
+        // sparse input streams using these sparse headers
+        buildSparseInputStreams();
+    }
+
+    /**
+     * Build the input streams consisting of all-zero input streams and non-zero input streams.
+     * When reading from the non-zero input streams, the data is actually read from the archive channel.
+     * The size of each input stream is introduced by the sparse headers.
+     *
+     * <p>The sparse headers of the current entry are sorted by offset in place before they are
+     * processed. The resulting list of streams is registered under the name of the current entry.</p>
+     *
+     * NOTE : Some all-zero input streams and non-zero input streams have the size of 0. We DO NOT store the
+     *        0 size input streams because they are meaningless.
+     *
+     * @throws IOException when a sparse header starts before the end of the preceding one
+     */
+    private void buildSparseInputStreams() throws IOException {
+        final List<InputStream> streams = new ArrayList<>();
+
+        final List<TarArchiveStructSparse> sparseHeaders = currEntry.getSparseHeaders();
+        if (sparseHeaders != null) {
+            // sort the sparse headers in case they are written in wrong order
+            if (sparseHeaders.size() > 1) {
+                Collections.sort(sparseHeaders, new Comparator<TarArchiveStructSparse>() {
+                    @Override
+                    public int compare(final TarArchiveStructSparse p, final TarArchiveStructSparse q) {
+                        // compare the primitive offsets directly, no boxing required
+                        return Long.compare(p.getOffset(), q.getOffset());
+                    }
+                });
+            }
+
+            // Stream doesn't need to be closed at all as it doesn't use any resources
+            final InputStream zeroInputStream = new TarArchiveSparseZeroInputStream(); //NOSONAR
+            // logical position in the expanded file right behind the chunk handled last
+            long offset = 0;
+            // zero bytes are not stored in the archive and must not count towards the position in the channel
+            long numberOfZeroBytesInSparseEntry = 0;
+            for (final TarArchiveStructSparse sparseHeader : sparseHeaders) {
+                final long headerOffset = sparseHeader.getOffset();
+                final long numBytes = sparseHeader.getNumbytes();
+                if (headerOffset == 0 && numBytes == 0) {
+                    break;
+                }
+
+                final long sizeOfZeroByteStream = headerOffset - offset;
+                if (sizeOfZeroByteStream < 0) {
+                    throw new IOException("Corrupted struct sparse detected");
+                }
+
+                // only store the input streams with non-zero size
+                if (sizeOfZeroByteStream > 0) {
+                    streams.add(new BoundedInputStream(zeroInputStream, sizeOfZeroByteStream));
+                    numberOfZeroBytesInSparseEntry += sizeOfZeroByteStream;
+                }
+
+                // only store the input streams with non-zero size
+                if (numBytes > 0) {
+                    final long start = currEntry.getDataOffset() + headerOffset - numberOfZeroBytesInSparseEntry;
+                    streams.add(new BoundedSeekableByteChannelInputStream(start, numBytes, archive));
+                }
+
+                offset = headerOffset + numBytes;
+            }
+        }
+
+        sparseInputStreams.put(currEntry.getName(), streams);
+    }
+
+    /**
+     * Update the current entry with the read pax headers
+     * @param headers Headers read from the pax header
+     * @param sparseHeaders Sparse headers read from pax header
+     */
+    private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers, final List<TarArchiveStructSparse> sparseHeaders) {
+        currEntry.updateEntryFromPaxHeaders(headers);
+        currEntry.setSparseHeaders(sparseHeaders);
+    }
+
+    /**
+     * For PAX Format 0.0, the sparse headers (GNU.sparse.offset and GNU.sparse.numbytes)
+     * may appear multiple times, and they look like:
+     *
+     * GNU.sparse.size=size
+     * GNU.sparse.numblocks=numblocks
+     * repeat numblocks times
+     *   GNU.sparse.offset=offset
+     *   GNU.sparse.numbytes=numbytes
+     * end repeat
+     *
+     *
+     * For PAX Format 0.1, the sparse headers are stored in a single variable : GNU.sparse.map
+     *
+     * GNU.sparse.map
+     *    Map of non-null data chunks. It is a string consisting of comma-separated values "offset,size[,offset-1,size-1...]"
+     *
+     *
+     * For PAX Format 1.X:
+     * The sparse map itself is stored in the file data block, preceding the actual file data.
+     * It consists of a series of decimal numbers delimited by newlines. The map is padded with nulls to the nearest block boundary.
+     * The first number gives the number of entries in the map. Following are map entries, each one consisting of two numbers
+     * giving the offset and size of the data block it describes.

Review comment:
       I formatted the Javadoc with HTML tags. It should be easier to read now when it is rendered.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org