You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by bo...@apache.org on 2019/12/15 11:10:51 UTC

[commons-compress] 01/05: COMPRESS-477 support for splitted zip files

This is an automated email from the ASF dual-hosted git repository.

bodewig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-compress.git

commit be2d130862163cfa8597744140830c3f7bb3a5ed
Author: Lee <pe...@gmail.com>
AuthorDate: Thu Nov 7 14:48:03 2019 +0800

    COMPRESS-477 support for splitted zip files
    
    add support for constructing splitted zip files
---
 .../compress/archivers/zip/ZipArchiveEntry.java    |  10 +-
 .../archivers/zip/ZipArchiveOutputStream.java      | 168 ++++++++++++++--
 .../archivers/zip/ZipSplitOutputStream.java        | 219 +++++++++++++++++++++
 .../commons/compress/compressors/FileNameUtil.java |  15 ++
 4 files changed, 393 insertions(+), 19 deletions(-)

diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
index 30f8479..bcf3459 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
@@ -144,7 +144,7 @@ public class ZipArchiveEntry extends java.util.zip.ZipEntry
     private boolean isStreamContiguous = false;
     private NameSource nameSource = NameSource.NAME;
     private CommentSource commentSource = CommentSource.COMMENT;
-
+    private int diskNumberStart;
 
     /**
      * Creates a new zip entry with the specified name.
@@ -1080,6 +1080,14 @@ public class ZipArchiveEntry extends java.util.zip.ZipEntry
         this.commentSource = commentSource;
     }
 
+    public int getDiskNumberStart() {
+        return diskNumberStart;
+    }
+
+    public void setDiskNumberStart(int diskNumberStart) {
+        this.diskNumberStart = diskNumberStart;
+    }
+
     private ZipExtraField[] copyOf(final ZipExtraField[] src, final int length) {
         final ZipExtraField[] cpy = new ZipExtraField[length];
         System.arraycopy(src, 0, cpy, 0, Math.min(src.length, length));
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveOutputStream.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveOutputStream.java
index c40aab2..962e60b 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveOutputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveOutputStream.java
@@ -190,6 +190,16 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
     private long cdLength = 0;
 
     /**
+     * Disk number start of central directory.
+     */
+    private long cdDiskNumberStart = 0;
+
+    /**
+     * Length of end of central directory
+     */
+    private long eocdLength = 0;
+
+    /**
      * Helper, a 0 as ZipShort.
      */
     private static final byte[] ZERO = {0, 0};
@@ -267,6 +277,17 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
     private final Calendar calendarInstance = Calendar.getInstance();
 
     /**
+     * Whether we are creating a split zip
+     */
+    private boolean isSplitZip = false;
+
+    /**
+     * Holds the number of Central Directories on each disk, this is used
+     * when writing Zip64 End Of Central Directory and End Of Central Directory
+     */
+    private final Map<Integer, Integer> numberOfCDInDiskData = new HashMap<>();
+
+    /**
      * Creates a new ZIP OutputStream filtering the underlying stream.
      * @param out the outputstream to zip
      */
@@ -306,6 +327,14 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
         streamCompressor = _streamCompressor;
     }
 
+    public ZipArchiveOutputStream(final File file, final long zipSplitSize) throws IOException {
+        def = new Deflater(level, true);
+        this.out = new ZipSplitOutputStream(file, zipSplitSize);
+        streamCompressor = StreamCompressor.create(this.out, def);
+        isSplitZip = true;
+        channel = null;
+    }
+
     /**
      * Creates a new ZIP OutputStream writing to a SeekableByteChannel.
      *
@@ -467,15 +496,41 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
             throw new IOException("This archive contains unclosed entries.");
         }
 
-        cdOffset = streamCompressor.getTotalBytesWritten();
+        long cdOverallOffset = streamCompressor.getTotalBytesWritten();
+        cdOffset = cdOverallOffset;
+        if (isSplitZip) {
+            // when creating a split zip, the offset should be
+            // the offset to the corresponding segment disk
+            ZipSplitOutputStream zipSplitOutputStream = (ZipSplitOutputStream)this.out;
+            cdOffset = zipSplitOutputStream.getCurrentSplitSegmentBytesWritten();
+            cdDiskNumberStart = zipSplitOutputStream.getCurrentSplitSegmentIndex();
+        }
         writeCentralDirectoryInChunks();
 
-        cdLength = streamCompressor.getTotalBytesWritten() - cdOffset;
+        cdLength = streamCompressor.getTotalBytesWritten() - cdOverallOffset;
+
+        // calculate the length of end of central directory, as it may be used in writeZip64CentralDirectory
+        final ByteBuffer commentData = this.zipEncoding.encode(comment);
+        final int commentLength = commentData.limit() - commentData.position();
+        eocdLength = WORD /* length of EOCD_SIG */
+                + SHORT /* number of this disk */
+                + SHORT /* disk number of start of central directory */
+                + SHORT /* total number of entries on this disk */
+                + SHORT /* total number of entries */
+                + WORD  /* size of central directory */
+                + WORD  /* offset of start of central directory */
+                + SHORT /* zip comment length */
+                + commentLength /* zip comment */;
+
         writeZip64CentralDirectory();
         writeCentralDirectoryEnd();
         metaData.clear();
         entries.clear();
         streamCompressor.close();
+        if (isSplitZip) {
+            // trigger the ZipSplitOutputStream to write the final split segment
+            out.close();
+        }
         finished = true;
     }
 
@@ -1036,7 +1091,15 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
             addUnicodeExtraFields(ze, encodable, name);
         }
 
-        final long localHeaderStart = streamCompressor.getTotalBytesWritten();
+        long localHeaderStart = streamCompressor.getTotalBytesWritten();
+        if (isSplitZip) {
+            // when creating a split zip, the offset should be
+            // the offset to the corresponding segment disk
+            ZipSplitOutputStream splitOutputStream = (ZipSplitOutputStream)this.out;
+            ze.setDiskNumberStart(splitOutputStream.getCurrentSplitSegmentIndex());
+            localHeaderStart = splitOutputStream.getCurrentSplitSegmentBytesWritten();
+        }
+
         final byte[] localHeader = createLocalFileHeader(ze, name, encodable, phased, localHeaderStart);
         metaData.put(ze, new EntryMetaData(localHeaderStart, usesDataDescriptor(ze.getMethod(), phased)));
         entry.localDataStart = localHeaderStart + LFH_CRC_OFFSET; // At crc offset
@@ -1235,6 +1298,16 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
     private byte[] createCentralFileHeader(final ZipArchiveEntry ze, final ByteBuffer name,
                                            final EntryMetaData entryMetaData,
                                            final boolean needsZip64Extra) throws IOException {
+        if(isSplitZip) {
+            int currentSplitSegment = ((ZipSplitOutputStream)this.out).getCurrentSplitSegmentIndex();
+            if(numberOfCDInDiskData.get(currentSplitSegment) == null) {
+                numberOfCDInDiskData.put(currentSplitSegment, 1);
+            } else {
+                int originalNumberOfCD = numberOfCDInDiskData.get(currentSplitSegment);
+                numberOfCDInDiskData.put(currentSplitSegment, originalNumberOfCD + 1);
+            }
+        }
+
         final byte[] extra = ze.getCentralDirectoryExtra();
 
         // file comment length
@@ -1291,7 +1364,11 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
         putShort(commentLen, buf, CFH_COMMENT_LENGTH_OFFSET);
 
         // disk number start
-        System.arraycopy(ZERO, 0, buf, CFH_DISK_NUMBER_OFFSET, SHORT);
+        if(isSplitZip) {
+            putShort(ze.getDiskNumberStart(), buf, CFH_DISK_NUMBER_OFFSET);
+        } else {
+            System.arraycopy(ZERO, 0, buf, CFH_DISK_NUMBER_OFFSET, SHORT);
+        }
 
         // internal file attributes
         putShort(ze.getInternalAttributes(), buf, CFH_INTERNAL_ATTRIBUTES_OFFSET);
@@ -1352,11 +1429,21 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
      * and {@link Zip64Mode #setUseZip64} is {@link Zip64Mode#Never}.
      */
     protected void writeCentralDirectoryEnd() throws IOException {
+        if(isSplitZip && !hasUsedZip64) { // this.out is a ZipSplitOutputStream only when a split zip is written
+            ((ZipSplitOutputStream)this.out).prepareToWriteUnsplittableContent(eocdLength);
+        }
+
         writeCounted(EOCD_SIG);
 
-        // disk numbers
-        writeCounted(ZERO);
-        writeCounted(ZERO);
+        // number of this disk
+        int numberOfThisDisk = 0;
+        if(isSplitZip) {
+            numberOfThisDisk = ((ZipSplitOutputStream)this.out).getCurrentSplitSegmentIndex();
+        }
+        writeCounted(ZipShort.getBytes(numberOfThisDisk));
+
+        // disk number of the start of central directory
+        writeCounted(ZipShort.getBytes((int)cdDiskNumberStart));
 
         // number of entries
         final int numberOfEntries = entries.size();
@@ -1370,9 +1457,15 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
                                              .ARCHIVE_TOO_BIG_MESSAGE);
         }
 
+        // total number of entries in the central directory on this disk
+        int numOfEntriesOnThisDisk = numberOfCDInDiskData.get(numberOfThisDisk) == null ? 0 : numberOfCDInDiskData.get(numberOfThisDisk);
+        final byte[] numOfEntriesOnThisDiskData = ZipShort
+                .getBytes(Math.min(numOfEntriesOnThisDisk, ZIP64_MAGIC_SHORT));
+        writeCounted(numOfEntriesOnThisDiskData);
+
+        // number of entries
         final byte[] num = ZipShort.getBytes(Math.min(numberOfEntries,
-                                                ZIP64_MAGIC_SHORT));
-        writeCounted(num);
+                ZIP64_MAGIC_SHORT));
         writeCounted(num);
 
         // length and location of CD
@@ -1408,11 +1501,20 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
             return;
         }
 
-        final long offset = streamCompressor.getTotalBytesWritten();
+        long offset = streamCompressor.getTotalBytesWritten();
+        long diskNumberStart = 0L;
+        if(isSplitZip) {
+            // when creating a split zip, the offset should be
+            // the offset to the corresponding segment disk
+            ZipSplitOutputStream zipSplitOutputStream = (ZipSplitOutputStream)this.out;
+            offset = zipSplitOutputStream.getCurrentSplitSegmentBytesWritten();
+            diskNumberStart = zipSplitOutputStream.getCurrentSplitSegmentIndex();
+        }
+
 
         writeOut(ZIP64_EOCD_SIG);
-        // size, we don't have any variable length as we don't support
-        // the extensible data sector, yet
+        // size of zip64 end of central directory, we don't have any variable length
+        // as we don't support the extensible data sector, yet
         writeOut(ZipEightByteInteger
                  .getBytes(SHORT   /* version made by */
                            + SHORT /* version needed to extract */
@@ -1428,14 +1530,25 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
         writeOut(ZipShort.getBytes(ZIP64_MIN_VERSION));
         writeOut(ZipShort.getBytes(ZIP64_MIN_VERSION));
 
-        // disk numbers - four bytes this time
-        writeOut(LZERO);
-        writeOut(LZERO);
+        // number of this disk
+        long numberOfThisDisk = 0;
+        if (isSplitZip) {
+            numberOfThisDisk = ((ZipSplitOutputStream)this.out).getCurrentSplitSegmentIndex();
+        }
+        writeOut(ZipLong.getBytes(numberOfThisDisk));
+
+        // disk number of the start of central directory
+        writeOut(ZipLong.getBytes(cdDiskNumberStart));
+
+        // total number of entries in the central directory on this disk (8 bytes here, so no clipping to 0xFFFF)
+        // the map is keyed by Integer: looking it up with the long disk number would box to Long and never match
+        final Integer cdEntriesOnThisDisk = numberOfCDInDiskData.get((int) numberOfThisDisk);
+        final int numOfEntriesOnThisDisk = cdEntriesOnThisDisk == null ? 0 : cdEntriesOnThisDisk;
+        writeOut(ZipEightByteInteger.getBytes(numOfEntriesOnThisDisk));
 
         // number of entries
         final byte[] num = ZipEightByteInteger.getBytes(entries.size());
         writeOut(num);
-        writeOut(num);
 
         // length and location of CD
         writeOut(ZipEightByteInteger.getBytes(cdLength));
@@ -1443,15 +1556,34 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
 
         // no "zip64 extensible data sector" for now
 
+        if(isSplitZip) {
+            // based on the zip specification, the End Of Central Directory record and
+            // the Zip64 End Of Central Directory locator record must be on the same segment
+            long zip64EOCDLOCLength = WORD  /* length of ZIP64_EOCD_LOC_SIG */
+                    + WORD  /* disk number of ZIP64_EOCD_SIG */
+                    + DWORD /* offset of ZIP64_EOCD_SIG */
+                    + WORD  /* total number of disks */;
+
+            long unsplittableContentSize = zip64EOCDLOCLength + eocdLength;
+            ((ZipSplitOutputStream)this.out).prepareToWriteUnsplittableContent(unsplittableContentSize);
+        }
+
         // and now the "ZIP64 end of central directory locator"
         writeOut(ZIP64_EOCD_LOC_SIG);
 
         // disk number holding the ZIP64 EOCD record
-        writeOut(LZERO);
+        writeOut(ZipLong.getBytes(diskNumberStart));
         // relative offset of ZIP64 EOCD record
         writeOut(ZipEightByteInteger.getBytes(offset));
         // total number of disks
-        writeOut(ONE);
+        if(isSplitZip) {
+            // the Zip64 End Of Central Directory Locator and the End Of Central Directory must be
+            // in the same split disk, it means they must be located in the last disk
+            long totalNumberOfDisks = ((ZipSplitOutputStream)this.out).getCurrentSplitSegmentIndex() + 1;
+            writeOut(ZipLong.getBytes(totalNumberOfDisks));
+        } else {
+            writeOut(ONE);
+        }
     }
 
     /**
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipSplitOutputStream.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipSplitOutputStream.java
new file mode 100644
index 0000000..80a896d
--- /dev/null
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipSplitOutputStream.java
@@ -0,0 +1,219 @@
+package org.apache.commons.compress.archivers.zip;
+
+import org.apache.commons.compress.compressors.FileNameUtil;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+
+/**
+ * Writes the bytes of a split zip archive to a sequence of segment files, starting a new file once a segment is full.
+ */
+public class ZipSplitOutputStream extends OutputStream {
+    /**
+     * 8.5.1 Capacities for split archives are as follows:
+     * <p>
+     * Maximum number of segments = 4,294,967,295 - 1
+     * Maximum .ZIP segment size = 4,294,967,295 bytes (refer to section 8.5.6)
+     * Minimum segment size = 64K
+     * Maximum PKSFX segment size = 2,147,483,647 bytes
+     */
+    private static final long ZIP_SEGMENT_MIN_SIZE = 64 * 1024L;
+    private static final long ZIP_SEGMENT_MAX_SIZE = 4294967295L;
+
+    private OutputStream outputStream;
+    private File zipFile;
+    private final long splitSize;
+    private int currentSplitSegmentIndex = 0;
+    private long currentSplitSegmentBytesWritten = 0;
+    private boolean finished = false;
+
+    /**
+     * Create a split zip. If the zip file is smaller than the split size,
+     * then there will only be one split zip, and its suffix is .zip,
+     * otherwise the split segments should be like .z01, .z02, ... .z(N-1), .zip
+     *
+     * @param zipFile   the zip file to write to
+     * @param splitSize the split size
+     * @throws IllegalArgumentException if splitSize is not between 64K and 4,294,967,295
+     * @throws IOException if the file cannot be opened or the split signature cannot be written
+     */
+    public ZipSplitOutputStream(final File zipFile, final long splitSize) throws IllegalArgumentException, IOException {
+        if (splitSize < ZIP_SEGMENT_MIN_SIZE || splitSize > ZIP_SEGMENT_MAX_SIZE) {
+            throw new IllegalArgumentException("zip split segment size should between 64K and 4,294,967,295");
+        }
+
+        this.zipFile = zipFile;
+        this.splitSize = splitSize;
+
+        this.outputStream = new FileOutputStream(zipFile);
+        // write the zip split signature 0x08074B50 to the zip file
+        writeZipSplitSignature();
+    }
+
+    /**
+     * Some data can not be written to different split segments, for example:
+     * <p>
+     * 4.4.1.5  The end of central directory record and the Zip64 end
+     * of central directory locator record MUST reside on the same
+     * disk when splitting or spanning an archive.
+     *
+     * @param unsplittableContentSize number of bytes that must end up in a single segment
+     * @throws IllegalArgumentException if the size is bigger than the split segment size
+     * @throws IOException if the current segment is too full and opening a new one fails
+     */
+    public void prepareToWriteUnsplittableContent(long unsplittableContentSize) throws IllegalArgumentException, IOException {
+        if (unsplittableContentSize > this.splitSize) {
+            throw new IllegalArgumentException("The unsplittable content size is bigger than the split segment size");
+        }
+
+        long bytesRemainingInThisSegment = this.splitSize - this.currentSplitSegmentBytesWritten;
+        if (bytesRemainingInThisSegment < unsplittableContentSize) {
+            openNewSplitSegment();
+        }
+    }
+
+    /**
+     * Writes one byte, the eight low-order bits of {@code i}, as the {@link OutputStream#write(int)} contract demands.
+     */
+    @Override
+    public void write(int i) throws IOException {
+        write(new byte[] {(byte) (i & 0xff)});
+    }
+
+    @Override
+    public void write(byte[] b) throws IOException {
+        write(b, 0, b.length);
+    }
+
+    /**
+     * Write the data to zip split segments, if the remaining space of current split segment
+     * is not enough, then a new split segment should be created
+     *
+     * @param b   data to write
+     * @param off offset of the start of data in param b
+     * @param len the length of data to write
+     * @throws IOException if writing fails or a new segment cannot be opened
+     */
+    @Override
+    public void write(byte[] b, int off, int len) throws IOException {
+        // loop rather than recurse: a buffer spanning many segments must not deepen the call stack
+        while (len > 0) {
+            if (currentSplitSegmentBytesWritten >= splitSize) {
+                openNewSplitSegment();
+            }
+            final int chunk = (int) Math.min(len, splitSize - currentSplitSegmentBytesWritten);
+            outputStream.write(b, off, chunk);
+            currentSplitSegmentBytesWritten += chunk;
+            off += chunk;
+            len -= chunk;
+        }
+    }
+
+    @Override
+    public void close() throws IOException {
+        if (!finished) {
+            finish();
+        }
+    }
+
+    /**
+     * The last zip split segment's suffix should be .zip
+     *
+     * @throws IOException if the stream cannot be closed or the last segment cannot be renamed
+     */
+    private void finish() throws IOException {
+        if (finished) {
+            throw new IOException("This archive has already been finished");
+        }
+        String zipFileBaseName = FileNameUtil.getBaseName(zipFile.getName());
+        File lastZipSplitSegmentFile = new File(zipFile.getParentFile(), zipFileBaseName + ".zip");
+        outputStream.close();
+        // an archive that was never split may already carry the final name, nothing to rename then
+        if (!zipFile.equals(lastZipSplitSegmentFile) && !zipFile.renameTo(lastZipSplitSegmentFile)) {
+            throw new IOException("Failed to rename " + zipFile + " to " + lastZipSplitSegmentFile);
+        }
+        finished = true;
+    }
+
+    /**
+     * Create a new zip split segment and prepare to write to the new segment
+     *
+     * @return the stream writing to the new segment
+     * @throws IOException if the current segment cannot be closed or renamed, or the new one cannot be created
+     */
+    private OutputStream openNewSplitSegment() throws IOException {
+        outputStream.close();
+        if (currentSplitSegmentIndex == 0) {
+            // so far the data went to the file handed to the constructor, which now becomes the .z01 segment
+            final File firstSegmentFile = createNewSplitSegmentFile(1);
+            if (!zipFile.renameTo(firstSegmentFile)) {
+                throw new IOException("Failed to rename " + zipFile + " to " + firstSegmentFile);
+            }
+        }
+
+        final File newFile = createNewSplitSegmentFile(null);
+        outputStream = new FileOutputStream(newFile);
+        currentSplitSegmentBytesWritten = 0;
+        zipFile = newFile;
+        currentSplitSegmentIndex++;
+        return outputStream;
+    }
+
+    /**
+     * Write the zip split signature (0x08074B50) to the head of the first zip split segment
+     *
+     * @throws IOException if writing fails
+     */
+    private void writeZipSplitSignature() throws IOException {
+        outputStream.write(ZipArchiveOutputStream.DD_SIG);
+        currentSplitSegmentBytesWritten += ZipArchiveOutputStream.DD_SIG.length;
+    }
+
+    /**
+     * Create the new zip split segment, the last zip segment should be .zip, and the zip split segments' suffix should be
+     * like .z01, .z02, .z03, ... .z99, .z100, ..., .z(N-1), .zip
+     * <p>
+     * 8.3.3 Split ZIP files are typically written to the same location
+     * and are subject to name collisions if the spanned name
+     * format is used since each segment will reside on the same
+     * drive. To avoid name collisions, split archives are named
+     * as follows.
+     * <p>
+     * Segment 1   = filename.z01
+     * Segment n-1 = filename.z(n-1)
+     * Segment n   = filename.zip
+     * <p>
+     * NOTE:
+     * The zip split segment begin from 1,2,3,... , and we're creating a new segment,
+     * so the new segment suffix should be (currentSplitSegmentIndex + 2)
+     *
+     * @param zipSplitSegmentSuffixIndex the suffix number to use, or null to derive it from the current segment index
+     * @return the file for the new segment, which does not exist yet
+     * @throws IOException if a file with the new segment's name already exists
+     */
+    private File createNewSplitSegmentFile(Integer zipSplitSegmentSuffixIndex) throws IOException {
+        int newZipSplitSegmentSuffixIndex = zipSplitSegmentSuffixIndex == null ? (currentSplitSegmentIndex + 2) : zipSplitSegmentSuffixIndex;
+        String baseName = FileNameUtil.getBaseName(zipFile.getName());
+        String extension = ".z";
+        if (newZipSplitSegmentSuffixIndex <= 9) {
+            extension += "0" + newZipSplitSegmentSuffixIndex;
+        } else {
+            extension += newZipSplitSegmentSuffixIndex;
+        }
+
+        // getParentFile() is null for a bare file name; File copes with that, string concatenation gave "null/..."
+        File newFile = new File(zipFile.getParentFile(), baseName + extension);
+        if (newFile.exists()) {
+            throw new IOException("split zip segment " + newFile + " already exists");
+        }
+        return newFile;
+    }
+
+    public int getCurrentSplitSegmentIndex() {
+        return currentSplitSegmentIndex;
+    }
+
+    public long getCurrentSplitSegmentBytesWritten() {
+        return currentSplitSegmentBytesWritten;
+    }
+}
diff --git a/src/main/java/org/apache/commons/compress/compressors/FileNameUtil.java b/src/main/java/org/apache/commons/compress/compressors/FileNameUtil.java
index cc69031..570b12e 100644
--- a/src/main/java/org/apache/commons/compress/compressors/FileNameUtil.java
+++ b/src/main/java/org/apache/commons/compress/compressors/FileNameUtil.java
@@ -193,4 +193,19 @@ public class FileNameUtil {
         return fileName + defaultExtension;
     }
 
+    /**
+     * Returns the last path segment of {@code filename} with everything from its last dot on removed.
+     *
+     * @param filename a file name or path using '/' or '\' as separator, may be {@code null}
+     * @return the base name, or {@code null} if {@code filename} is {@code null}
+     */
+    public static String getBaseName(String filename) {
+        if (filename == null) {
+            return null;
+        }
+        final int separatorIndex = Math.max(filename.lastIndexOf('/'), filename.lastIndexOf('\\'));
+        final String lastSegment = filename.substring(separatorIndex + 1);
+        final int dotIndex = lastSegment.lastIndexOf('.');
+        return dotIndex < 0 ? lastSegment : lastSegment.substring(0, dotIndex);
+    }
 }