You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by bo...@apache.org on 2019/12/15 11:10:51 UTC
[commons-compress] 01/05: COMPRESS-477 support for splitted zip
files
This is an automated email from the ASF dual-hosted git repository.
bodewig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-compress.git
commit be2d130862163cfa8597744140830c3f7bb3a5ed
Author: Lee <pe...@gmail.com>
AuthorDate: Thu Nov 7 14:48:03 2019 +0800
COMPRESS-477 support for splitted zip files
Add support for constructing split zip files.
---
.../compress/archivers/zip/ZipArchiveEntry.java | 10 +-
.../archivers/zip/ZipArchiveOutputStream.java | 168 ++++++++++++++--
.../archivers/zip/ZipSplitOutputStream.java | 219 +++++++++++++++++++++
.../commons/compress/compressors/FileNameUtil.java | 15 ++
4 files changed, 393 insertions(+), 19 deletions(-)
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
index 30f8479..bcf3459 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveEntry.java
@@ -144,7 +144,7 @@ public class ZipArchiveEntry extends java.util.zip.ZipEntry
private boolean isStreamContiguous = false;
private NameSource nameSource = NameSource.NAME;
private CommentSource commentSource = CommentSource.COMMENT;
-
+ private int diskNumberStart;
/**
* Creates a new zip entry with the specified name.
@@ -1080,6 +1080,14 @@ public class ZipArchiveEntry extends java.util.zip.ZipEntry
this.commentSource = commentSource;
}
+ /**
+ * Returns the number of the split segment (disk) this entry starts on.
+ *
+ * <p>When a split archive is created, this value is written to the
+ * "disk number start" field of the entry's central directory header.</p>
+ *
+ * @return the number of the split segment this entry starts on
+ */
+ public int getDiskNumberStart() {
+ return diskNumberStart;
+ }
+
+ /**
+ * Sets the number of the split segment (disk) this entry starts on.
+ *
+ * <p>When a split archive is created, the archive output stream sets this
+ * to the index of the segment that receives the entry's local file header.</p>
+ *
+ * @param diskNumberStart the number of the split segment this entry starts on
+ */
+ public void setDiskNumberStart(int diskNumberStart) {
+ this.diskNumberStart = diskNumberStart;
+ }
+
private ZipExtraField[] copyOf(final ZipExtraField[] src, final int length) {
final ZipExtraField[] cpy = new ZipExtraField[length];
System.arraycopy(src, 0, cpy, 0, Math.min(src.length, length));
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveOutputStream.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveOutputStream.java
index c40aab2..962e60b 100644
--- a/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveOutputStream.java
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipArchiveOutputStream.java
@@ -190,6 +190,16 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
private long cdLength = 0;
/**
+ * Disk number start of central directory.
+ */
+ private long cdDiskNumberStart = 0;
+
+ /**
+ * Length of end of central directory
+ */
+ private long eocdLength = 0;
+
+ /**
* Helper, a 0 as ZipShort.
*/
private static final byte[] ZERO = {0, 0};
@@ -267,6 +277,17 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
private final Calendar calendarInstance = Calendar.getInstance();
/**
+ * Whether we are creating a split zip
+ */
+ private boolean isSplitZip = false;
+
+ /**
+ * Holds the number of Central Directories on each disk, this is used
+ * when writing Zip64 End Of Central Directory and End Of Central Directory
+ */
+ private final Map<Integer, Integer> numberOfCDInDiskData = new HashMap<>();
+
+ /**
* Creates a new ZIP OutputStream filtering the underlying stream.
* @param out the outputstream to zip
*/
@@ -306,6 +327,14 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
streamCompressor = _streamCompressor;
}
+ /**
+ * Creates a split ZIP archive.
+ *
+ * <p>All data is written through a {@link ZipSplitOutputStream} that is
+ * created for the given file and segment size; no channel is used, so
+ * the archive is written strictly sequentially.</p>
+ *
+ * @param file the file handed to the {@link ZipSplitOutputStream} to start writing to
+ * @param zipSplitSize the size of a split segment in bytes
+ * @throws IllegalArgumentException if {@link ZipSplitOutputStream} rejects
+ * the segment size, it accepts sizes between 64K and 4,294,967,295 bytes
+ * @throws IOException on error
+ */
+ public ZipArchiveOutputStream(final File file, final long zipSplitSize) throws IOException {
+ def = new Deflater(level, true);
+ this.out = new ZipSplitOutputStream(file, zipSplitSize);
+ streamCompressor = StreamCompressor.create(this.out, def);
+ // remembered so that offsets and disk numbers are calculated per split segment
+ isSplitZip = true;
+ channel = null;
+ }
+
/**
* Creates a new ZIP OutputStream writing to a SeekableByteChannel.
*
@@ -467,15 +496,41 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
throw new IOException("This archive contains unclosed entries.");
}
- cdOffset = streamCompressor.getTotalBytesWritten();
+ long cdOverallOffset = streamCompressor.getTotalBytesWritten();
+ cdOffset = cdOverallOffset;
+ if (isSplitZip) {
+ // when creating a split zip, the offset should be
+ // the offset to the corresponding segment disk
+ ZipSplitOutputStream zipSplitOutputStream = (ZipSplitOutputStream)this.out;
+ cdOffset = zipSplitOutputStream.getCurrentSplitSegmentBytesWritten();
+ cdDiskNumberStart = zipSplitOutputStream.getCurrentSplitSegmentIndex();
+ }
writeCentralDirectoryInChunks();
- cdLength = streamCompressor.getTotalBytesWritten() - cdOffset;
+ cdLength = streamCompressor.getTotalBytesWritten() - cdOverallOffset;
+
+ // calculate the length of end of central directory, as it may be used in writeZip64CentralDirectory
+ final ByteBuffer commentData = this.zipEncoding.encode(comment);
+ final int commentLength = commentData.limit() - commentData.position();
+ eocdLength = WORD /* length of EOCD_SIG */
+ + SHORT /* number of this disk */
+ + SHORT /* disk number of start of central directory */
+ + SHORT /* total number of entries on this disk */
+ + SHORT /* total number of entries */
+ + WORD /* size of central directory */
+ + WORD /* offset of start of central directory */
+ + SHORT /* zip comment length */
+ + commentLength /* zip comment */;
+
writeZip64CentralDirectory();
writeCentralDirectoryEnd();
metaData.clear();
entries.clear();
streamCompressor.close();
+ if (isSplitZip) {
+ // trigger the ZipSplitOutputStream to write the final split segment
+ out.close();
+ }
finished = true;
}
@@ -1036,7 +1091,15 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
addUnicodeExtraFields(ze, encodable, name);
}
- final long localHeaderStart = streamCompressor.getTotalBytesWritten();
+ long localHeaderStart = streamCompressor.getTotalBytesWritten();
+ if (isSplitZip) {
+ // when creating a split zip, the offset should be
+ // the offset to the corresponding segment disk
+ ZipSplitOutputStream splitOutputStream = (ZipSplitOutputStream)this.out;
+ ze.setDiskNumberStart(splitOutputStream.getCurrentSplitSegmentIndex());
+ localHeaderStart = splitOutputStream.getCurrentSplitSegmentBytesWritten();
+ }
+
final byte[] localHeader = createLocalFileHeader(ze, name, encodable, phased, localHeaderStart);
metaData.put(ze, new EntryMetaData(localHeaderStart, usesDataDescriptor(ze.getMethod(), phased)));
entry.localDataStart = localHeaderStart + LFH_CRC_OFFSET; // At crc offset
@@ -1235,6 +1298,16 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
private byte[] createCentralFileHeader(final ZipArchiveEntry ze, final ByteBuffer name,
final EntryMetaData entryMetaData,
final boolean needsZip64Extra) throws IOException {
+ if(isSplitZip) {
+ int currentSplitSegment = ((ZipSplitOutputStream)this.out).getCurrentSplitSegmentIndex();
+ if(numberOfCDInDiskData.get(currentSplitSegment) == null) {
+ numberOfCDInDiskData.put(currentSplitSegment, 1);
+ } else {
+ int originalNumberOfCD = numberOfCDInDiskData.get(currentSplitSegment);
+ numberOfCDInDiskData.put(currentSplitSegment, originalNumberOfCD + 1);
+ }
+ }
+
final byte[] extra = ze.getCentralDirectoryExtra();
// file comment length
@@ -1291,7 +1364,11 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
putShort(commentLen, buf, CFH_COMMENT_LENGTH_OFFSET);
// disk number start
- System.arraycopy(ZERO, 0, buf, CFH_DISK_NUMBER_OFFSET, SHORT);
+ if(isSplitZip) {
+ putShort(ze.getDiskNumberStart(), buf, CFH_DISK_NUMBER_OFFSET);
+ } else {
+ System.arraycopy(ZERO, 0, buf, CFH_DISK_NUMBER_OFFSET, SHORT);
+ }
// internal file attributes
putShort(ze.getInternalAttributes(), buf, CFH_INTERNAL_ATTRIBUTES_OFFSET);
@@ -1352,11 +1429,21 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
* and {@link Zip64Mode #setUseZip64} is {@link Zip64Mode#Never}.
*/
protected void writeCentralDirectoryEnd() throws IOException {
+ // Only a split archive is backed by a ZipSplitOutputStream, for any other
+ // archive the cast below would fail - so this must be limited to split archives.
+ // The end of central directory record must not be spread over two segments.
+ // NOTE(review): when Zip64 is used the space is presumably reserved together
+ // with the Zip64 EOCD locator in writeZip64CentralDirectory - confirm.
+ if (isSplitZip && !hasUsedZip64) {
+     ((ZipSplitOutputStream) this.out).prepareToWriteUnsplittableContent(eocdLength);
+ }
+
writeCounted(EOCD_SIG);
- // disk numbers
- writeCounted(ZERO);
- writeCounted(ZERO);
+ // number of this disk
+ int numberOfThisDisk = 0;
+ if(isSplitZip) {
+ numberOfThisDisk = ((ZipSplitOutputStream)this.out).getCurrentSplitSegmentIndex();
+ }
+ writeCounted(ZipShort.getBytes(numberOfThisDisk));
+
+ // disk number of the start of central directory
+ writeCounted(ZipShort.getBytes((int)cdDiskNumberStart));
// number of entries
final int numberOfEntries = entries.size();
@@ -1370,9 +1457,15 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
.ARCHIVE_TOO_BIG_MESSAGE);
}
+ // total number of entries in the central directory on this disk:
+ // numberOfCDInDiskData is only filled while creating a split archive,
+ // a regular archive keeps all of its entries on the one and only disk
+ final Integer cdEntriesOnThisDisk = numberOfCDInDiskData.get(numberOfThisDisk);
+ int numOfEntriesOnThisDisk = numberOfEntries;
+ if (isSplitZip) {
+     numOfEntriesOnThisDisk = cdEntriesOnThisDisk == null ? 0 : cdEntriesOnThisDisk;
+ }
+ // two byte field, bigger values are capped at the ZIP64 magic number
+ final byte[] numOfEntriesOnThisDiskData = ZipShort
+     .getBytes(Math.min(numOfEntriesOnThisDisk, ZIP64_MAGIC_SHORT));
+ writeCounted(numOfEntriesOnThisDiskData);
+
+ // number of entries
final byte[] num = ZipShort.getBytes(Math.min(numberOfEntries,
- ZIP64_MAGIC_SHORT));
- writeCounted(num);
+ ZIP64_MAGIC_SHORT));
writeCounted(num);
// length and location of CD
@@ -1408,11 +1501,20 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
return;
}
- final long offset = streamCompressor.getTotalBytesWritten();
+ long offset = streamCompressor.getTotalBytesWritten();
+ long diskNumberStart = 0L;
+ if(isSplitZip) {
+ // when creating a split zip, the offset should be
+ // the offset to the corresponding segment disk
+ ZipSplitOutputStream zipSplitOutputStream = (ZipSplitOutputStream)this.out;
+ offset = zipSplitOutputStream.getCurrentSplitSegmentBytesWritten();
+ diskNumberStart = zipSplitOutputStream.getCurrentSplitSegmentIndex();
+ }
+
writeOut(ZIP64_EOCD_SIG);
- // size, we don't have any variable length as we don't support
- // the extensible data sector, yet
+ // size of zip64 end of central directory, we don't have any variable length
+ // as we don't support the extensible data sector, yet
writeOut(ZipEightByteInteger
.getBytes(SHORT /* version made by */
+ SHORT /* version needed to extract */
@@ -1428,14 +1530,25 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
writeOut(ZipShort.getBytes(ZIP64_MIN_VERSION));
writeOut(ZipShort.getBytes(ZIP64_MIN_VERSION));
- // disk numbers - four bytes this time
- writeOut(LZERO);
- writeOut(LZERO);
+ // number of this disk
+ long numberOfThisDisk = 0;
+ if (isSplitZip) {
+ numberOfThisDisk = ((ZipSplitOutputStream)this.out).getCurrentSplitSegmentIndex();
+ }
+ writeOut(ZipLong.getBytes(numberOfThisDisk));
+
+ // disk number of the start of central directory
+ writeOut(ZipLong.getBytes(cdDiskNumberStart));
+
+ // total number of entries in the central directory on this disk:
+ // the map is keyed by Integer, so the long disk number has to be narrowed
+ // before the lookup - a boxed Long would never match any of the keys.
+ // The map is only filled while creating a split archive, a regular archive
+ // keeps all of its entries on the one and only disk.
+ final Integer cdEntriesOnThisDisk = numberOfCDInDiskData.get((int) numberOfThisDisk);
+ int numOfEntriesOnThisDisk = entries.size();
+ if (isSplitZip) {
+     numOfEntriesOnThisDisk = cdEntriesOnThisDisk == null ? 0 : cdEntriesOnThisDisk;
+ }
+ // eight byte field, there is no need to cap the value at the two byte ZIP64 magic number
+ final byte[] numOfEntriesOnThisDiskData = ZipEightByteInteger
+     .getBytes(numOfEntriesOnThisDisk);
+ writeOut(numOfEntriesOnThisDiskData);
// number of entries
final byte[] num = ZipEightByteInteger.getBytes(entries.size());
writeOut(num);
- writeOut(num);
// length and location of CD
writeOut(ZipEightByteInteger.getBytes(cdLength));
@@ -1443,15 +1556,34 @@ public class ZipArchiveOutputStream extends ArchiveOutputStream {
// no "zip64 extensible data sector" for now
+ if(isSplitZip) {
+ // based on the zip specification, the End Of Central Directory record and
+ // the Zip64 End Of Central Directory locator record must be on the same segment
+ long zip64EOCDLOCLength = WORD /* length of ZIP64_EOCD_LOC_SIG */
+ + WORD /* disk number of ZIP64_EOCD_SIG */
+ + DWORD /* offset of ZIP64_EOCD_SIG */
+ + WORD /* total number of disks */;
+
+ long unsplittableContentSize = zip64EOCDLOCLength + eocdLength;
+ ((ZipSplitOutputStream)this.out).prepareToWriteUnsplittableContent(unsplittableContentSize);
+ }
+
// and now the "ZIP64 end of central directory locator"
writeOut(ZIP64_EOCD_LOC_SIG);
// disk number holding the ZIP64 EOCD record
- writeOut(LZERO);
+ writeOut(ZipLong.getBytes(diskNumberStart));
// relative offset of ZIP64 EOCD record
writeOut(ZipEightByteInteger.getBytes(offset));
// total number of disks
- writeOut(ONE);
+ if(isSplitZip) {
+ // the Zip64 End Of Central Directory Locator and the End Of Central Directory must be
+ // in the same split disk, it means they must be located in the last disk
+ long totalNumberOfDisks = ((ZipSplitOutputStream)this.out).getCurrentSplitSegmentIndex() + 1;
+ writeOut(ZipLong.getBytes(totalNumberOfDisks));
+ } else {
+ writeOut(ONE);
+ }
}
/**
diff --git a/src/main/java/org/apache/commons/compress/archivers/zip/ZipSplitOutputStream.java b/src/main/java/org/apache/commons/compress/archivers/zip/ZipSplitOutputStream.java
new file mode 100644
index 0000000..80a896d
--- /dev/null
+++ b/src/main/java/org/apache/commons/compress/archivers/zip/ZipSplitOutputStream.java
@@ -0,0 +1,219 @@
+package org.apache.commons.compress.archivers.zip;
+
+import org.apache.commons.compress.compressors.FileNameUtil;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+
+/**
+ * Output stream that spreads the bytes written to it over the segments of a split zip archive.
+ *
+ * <p>Data is written to the current segment file until {@code splitSize} bytes have been
+ * written to it, then the segment is closed and a new one is started. Completed segments
+ * are named {@code .z01}, {@code .z02}, ... and when the stream is closed the last segment
+ * is renamed so that it carries the {@code .zip} suffix.</p>
+ */
+public class ZipSplitOutputStream extends OutputStream {
+    /**
+     * 8.5.1 Capacities for split archives are as follows:
+     * <p>
+     * Maximum number of segments = 4,294,967,295 - 1
+     * Maximum .ZIP segment size = 4,294,967,295 bytes (refer to section 8.5.6)
+     * Minimum segment size = 64K
+     * Maximum PKSFX segment size = 2,147,483,647 bytes
+     */
+    private static final long ZIP_SEGMENT_MIN_SIZE = 64 * 1024L;
+    private static final long ZIP_SEGMENT_MAX_SIZE = 4294967295L;
+
+    /** Stream writing to the file of the current split segment. */
+    private OutputStream outputStream;
+    /** File of the current split segment. */
+    private File zipFile;
+    /** Maximum number of bytes written to a single split segment. */
+    private final long splitSize;
+    /** Zero based index of the current split segment. */
+    private int currentSplitSegmentIndex = 0;
+    /** Number of bytes written to the current split segment so far. */
+    private long currentSplitSegmentBytesWritten = 0;
+    private boolean finished = false;
+    /** Reusable buffer for {@link #write(int)}. */
+    private final byte[] singleByte = new byte[1];
+
+    /**
+     * Create a split zip. If the zip file is smaller than the split size,
+     * then there will only be one split zip, and its suffix is .zip,
+     * otherwise the split segments should be like .z01, .z02, ... .z(N-1), .zip
+     *
+     * @param zipFile the zip file to write to
+     * @param splitSize the split size
+     * @throws IllegalArgumentException if the split size is smaller than 64K
+     * or bigger than 4,294,967,295 bytes
+     * @throws IOException if the file cannot be opened or the split signature cannot be written
+     */
+    public ZipSplitOutputStream(final File zipFile, final long splitSize) throws IllegalArgumentException, IOException {
+        if (splitSize < ZIP_SEGMENT_MIN_SIZE || splitSize > ZIP_SEGMENT_MAX_SIZE) {
+            throw new IllegalArgumentException("zip split segment size should between 64K and 4,294,967,295");
+        }
+
+        this.zipFile = zipFile;
+        this.splitSize = splitSize;
+
+        this.outputStream = new FileOutputStream(zipFile);
+        // write the zip split signature 0x08074B50 to the zip file
+        writeZipSplitSignature();
+    }
+
+    /**
+     * Some data can not be written to different split segments, for example:
+     * <p>
+     * 4.4.1.5 The end of central directory record and the Zip64 end
+     * of central directory locator record MUST reside on the same
+     * disk when splitting or spanning an archive.
+     * <p>
+     * If the current segment doesn't have enough room left for the content,
+     * a new segment is started so that the content ends up in one piece.
+     *
+     * @param unsplittableContentSize the number of bytes that must not be split
+     * @throws IllegalArgumentException if the content is bigger than a whole split segment
+     * @throws IOException if a new split segment is needed but cannot be opened
+     */
+    public void prepareToWriteUnsplittableContent(final long unsplittableContentSize) throws IllegalArgumentException, IOException {
+        if (unsplittableContentSize > this.splitSize) {
+            throw new IllegalArgumentException("The unsplittable content size is bigger than the split segment size");
+        }
+
+        final long bytesRemainingInThisSegment = this.splitSize - this.currentSplitSegmentBytesWritten;
+        if (bytesRemainingInThisSegment < unsplittableContentSize) {
+            openNewSplitSegment();
+        }
+    }
+
+    /**
+     * Writes a single byte, the eight low-order bits of the argument.
+     *
+     * @param i the byte to write, the 24 high-order bits are ignored
+     * @throws IOException on error
+     */
+    @Override
+    public void write(final int i) throws IOException {
+        // OutputStream's contract is one byte per call, not the four bytes of the int
+        singleByte[0] = (byte) (i & 0xff);
+        write(singleByte);
+    }
+
+    @Override
+    public void write(final byte[] b) throws IOException {
+        write(b, 0, b.length);
+    }
+
+    /**
+     * Write the data to zip split segments, if the remaining space of current split segment
+     * is not enough, then a new split segment should be created
+     *
+     * @param b data to write
+     * @param off offset of the start of data in param b
+     * @param len the length of data to write
+     * @throws IOException on error
+     */
+    @Override
+    public void write(final byte[] b, final int off, final int len) throws IOException {
+        int position = off;
+        int remaining = len;
+        while (remaining > 0) {
+            if (currentSplitSegmentBytesWritten >= splitSize) {
+                openNewSplitSegment();
+            }
+
+            // calculate in long, the split size may exceed the range of int
+            final long bytesLeftInThisSegment = splitSize - currentSplitSegmentBytesWritten;
+            final int bytesToWrite = (int) Math.min(remaining, bytesLeftInThisSegment);
+            outputStream.write(b, position, bytesToWrite);
+            currentSplitSegmentBytesWritten += bytesToWrite;
+            position += bytesToWrite;
+            remaining -= bytesToWrite;
+        }
+    }
+
+    /**
+     * Closes the current split segment and gives it its final {@code .zip} name.
+     * Closing an already closed stream has no effect.
+     *
+     * @throws IOException on error
+     */
+    @Override
+    public void close() throws IOException {
+        if (!finished) {
+            finish();
+        }
+    }
+
+    /**
+     * The last zip split segment's suffix should be .zip
+     *
+     * @throws IOException if the archive has already been finished, or if the
+     * last segment cannot be closed or renamed
+     */
+    private void finish() throws IOException {
+        if (finished) {
+            throw new IOException("This archive has already been finished");
+        }
+
+        final String zipFileBaseName = FileNameUtil.getBaseName(zipFile.getName());
+        final File lastZipSplitSegmentFile = new File(zipFile.getParentFile(), zipFileBaseName + ".zip");
+        outputStream.close();
+        // an archive that consists of a single segment may already carry its final name
+        if (!zipFile.equals(lastZipSplitSegmentFile) && !zipFile.renameTo(lastZipSplitSegmentFile)) {
+            throw new IOException("Failed to rename " + zipFile + " to " + lastZipSplitSegmentFile);
+        }
+        finished = true;
+    }
+
+    /**
+     * Create a new zip split segment and prepare to write to the new segment
+     *
+     * @return the stream writing to the new split segment
+     * @throws IOException if the current segment cannot be closed or renamed,
+     * or if the new segment already exists or cannot be opened
+     */
+    private OutputStream openNewSplitSegment() throws IOException {
+        // the current segment is complete, no matter which index it has
+        outputStream.close();
+        if (currentSplitSegmentIndex == 0) {
+            // so far everything has been written to the file given by the caller, now that
+            // the archive turns out to need several segments that file becomes segment .z01
+            final File firstSegmentFile = createNewSplitSegmentFile(1);
+            if (!zipFile.renameTo(firstSegmentFile)) {
+                throw new IOException("Failed to rename " + zipFile + " to " + firstSegmentFile);
+            }
+        }
+
+        final File newFile = createNewSplitSegmentFile(null);
+        final OutputStream newFileOutputStream = new FileOutputStream(newFile);
+        outputStream = newFileOutputStream;
+        currentSplitSegmentBytesWritten = 0;
+        zipFile = newFile;
+        currentSplitSegmentIndex++;
+
+        return newFileOutputStream;
+    }
+
+    /**
+     * Write the zip split signature (0x08074B50) to the head of the first zip split segment
+     *
+     * @throws IOException on error
+     */
+    private void writeZipSplitSignature() throws IOException {
+        outputStream.write(ZipArchiveOutputStream.DD_SIG);
+        currentSplitSegmentBytesWritten += ZipArchiveOutputStream.DD_SIG.length;
+    }
+
+    /**
+     * Create the new zip split segment, the last zip segment should be .zip, and the zip split segments' suffix should be
+     * like .z01, .z02, .z03, ... .z99, .z100, ..., .z(N-1), .zip
+     * <p>
+     * 8.3.3 Split ZIP files are typically written to the same location
+     * and are subject to name collisions if the spanned name
+     * format is used since each segment will reside on the same
+     * drive. To avoid name collisions, split archives are named
+     * as follows.
+     * <p>
+     * Segment 1 = filename.z01
+     * Segment n-1 = filename.z(n-1)
+     * Segment n = filename.zip
+     * <p>
+     * NOTE:
+     * The zip split segment begin from 1,2,3,... , and we're creating a new segment,
+     * so the new segment suffix should be (currentSplitSegmentIndex + 2)
+     *
+     * @param zipSplitSegmentSuffixIndex the number to use as suffix of the segment file,
+     * {@code null} means the number following the one of the current segment
+     * @return the file of the new split segment, it is not created by this method
+     * @throws IOException if a file with the name of the new split segment already exists
+     */
+    private File createNewSplitSegmentFile(final Integer zipSplitSegmentSuffixIndex) throws IOException {
+        final int newZipSplitSegmentSuffixIndex = zipSplitSegmentSuffixIndex == null
+            ? (currentSplitSegmentIndex + 2) : zipSplitSegmentSuffixIndex;
+        final String baseName = FileNameUtil.getBaseName(zipFile.getName());
+        String extension = ".z";
+        if (newZipSplitSegmentSuffixIndex <= 9) {
+            extension += "0" + newZipSplitSegmentSuffixIndex;
+        } else {
+            extension += newZipSplitSegmentSuffixIndex;
+        }
+
+        // resolve against the parent as File: getParent() returns null for a file given
+        // without a directory and concatenating it would yield a path starting with "null"
+        final File newFile = new File(zipFile.getParentFile(), baseName + extension);
+
+        if (newFile.exists()) {
+            throw new IOException("split zip segment " + newFile + " already exists");
+        }
+        return newFile;
+    }
+
+    /**
+     * Returns the zero based index of the split segment currently written to.
+     *
+     * @return the index of the current split segment
+     */
+    public int getCurrentSplitSegmentIndex() {
+        return currentSplitSegmentIndex;
+    }
+
+    /**
+     * Returns the number of bytes written to the current split segment so far.
+     *
+     * @return the number of bytes written to the current split segment
+     */
+    public long getCurrentSplitSegmentBytesWritten() {
+        return currentSplitSegmentBytesWritten;
+    }
+}
diff --git a/src/main/java/org/apache/commons/compress/compressors/FileNameUtil.java b/src/main/java/org/apache/commons/compress/compressors/FileNameUtil.java
index cc69031..570b12e 100644
--- a/src/main/java/org/apache/commons/compress/compressors/FileNameUtil.java
+++ b/src/main/java/org/apache/commons/compress/compressors/FileNameUtil.java
@@ -193,4 +193,19 @@ public class FileNameUtil {
return fileName + defaultExtension;
}
+    /**
+     * Strips the directories and the extension from a file name.
+     *
+     * <p>Everything up to and including the last {@code '/'} or {@code '\'} is
+     * removed, as is everything starting with the last {@code '.'} of what
+     * remains. A name without any dot is returned without its directories only.</p>
+     *
+     * @param filename the file name, may contain directories
+     * @return the name without directories and extension,
+     * {@code null} if the file name is {@code null}
+     */
+    public static String getBaseName(String filename) {
+        if (filename == null) {
+            return null;
+        }
+
+        // the plain name starts right behind the last separator, whatever its style
+        final int nameStart = 1 + Math.max(filename.lastIndexOf('/'), filename.lastIndexOf('\\'));
+        final int lastDot = filename.lastIndexOf('.');
+        // a dot in front of the plain name is part of a directory, not an extension
+        final int nameEnd = lastDot >= nameStart ? lastDot : filename.length();
+        return filename.substring(nameStart, nameEnd);
+    }
}