You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2021/11/13 12:11:37 UTC
svn commit: r1894994 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: COSParser.java PDFObjectStreamParser.java
Author: lehmi
Date: Sat Nov 13 12:11:37 2021
New Revision: 1894994
URL: http://svn.apache.org/viewvc?rev=1894994&view=rev
Log:
PDFBOX-5286: read and cache all objects of a compressed object stream instead of parsing it again and again for every single object of the stream
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1894994&r1=1894993&r2=1894994&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sat Nov 13 12:11:37 2021
@@ -142,6 +142,12 @@ public class COSParser extends BaseParse
private PDEncryption encryption = null;
/**
+ * Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after
+ * dereferencing them.
+ */
+ private final Map<Integer, Map<Long, COSBase>> decompressedObjects = new HashMap<>();
+
+ /**
* The security handler.
*/
protected SecurityHandler<? extends ProtectionPolicy> securityHandler = null;
@@ -761,20 +767,33 @@ public class COSParser extends BaseParse
*/
protected COSBase parseObjectStreamObject(int objstmObjNr, COSObjectKey key) throws IOException
{
+ Map<Long, COSBase> streamObjects = decompressedObjects.computeIfAbsent(objstmObjNr,
+ n -> new HashMap<>());
+ // did we already read the compressed object stream?
+ COSBase objectStreamObject = streamObjects.remove(key.getNumber());
+ if (objectStreamObject != null)
+ {
+ return objectStreamObject;
+ }
final COSObjectKey objKey = new COSObjectKey(objstmObjNr, 0);
final COSBase objstmBaseObj = document.getObjectFromPool(objKey).getObject();
- COSBase objectStreamObject = null;
if (objstmBaseObj instanceof COSStream)
{
- // parse object stream
- PDFObjectStreamParser parser = null;
try
{
- parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document);
- objectStreamObject = parser.parseObject(key.getNumber());
- if (objectStreamObject != null)
+ PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj,
+ document);
+ for (Entry<Long, COSBase> entry : parser.parseAllObjects().entrySet())
{
- objectStreamObject.setKey(key);
+ Long stmObjNumber = entry.getKey();
+ if (key.getNumber() == stmObjNumber)
+ {
+ objectStreamObject = entry.getValue();
+ }
+ else
+ {
+ streamObjects.putIfAbsent(stmObjNumber, entry.getValue());
+ }
}
}
catch (IOException ex)
@@ -792,7 +811,7 @@ public class COSParser extends BaseParse
}
return objectStreamObject;
}
-
+
/**
* Returns length value referred to or defined in given object.
*/
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1894994&r1=1894993&r2=1894994&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Sat Nov 13 12:11:37 2021
@@ -19,6 +19,8 @@ package org.apache.pdfbox.pdfparser;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDocument;
@@ -83,7 +85,7 @@ public class PDFObjectStreamParser exten
try
{
Integer objectOffset = privateReadObjectNumbers().get(objectNumber);
- if (objectOffset != null)
+ if (objectOffset != null)
{
// jump to the offset of the first object
long currentPosition = source.getPosition();
@@ -108,6 +110,48 @@ public class PDFObjectStreamParser exten
return streamObject;
}
+ /**
+ * Parse all compressed objects. The stream is closed after parsing.
+ *
+ * @return a map containing all parsed objects using the object number as key
+ * @throws IOException if there is an error while parsing the stream
+ */
+ public Map<Long, COSBase> parseAllObjects() throws IOException
+ {
+ Map<Long, COSBase> allObjects = new HashMap<>();
+ try
+ {
+ Map<Integer, Long> objectNumbers = privateReadObjectOffets();
+ long currentPosition = source.getPosition();
+ if (firstObject > 0 && currentPosition < firstObject)
+ {
+ source.skip(firstObject - (int) currentPosition);
+ }
+ for (Entry<Integer, Long> entry : objectNumbers.entrySet())
+ {
+ int finalPosition = firstObject + entry.getKey();
+ currentPosition = source.getPosition();
+ if (finalPosition > 0 && currentPosition < finalPosition)
+ {
+ // jump to the offset of the object to be parsed
+ source.skip(finalPosition - (int) currentPosition);
+ }
+ COSBase streamObject = parseDirObject();
+ if (streamObject != null)
+ {
+ streamObject.setDirect(false);
+ }
+ allObjects.put(entry.getValue(), streamObject);
+ }
+ }
+ finally
+ {
+ source.close();
+ document = null;
+ }
+ return allObjects;
+ }
+
private Map<Long, Integer> privateReadObjectNumbers() throws IOException
{
// don't initialize map using numberOfObjects as there might be fewer object numbers than expected
@@ -127,6 +171,27 @@ public class PDFObjectStreamParser exten
return objectNumbers;
}
+ private Map<Integer, Long> privateReadObjectOffets() throws IOException
+ {
+ // according to the PDF spec the offsets shall be sorted in ascending order,
+ // but we can't rely on that, so we have to sort the offsets ourselves
+ // as the sequential parser relies on it, see PDFBOX-4927
+ Map<Integer, Long> objectOffsets = new TreeMap<>();
+ long firstObjectPosition = source.getPosition() + firstObject - 1;
+ for (int i = 0; i < numberOfObjects; i++)
+ {
+ // don't read beyond the part of the stream reserved for the object numbers
+ if (source.getPosition() >= firstObjectPosition)
+ {
+ break;
+ }
+ long objectNumber = readObjectNumber();
+ int offset = (int) readLong();
+ objectOffsets.put(offset, objectNumber);
+ }
+ return objectOffsets;
+ }
+
/**
* Read all object numbers from the compressed object stream. The stream is closed after reading the object numbers.
*