You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2021/11/13 12:11:37 UTC

svn commit: r1894994 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: COSParser.java PDFObjectStreamParser.java

Author: lehmi
Date: Sat Nov 13 12:11:37 2021
New Revision: 1894994

URL: http://svn.apache.org/viewvc?rev=1894994&view=rev
Log:
PDFBOX-5286: read and cache all objects of a compressed object stream instead of parsing it again and again for every single object of the stream

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1894994&r1=1894993&r2=1894994&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sat Nov 13 12:11:37 2021
@@ -142,6 +142,12 @@ public class COSParser extends BaseParse
     private PDEncryption encryption = null;
 
     /**
+     * Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after
+     * dereferencing them.
+     */
+    private final Map<Integer, Map<Long, COSBase>> decompressedObjects = new HashMap<>();
+
+    /**
      * The security handler.
      */
     protected SecurityHandler<? extends ProtectionPolicy> securityHandler = null;
@@ -761,20 +767,33 @@ public class COSParser extends BaseParse
      */
     protected COSBase parseObjectStreamObject(int objstmObjNr, COSObjectKey key) throws IOException
     {
+        Map<Long, COSBase> streamObjects = decompressedObjects.computeIfAbsent(objstmObjNr,
+                n -> new HashMap<>());
+        // did we already read the compressed object stream?
+        COSBase objectStreamObject = streamObjects.remove(key.getNumber());
+        if (objectStreamObject != null)
+        {
+            return objectStreamObject;
+        }
         final COSObjectKey objKey = new COSObjectKey(objstmObjNr, 0);
         final COSBase objstmBaseObj = document.getObjectFromPool(objKey).getObject();
-        COSBase objectStreamObject = null;
         if (objstmBaseObj instanceof COSStream)
         {
-            // parse object stream
-            PDFObjectStreamParser parser = null;
             try
             {
-                parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document);
-                objectStreamObject = parser.parseObject(key.getNumber());
-                if (objectStreamObject != null)
+                PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj,
+                        document);
+                for (Entry<Long, COSBase> entry : parser.parseAllObjects().entrySet())
                 {
-                    objectStreamObject.setKey(key);
+                    Long stmObjNumber = entry.getKey();
+                    if (key.getNumber() == stmObjNumber)
+                    {
+                        objectStreamObject = entry.getValue();
+                    }
+                    else
+                    {
+                        streamObjects.putIfAbsent(stmObjNumber, entry.getValue());
+                    }
                 }
             }
             catch (IOException ex)
@@ -792,7 +811,7 @@ public class COSParser extends BaseParse
         }
         return objectStreamObject;
     }
-    
+
     /** 
      * Returns length value referred to or defined in given object. 
      */

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1894994&r1=1894993&r2=1894994&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Sat Nov 13 12:11:37 2021
@@ -19,6 +19,8 @@ package org.apache.pdfbox.pdfparser;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
 
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDocument;
@@ -83,7 +85,7 @@ public class PDFObjectStreamParser exten
         try
         {
             Integer objectOffset = privateReadObjectNumbers().get(objectNumber);
-            if (objectOffset != null) 
+            if (objectOffset != null)
             {
                 // jump to the offset of the first object
                 long currentPosition = source.getPosition();
@@ -108,6 +110,48 @@ public class PDFObjectStreamParser exten
         return streamObject;
     }
 
+    /**
+     * Parse all compressed objects in ascending offset order. The source is closed and the document reference is
+     * cleared when this method returns, even if parsing fails.
+     * @return a map of all parsed objects keyed by object number; a value is null if parseDirObject() returned null
+     * @throws IOException if there is an error while parsing the stream
+     */
+    public Map<Long, COSBase> parseAllObjects() throws IOException
+    {
+        Map<Long, COSBase> allObjects = new HashMap<>();
+        try
+        {
+            Map<Integer, Long> objectNumbers = privateReadObjectOffets();
+            long currentPosition = source.getPosition();
+            if (firstObject > 0 && currentPosition < firstObject)
+            {
+                source.skip(firstObject - (int) currentPosition);
+            }
+            for (Entry<Integer, Long> entry : objectNumbers.entrySet())
+            {
+                int finalPosition = firstObject + entry.getKey();
+                currentPosition = source.getPosition();
+                if (finalPosition > 0 && currentPosition < finalPosition)
+                {
+                    // entries map offset -> object number, sorted by offset: only ever skip forward to the next object
+                    source.skip(finalPosition - (int) currentPosition);
+                }
+                COSBase streamObject = parseDirObject();
+                if (streamObject != null)
+                {
+                    streamObject.setDirect(false);
+                }
+                allObjects.put(entry.getValue(), streamObject);
+            }
+        }
+        finally
+        {
+            source.close();
+            document = null;
+        }
+        return allObjects;
+    }
+
     private Map<Long, Integer> privateReadObjectNumbers() throws IOException
     {
         // don't initialize map using numberOfObjects as there might by less object numbers than expected
@@ -127,6 +171,27 @@ public class PDFObjectStreamParser exten
         return objectNumbers;
     }
 
+    private Map<Integer, Long> privateReadObjectOffets() throws IOException
+    {
+        // reads the (object number, offset) pairs; the returned map goes from offset to object number.
+        // the pdf spec requires ascending offsets, but we can't rely on that, so a TreeMap sorts
+        // them, as the sequential parser relies on that order, see PDFBOX-4927
+        Map<Integer, Long> objectOffsets = new TreeMap<>();
+        long firstObjectPosition = source.getPosition() + firstObject - 1;
+        for (int i = 0; i < numberOfObjects; i++)
+        {
+            // stop at the end of the part reserved for the object numbers, even if fewer pairs than expected were read
+            if (source.getPosition() >= firstObjectPosition)
+            {
+                break;
+            }
+            long objectNumber = readObjectNumber();
+            int offset = (int) readLong();
+            objectOffsets.put(offset, objectNumber);
+        }
+        return objectOffsets;
+    }
+
     /**
      * Read all object numbers from the compressed object stream. The stream is closed after reading the object numbers.
      *