You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2023/03/19 17:23:48 UTC

svn commit: r1908522 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: cos/COSObjectKey.java pdfparser/BaseParser.java

Author: lehmi
Date: Sun Mar 19 17:23:47 2023
New Revision: 1908522

URL: http://svn.apache.org/viewvc?rev=1908522&view=rev
Log:
PDFBOX-5178: introduce object key cache to avoid performance regression for big pdfs

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java?rev=1908522&r1=1908521&r2=1908522&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java Sun Mar 19 17:23:47 2023
@@ -70,11 +70,23 @@ public final class COSObjectKey implemen
         {
             throw new IllegalArgumentException("Generation number must not be a negative value");
         }
-        numberAndGeneration = num << NUMBER_OFFSET | (gen & GENERATION_MASK);
+        numberAndGeneration = computeInternalHash(num, gen);
         this.streamIndex = index;
     }
 
     /**
+     * Calculate the internal hash value for the given object number and generation number.
+     * 
+     * @param num the object number
+     * @param gen the generation number
+     * @return the internal hash for the given values
+     */
+    public static final long computeInternalHash(long num, int gen)
+    {
+        return num << NUMBER_OFFSET | (gen & GENERATION_MASK);
+    }
+
+    /**
      * {@inheritDoc}
      */
     @Override

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1908522&r1=1908521&r2=1908522&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Mar 19 17:23:47 2023
@@ -23,7 +23,8 @@ import java.nio.charset.CharacterCodingE
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.StandardCharsets;
-import java.util.Optional;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -42,8 +43,7 @@ import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.io.RandomAccessRead;
 
 /**
- * This class is used to contain parsing logic that will be used by both the
- * PDFParser and the COSStreamParser.
+ * This class is used to contain parsing logic that will be used by all parsers.
  *
  * @author Ben Litchfield
  */
@@ -57,6 +57,8 @@ public abstract class BaseParser
 
     private final CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder();
 
+    private final Map<Integer, COSObjectKey> keyCache = new HashMap<>();
+
     /**
      * Log instance.
      */
@@ -153,14 +155,23 @@ public abstract class BaseParser
      */
     protected COSObjectKey getObjectKey(long num, int gen)
     {
-        if (document == null || document.getXrefTable() == null)
+        if (document == null || document.getXrefTable().isEmpty())
         {
             return new COSObjectKey(num, gen);
         }
-        Optional<COSObjectKey> foundKey = document.getXrefTable().keySet().stream()
-                .filter(k -> k.getNumber() == num && k.getGeneration() == gen) //
-                .findAny();
-        return foundKey.isPresent() ? foundKey.get() : new COSObjectKey(num, gen);
+        // use a cache to get the COSObjectKey as iterating over the xref-table-map gets slow for big pdfs
+        // in the long run we have to overhaul the object pool or even better remove it
+        Map<COSObjectKey, Long> xrefTable = document.getXrefTable();
+        if (xrefTable.size() > keyCache.size())
+        {
+            for (COSObjectKey key : xrefTable.keySet())
+            {
+                keyCache.computeIfAbsent(key.hashCode(), k -> key);
+            }
+        }
+        int hashCode = Long.hashCode(COSObjectKey.computeInternalHash(num, gen));
+        COSObjectKey foundKey = keyCache.get(hashCode);
+        return foundKey != null ? foundKey : new COSObjectKey(num, gen);
     }
 
     /**
@@ -668,8 +679,10 @@ public abstract class BaseParser
             pbo = parseDirObject();
             if( pbo instanceof COSObject )
             {
+                // the current empty COSObject is replaced with the correct one
+                pbo = null;
                 // We have to check if the expected values are there or not PDFBOX-385
-                if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
+                if (po.size() > 1 && po.get(po.size() - 1) instanceof COSInteger)
                 {
                     COSInteger genNumber = (COSInteger)po.remove( po.size() -1 );
                     if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
@@ -685,25 +698,12 @@ public abstract class BaseParser
                         {
                             LOG.warn("Invalid value(s) for an object key " + number.longValue()
                                     + " " + genNumber.intValue());
-                            pbo = null;
                         }
                     }
-                    else
-                    {
-                        // the object reference is somehow wrong
-                        pbo = null;
-                    }
-                }
-                else
-                {
-                    pbo = null;
                 }
             }
-            if( pbo != null )
-            {
-                po.add( pbo );
-            }
-            else
+            // something went wrong
+            if (pbo == null)
             {
                 //it could be a bad object in the array which is just skipped
                 LOG.warn("Corrupt array element at offset " + source.getPosition()
@@ -723,6 +723,7 @@ public abstract class BaseParser
                     return po;
                 }
             }
+            po.add(pbo);
             skipSpaces();
         }
         // read ']'