You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2023/03/19 17:23:48 UTC
svn commit: r1908522 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: cos/COSObjectKey.java pdfparser/BaseParser.java
Author: lehmi
Date: Sun Mar 19 17:23:47 2023
New Revision: 1908522
URL: http://svn.apache.org/viewvc?rev=1908522&view=rev
Log:
PDFBOX-5178: introduce object key cache to avoid performance regression for big pdfs
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java?rev=1908522&r1=1908521&r2=1908522&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSObjectKey.java Sun Mar 19 17:23:47 2023
@@ -70,11 +70,23 @@ public final class COSObjectKey implemen
{
throw new IllegalArgumentException("Generation number must not be a negative value");
}
- numberAndGeneration = num << NUMBER_OFFSET | (gen & GENERATION_MASK);
+ numberAndGeneration = computeInternalHash(num, gen);
this.streamIndex = index;
}
/**
+ * Calculate the internal hash value for the given object number and generation number.
+ *
+ * @param num the object number
+ * @param gen the generation number
+ * @return the internal hash for the given values
+ */
+ public static final long computeInternalHash(long num, int gen)
+ {
+ return num << NUMBER_OFFSET | (gen & GENERATION_MASK);
+ }
+
+ /**
* {@inheritDoc}
*/
@Override
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1908522&r1=1908521&r2=1908522&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Mar 19 17:23:47 2023
@@ -23,7 +23,8 @@ import java.nio.charset.CharacterCodingE
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
-import java.util.Optional;
+import java.util.HashMap;
+import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -42,8 +43,7 @@ import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.RandomAccessRead;
/**
- * This class is used to contain parsing logic that will be used by both the
- * PDFParser and the COSStreamParser.
+ * This class is used to contain parsing logic that will be used by all parsers.
*
* @author Ben Litchfield
*/
@@ -57,6 +57,8 @@ public abstract class BaseParser
private final CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder();
+ private final Map<Integer, COSObjectKey> keyCache = new HashMap<>();
+
/**
* Log instance.
*/
@@ -153,14 +155,23 @@ public abstract class BaseParser
*/
protected COSObjectKey getObjectKey(long num, int gen)
{
- if (document == null || document.getXrefTable() == null)
+ if (document == null || document.getXrefTable().isEmpty())
{
return new COSObjectKey(num, gen);
}
- Optional<COSObjectKey> foundKey = document.getXrefTable().keySet().stream()
- .filter(k -> k.getNumber() == num && k.getGeneration() == gen) //
- .findAny();
- return foundKey.isPresent() ? foundKey.get() : new COSObjectKey(num, gen);
+ // use a cache to get the COSObjectKey as iterating over the xref-table-map gets slow for big pdfs
+ // in the long run we have to overhaul the object pool or even better remove it
+ Map<COSObjectKey, Long> xrefTable = document.getXrefTable();
+ if (xrefTable.size() > keyCache.size())
+ {
+ for (COSObjectKey key : xrefTable.keySet())
+ {
+ keyCache.computeIfAbsent(key.hashCode(), k -> key);
+ }
+ }
+ int hashCode = Long.hashCode(COSObjectKey.computeInternalHash(num, gen));
+ COSObjectKey foundKey = keyCache.get(hashCode);
+ return foundKey != null ? foundKey : new COSObjectKey(num, gen);
}
/**
@@ -668,8 +679,10 @@ public abstract class BaseParser
pbo = parseDirObject();
if( pbo instanceof COSObject )
{
+ // the current empty COSObject is replaced with the correct one
+ pbo = null;
// We have to check if the expected values are there or not PDFBOX-385
- if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
+ if (po.size() > 1 && po.get(po.size() - 1) instanceof COSInteger)
{
COSInteger genNumber = (COSInteger)po.remove( po.size() -1 );
if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger)
@@ -685,25 +698,12 @@ public abstract class BaseParser
{
LOG.warn("Invalid value(s) for an object key " + number.longValue()
+ " " + genNumber.intValue());
- pbo = null;
}
}
- else
- {
- // the object reference is somehow wrong
- pbo = null;
- }
- }
- else
- {
- pbo = null;
}
}
- if( pbo != null )
- {
- po.add( pbo );
- }
- else
+ // something went wrong
+ if (pbo == null)
{
//it could be a bad object in the array which is just skipped
LOG.warn("Corrupt array element at offset " + source.getPosition()
@@ -723,6 +723,7 @@ public abstract class BaseParser
return po;
}
}
+ po.add(pbo);
skipSpaces();
}
// read ']'