You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2023/01/06 13:13:17 UTC
svn commit: r1906422 - in /pdfbox/trunk/pdfbox/src: main/java/org/apache/pdfbox/pdfparser/COSParser.java main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java
Author: lehmi
Date: Fri Jan 6 13:13:17 2023
New Revision: 1906422
URL: http://svn.apache.org/viewvc?rev=1906422&view=rev
Log:
PDFBOX-5178: use index value to choose correct object if the object numbers within an object stream are not unique
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1906422&r1=1906421&r2=1906422&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Fri Jan 6 13:13:17 2023
@@ -129,7 +129,7 @@ public class COSParser extends BaseParse
* Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after
* dereferencing them.
*/
- private final Map<Long, Map<Long, COSBase>> decompressedObjects = new HashMap<>();
+ private final Map<Long, Map<COSObjectKey, COSBase>> decompressedObjects = new HashMap<>();
/**
* The security handler.
@@ -765,11 +765,10 @@ public class COSParser extends BaseParse
*/
protected COSBase parseObjectStreamObject(long objstmObjNr, COSObjectKey key) throws IOException
{
- Map<Long, COSBase> streamObjects = decompressedObjects.computeIfAbsent(objstmObjNr,
+ Map<COSObjectKey, COSBase> streamObjects = decompressedObjects.computeIfAbsent(objstmObjNr,
n -> new HashMap<>());
// did we already read the compressed object stream?
- long keyNumber = key.getNumber();
- COSBase objectStreamObject = streamObjects.remove(keyNumber);
+ COSBase objectStreamObject = streamObjects.remove(key);
if (objectStreamObject != null)
{
return objectStreamObject;
@@ -782,18 +781,10 @@ public class COSParser extends BaseParse
{
PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj,
document);
- for (Entry<Long, COSBase> entry : parser.parseAllObjects().entrySet())
- {
- Long stmObjNumber = entry.getKey();
- if (keyNumber == stmObjNumber)
- {
- objectStreamObject = entry.getValue();
- }
- else
- {
- streamObjects.putIfAbsent(stmObjNumber, entry.getValue());
- }
- }
+ Map<COSObjectKey, COSBase> allStreamObjects = parser.parseAllObjects();
+ objectStreamObject = allStreamObjects.remove(key);
+ allStreamObjects.entrySet().stream()
+ .forEach(e -> streamObjects.putIfAbsent(e.getKey(), e.getValue()));
}
catch (IOException ex)
{
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1906422&r1=1906421&r2=1906422&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Fri Jan 6 13:13:17 2023
@@ -25,6 +25,7 @@ import java.util.TreeMap;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObjectKey;
import org.apache.pdfbox.cos.COSStream;
/**
@@ -116,19 +117,36 @@ public class PDFObjectStreamParser exten
* @return a map containing all parsed objects using the object number as key
* @throws IOException if there is an error while parsing the stream
*/
- public Map<Long, COSBase> parseAllObjects() throws IOException
+ public Map<COSObjectKey, COSBase> parseAllObjects() throws IOException
{
- Map<Long, COSBase> allObjects = new HashMap<>();
+ Map<COSObjectKey, COSBase> allObjects = new HashMap<>();
try
{
Map<Integer, Long> objectNumbers = privateReadObjectOffsets();
+ // count the number of object numbers eliminating double entries
+ long numberOfObjNumbers = objectNumbers.values().stream().distinct().count();
+ // the usage of the index should be restricted to cases where more than one
+ // object uses the same object number.
+ // there are malformed pdfs in the wild which would lead to false results if
+ // pdfbox always relies on the index if available. In most cases the object number
+ // is sufficient to choose the correct object
+ boolean indexNeeded = objectNumbers.size() > numberOfObjNumbers;
long currentPosition = source.getPosition();
if (firstObject > 0 && currentPosition < firstObject)
{
source.skip(firstObject - (int) currentPosition);
}
+ int index = 0;
for (Entry<Integer, Long> entry : objectNumbers.entrySet())
{
+ COSObjectKey objectKey = getObjectKey(entry.getValue(), 0);
+ // skip object if the index doesn't match
+ if (indexNeeded && objectKey.getStreamIndex() > -1
+ && objectKey.getStreamIndex() != index)
+ {
+ index++;
+ continue;
+ }
int finalPosition = firstObject + entry.getKey();
currentPosition = source.getPosition();
if (finalPosition > 0 && currentPosition < finalPosition)
@@ -141,7 +159,8 @@ public class PDFObjectStreamParser exten
{
streamObject.setDirect(false);
}
- allObjects.put(entry.getValue(), streamObject);
+ allObjects.put(objectKey, streamObject);
+ index++;
}
}
finally
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java?rev=1906422&r1=1906421&r2=1906422&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java Fri Jan 6 13:13:17 2023
@@ -24,8 +24,10 @@ import java.util.Map;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
+import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObjectKey;
import org.apache.pdfbox.cos.COSStream;
import org.junit.jupiter.api.Test;
@@ -64,10 +66,89 @@ class PDFObjectStreamParserTest
outputStream.write("6 0 4 5 true false".getBytes());
outputStream.close();
PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, null);
- Map<Long, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
+ Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
assertEquals(2, objectNumbers.size());
- assertEquals(COSBoolean.TRUE, objectNumbers.get(6L));
- assertEquals(COSBoolean.FALSE, objectNumbers.get(4L));
+ assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0)));
+ assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0)));
+ }
+
+ @Test
+ void testParseAllObjectsIndexed() throws IOException
+ {
+ COSStream stream = new COSStream();
+ stream.setItem(COSName.N, COSInteger.THREE);
+ stream.setItem(COSName.FIRST, COSInteger.get(13));
+ OutputStream outputStream = stream.createOutputStream();
+ // use object number 4 for two objects
+ outputStream.write("6 0 4 5 4 11 true false true".getBytes());
+ outputStream.close();
+ COSDocument cosDoc = new COSDocument();
+ Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable();
+ // select the second object from the stream for object number 4 by using 2 as value for the index
+ xrefTable.put(new COSObjectKey(6, 0, 0), -1L);
+ xrefTable.put(new COSObjectKey(4, 0, 2), -1L);
+ PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, cosDoc);
+ Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
+ assertEquals(2, objectNumbers.size());
+ assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0)));
+ assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(4, 0)));
+
+ // select the first object from the stream for object number 4 by using 1 as value for the index
+ // remove the old entry first to be sure it is replaced
+ xrefTable.remove(new COSObjectKey(4, 0));
+ xrefTable.put(new COSObjectKey(4, 0, 1), -1L);
+ objectStreamParser = new PDFObjectStreamParser(stream, cosDoc);
+ objectNumbers = objectStreamParser.parseAllObjects();
+ assertEquals(2, objectNumbers.size());
+ assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0)));
+ assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0)));
+ }
+
+ @Test
+ void testParseAllObjectsSkipMalformedIndex() throws IOException
+ {
+ COSStream stream = new COSStream();
+ stream.setItem(COSName.N, COSInteger.THREE);
+ stream.setItem(COSName.FIRST, COSInteger.get(13));
+ OutputStream outputStream = stream.createOutputStream();
+ outputStream.write("6 0 4 5 5 11 true false true".getBytes());
+ outputStream.close();
+ COSDocument cosDoc = new COSDocument();
+ Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable();
+ // add an index for each object key which doesn't match with the index of the object stream
+ xrefTable.put(new COSObjectKey(6, 0, 10), -1L);
+ xrefTable.put(new COSObjectKey(4, 0, 11), -1L);
+ xrefTable.put(new COSObjectKey(5, 0, 12), -1L);
+ PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, cosDoc);
+ // the index isn't taken into account as all object numbers of the stream are unique
+ // none of the objects is skipped so that all objects are read and available
+ Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
+ assertEquals(3, objectNumbers.size());
+ assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0)));
+ assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0)));
+ assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(5, 0)));
+ }
+
+ @Test
+ void testParseAllObjectsUseMalformedIndex() throws IOException
+ {
+ COSStream stream = new COSStream();
+ stream.setItem(COSName.N, COSInteger.THREE);
+ stream.setItem(COSName.FIRST, COSInteger.get(13));
+ OutputStream outputStream = stream.createOutputStream();
+ outputStream.write("6 0 4 5 4 11 true false true".getBytes());
+ outputStream.close();
+ COSDocument cosDoc = new COSDocument();
+ Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable();
+ // add an index for each object key which doesn't match with the index of the object stream
+ // add two object keys only as the object stream uses one object number for two objects
+ xrefTable.put(new COSObjectKey(6, 0, 10), -1L);
+ xrefTable.put(new COSObjectKey(4, 0, 11), -1L);
+ PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, cosDoc);
+ // as the used object numbers aren't unique within the object stream, the index of the object keys is used
+ // All objects are dropped as the malformed index values don't match the index of the object within the stream
+ Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
+ assertEquals(0, objectNumbers.size());
}
}