You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2023/01/06 13:13:17 UTC

svn commit: r1906422 - in /pdfbox/trunk/pdfbox/src: main/java/org/apache/pdfbox/pdfparser/COSParser.java main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java

Author: lehmi
Date: Fri Jan  6 13:13:17 2023
New Revision: 1906422

URL: http://svn.apache.org/viewvc?rev=1906422&view=rev
Log:
PDFBOX-5178: use index value to choose correct object if the object numbers within an object stream are not unique

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1906422&r1=1906421&r2=1906422&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Fri Jan  6 13:13:17 2023
@@ -129,7 +129,7 @@ public class COSParser extends BaseParse
      * Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after
      * dereferencing them.
      */
-    private final Map<Long, Map<Long, COSBase>> decompressedObjects = new HashMap<>();
+    private final Map<Long, Map<COSObjectKey, COSBase>> decompressedObjects = new HashMap<>();
 
     /**
      * The security handler.
@@ -765,11 +765,10 @@ public class COSParser extends BaseParse
      */
     protected COSBase parseObjectStreamObject(long objstmObjNr, COSObjectKey key) throws IOException
     {
-        Map<Long, COSBase> streamObjects = decompressedObjects.computeIfAbsent(objstmObjNr,
+        Map<COSObjectKey, COSBase> streamObjects = decompressedObjects.computeIfAbsent(objstmObjNr,
                 n -> new HashMap<>());
         // did we already read the compressed object stream?
-        long keyNumber = key.getNumber();
-        COSBase objectStreamObject = streamObjects.remove(keyNumber);
+        COSBase objectStreamObject = streamObjects.remove(key);
         if (objectStreamObject != null)
         {
             return objectStreamObject;
@@ -782,18 +781,10 @@ public class COSParser extends BaseParse
             {
                 PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj,
                         document);
-                for (Entry<Long, COSBase> entry : parser.parseAllObjects().entrySet())
-                {
-                    Long stmObjNumber = entry.getKey();
-                    if (keyNumber == stmObjNumber)
-                    {
-                        objectStreamObject = entry.getValue();
-                    }
-                    else
-                    {
-                        streamObjects.putIfAbsent(stmObjNumber, entry.getValue());
-                    }
-                }
+                Map<COSObjectKey, COSBase> allStreamObjects = parser.parseAllObjects();
+                objectStreamObject = allStreamObjects.remove(key);
+                allStreamObjects.entrySet().stream()
+                        .forEach(e -> streamObjects.putIfAbsent(e.getKey(), e.getValue()));
             }
             catch (IOException ex)
             {

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1906422&r1=1906421&r2=1906422&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Fri Jan  6 13:13:17 2023
@@ -25,6 +25,7 @@ import java.util.TreeMap;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObjectKey;
 import org.apache.pdfbox.cos.COSStream;
 
 /**
@@ -116,19 +117,36 @@ public class PDFObjectStreamParser exten
      * @return a map containing all parsed objects using the object number as key
      * @throws IOException if there is an error while parsing the stream
      */
-    public Map<Long, COSBase> parseAllObjects() throws IOException
+    public Map<COSObjectKey, COSBase> parseAllObjects() throws IOException
     {
-        Map<Long, COSBase> allObjects = new HashMap<>();
+        Map<COSObjectKey, COSBase> allObjects = new HashMap<>();
         try
         {
             Map<Integer, Long> objectNumbers = privateReadObjectOffsets();
+            // count the distinct object numbers, i.e. ignore duplicate entries
+            long numberOfObjNumbers = objectNumbers.values().stream().distinct().count();
+            // the usage of the index should be restricted to cases where more than one
+            // object uses the same object number.
+            // There are malformed PDFs in the wild which would lead to false results if
+            // PDFBox always relied on the index if available. In most cases the object number
+            // is sufficient to choose the correct object
+            boolean indexNeeded = objectNumbers.size() > numberOfObjNumbers;
             long currentPosition = source.getPosition();
             if (firstObject > 0 && currentPosition < firstObject)
             {
                 source.skip(firstObject - (int) currentPosition);
             }
+            int index = 0;
             for (Entry<Integer, Long> entry : objectNumbers.entrySet())
             {
+                COSObjectKey objectKey = getObjectKey(entry.getValue(), 0);
+                // skip object if the index doesn't match
+                if (indexNeeded && objectKey.getStreamIndex() > -1
+                        && objectKey.getStreamIndex() != index)
+                {
+                    index++;
+                    continue;
+                }
                 int finalPosition = firstObject + entry.getKey();
                 currentPosition = source.getPosition();
                 if (finalPosition > 0 && currentPosition < finalPosition)
@@ -141,7 +159,8 @@ public class PDFObjectStreamParser exten
                 {
                     streamObject.setDirect(false);
                 }
-                allObjects.put(entry.getValue(), streamObject);
+                allObjects.put(objectKey, streamObject);
+                index++;
             }
         }
         finally

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java?rev=1906422&r1=1906421&r2=1906422&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java Fri Jan  6 13:13:17 2023
@@ -24,8 +24,10 @@ import java.util.Map;
 
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSBoolean;
+import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSInteger;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObjectKey;
 import org.apache.pdfbox.cos.COSStream;
 import org.junit.jupiter.api.Test;
 
@@ -64,10 +66,89 @@ class PDFObjectStreamParserTest
         outputStream.write("6 0 4 5 true false".getBytes());
         outputStream.close();
         PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, null);
-        Map<Long, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
+        Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
         assertEquals(2, objectNumbers.size());
-        assertEquals(COSBoolean.TRUE, objectNumbers.get(6L));
-        assertEquals(COSBoolean.FALSE, objectNumbers.get(4L));
+        assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0)));
+        assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0)));
+    }
+
+    @Test
+    void testParseAllObjectsIndexed() throws IOException
+    {
+        COSStream stream = new COSStream();
+        stream.setItem(COSName.N, COSInteger.THREE);
+        stream.setItem(COSName.FIRST, COSInteger.get(13));
+        OutputStream outputStream = stream.createOutputStream();
+        // use object number 4 for two objects
+        outputStream.write("6 0 4 5 4 11 true false true".getBytes());
+        outputStream.close();
+        COSDocument cosDoc = new COSDocument();
+        Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable();
+        // select the second object from the stream for object number 4 by using 2 as value for the index
+        xrefTable.put(new COSObjectKey(6, 0, 0), -1L);
+        xrefTable.put(new COSObjectKey(4, 0, 2), -1L);
+        PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, cosDoc);
+        Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
+        assertEquals(2, objectNumbers.size());
+        assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0)));
+        assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(4, 0)));
+
+        // select the first object from the stream for object number 4 by using 1 as value for the index
+        // remove the old entry first to be sure it is replaced
+        xrefTable.remove(new COSObjectKey(4, 0));
+        xrefTable.put(new COSObjectKey(4, 0, 1), -1L);
+        objectStreamParser = new PDFObjectStreamParser(stream, cosDoc);
+        objectNumbers = objectStreamParser.parseAllObjects();
+        assertEquals(2, objectNumbers.size());
+        assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0)));
+        assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0)));
+    }
+
+    @Test
+    void testParseAllObjectsSkipMalformedIndex() throws IOException
+    {
+        COSStream stream = new COSStream();
+        stream.setItem(COSName.N, COSInteger.THREE);
+        stream.setItem(COSName.FIRST, COSInteger.get(13));
+        OutputStream outputStream = stream.createOutputStream();
+        outputStream.write("6 0 4 5 5 11 true false true".getBytes());
+        outputStream.close();
+        COSDocument cosDoc = new COSDocument();
+        Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable();
+        // add an index for each object key which doesn't match with the index of the object stream
+        xrefTable.put(new COSObjectKey(6, 0, 10), -1L);
+        xrefTable.put(new COSObjectKey(4, 0, 11), -1L);
+        xrefTable.put(new COSObjectKey(5, 0, 12), -1L);
+        PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, cosDoc);
+        // the index isn't taken into account as all object numbers of the stream are unique
+        // none of the objects is skipped so that all objects are read and available
+        Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
+        assertEquals(3, objectNumbers.size());
+        assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0)));
+        assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0)));
+        assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(5, 0)));
+    }
+
+    @Test
+    void testParseAllObjectsUseMalformedIndex() throws IOException
+    {
+        COSStream stream = new COSStream();
+        stream.setItem(COSName.N, COSInteger.THREE);
+        stream.setItem(COSName.FIRST, COSInteger.get(13));
+        OutputStream outputStream = stream.createOutputStream();
+        outputStream.write("6 0 4 5 4 11 true false true".getBytes());
+        outputStream.close();
+        COSDocument cosDoc = new COSDocument();
+        Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable();
+        // add an index for each object key which doesn't match with the index of the object stream
+        // add two object keys only as the object stream uses one object number for two objects
+        xrefTable.put(new COSObjectKey(6, 0, 10), -1L);
+        xrefTable.put(new COSObjectKey(4, 0, 11), -1L);
+        PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, cosDoc);
+        // as the object numbers used aren't unique within the object stream, the index of the object keys is used
+        // All objects are dropped as the malformed index values don't match the index of the object within the stream
+        Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects();
+        assertEquals(0, objectNumbers.size());
     }
 
 }