You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2017/10/22 17:32:20 UTC

svn commit: r1812937 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Author: lehmi
Date: Sun Oct 22 17:32:20 2017
New Revision: 1812937

URL: http://svn.apache.org/viewvc?rev=1812937&view=rev
Log:
PDFBOX-3957: search for valid trailer entries when rebuilding the trailer

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1812937&r1=1812936&r2=1812937&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Oct 22 17:32:20 2017
@@ -113,6 +113,11 @@ public class COSParser extends BaseParse
     protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
 
     /**
+     * trailer-marker.
+     */
+    private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
+
+    /**
      * ObjStream-marker.
      */
     private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' };
@@ -1609,7 +1614,76 @@ public class COSParser extends BaseParse
         }
         return newValue;
     }
-    
+
+    /**
+     * Brute force search for all trailer markers.
+     * 
+     * @return the [root, info] object key pairs of the trailer dictionaries found; either key
+     *         of a pair may be null, but never both
+     * @throws IOException if something went wrong
+     */
+    private List<COSObjectKey[]> bfSearchForTrailer() throws IOException
+    {
+        List<COSObjectKey[]> trailerDicts = new ArrayList<COSObjectKey[]>();
+        long originOffset = source.getPosition();
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        while (!source.isEOF())
+        {
+            // search for trailer marker
+            if (isString(TRAILER_MARKER))
+            {
+                source.seek(source.getPosition() + TRAILER_MARKER.length);
+                try
+                {
+                    skipSpaces();
+                    COSDictionary trailerDict = parseCOSDictionary();
+                    COSObjectKey[] trailerKeys = new COSObjectKey[2];
+                    if (trailerDict.containsKey(COSName.ROOT))
+                    {
+                        COSBase rootObj = trailerDict.getItem(COSName.ROOT);
+                        if (rootObj instanceof COSObject)
+                        {
+                            long objNumber = ((COSObject) rootObj).getObjectNumber();
+                            int genNumber = ((COSObject) rootObj).getGenerationNumber();
+                            trailerKeys[0] = new COSObjectKey(objNumber, genNumber);
+                        }
+                    }
+                    if (trailerDict.containsKey(COSName.INFO))
+                    {
+                        COSBase infoObj = trailerDict.getItem(COSName.INFO);
+                        // only an indirect reference carries an object key; skip anything else
+                        if (infoObj instanceof COSObject)
+                        {
+                            long objNumber = ((COSObject) infoObj).getObjectNumber();
+                            int genNumber = ((COSObject) infoObj).getGenerationNumber();
+                            trailerKeys[1] = new COSObjectKey(objNumber, genNumber);
+                        }
+                    }
+                    if (trailerKeys[0] != null || trailerKeys[1] != null)
+                    {
+                        trailerDicts.add(trailerKeys);
+                    }
+                }
+                catch (IOException exception)
+                {
+                    // no parseable dictionary behind this marker, resume the scan from here
+                    continue;
+                }
+            }
+            source.read();
+        }
+        source.seek(originOffset);
+        // eliminate double entries; Arrays.equals tolerates a missing (null) root or info key
+        for (int i = trailerDicts.size() - 1; i > 0; i--)
+        {
+            if (java.util.Arrays.equals(trailerDicts.get(0), trailerDicts.get(i)))
+            {
+                trailerDicts.remove(i);
+            }
+        }
+        return trailerDicts;
+    }
+
     /**
      * Brute force search for the last EOF marker.
      * 
@@ -1957,75 +2031,96 @@ public class COSParser extends BaseParse
         xrefTrailerResolver.setStartxref(0);
         trailer = xrefTrailerResolver.getTrailer();
         getDocument().setTrailer(trailer);
-        // search for the different parts of the trailer dictionary
-        for (Entry<COSObjectKey, Long> entry : bfCOSObjectKeyOffsets.entrySet())
+        List<COSObjectKey[]> trailerObjects = bfSearchForTrailer();
+        if (trailerObjects.size() == 1)
         {
-            Long offset = entry.getValue();
-            COSDictionary dictionary = null;
-            // handle compressed objects
-            if (offset < 0)
+            COSObjectKey[] trailerObj = trailerObjects.get(0);
+            COSObjectKey rootKey = trailerObj[0];
+            Long rootOffset = rootKey != null ? bfSearchCOSObjectKeyOffsets.get(rootKey) : null;
+            COSObjectKey infoKey = trailerObj[1];
+            Long infoOffset = infoKey != null ? bfSearchCOSObjectKeyOffsets.get(infoKey) : null;
+            if (rootKey != null && rootOffset != null)
             {
-                COSObject compressedObject = document.getObjectFromPool(entry.getKey());
-                if (compressedObject.getObject() == null)
-                {
-                    parseObjectStream((int) -offset);
-                }
-                COSBase baseObject = compressedObject.getObject();
-                if (baseObject instanceof COSDictionary)
+                COSDictionary rootDict = retrieveCOSDictionary(rootKey, rootOffset);
+                if (rootDict != null && isCatalog(rootDict))
                 {
-                    dictionary = (COSDictionary) baseObject;
+                    trailer.setItem(COSName.ROOT, document.getObjectFromPool(rootKey));
                 }
-                else
+            }
+            if (infoKey != null && infoOffset != null)
+            {
+                COSDictionary infoDict = retrieveCOSDictionary(infoKey, infoOffset);
+                if (infoDict != null && isInfo(infoDict))
                 {
-                    continue;
+                    trailer.setItem(COSName.INFO, document.getObjectFromPool(infoKey));
                 }
             }
-            else
+        }
+        else
+        {
+            // search for the different parts of the trailer dictionary
+            for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet())
             {
-                source.seek(offset);
-                readObjectNumber();
-                readGenerationNumber();
-                readExpectedString(OBJ_MARKER, true);
-                if (source.peek() != '<')
+                COSDictionary dictionary = retrieveCOSDictionary(entry.getKey(), entry.getValue());
+                if (dictionary == null)
                 {
                     continue;
                 }
-                try
+                // document catalog
+                if (isCatalog(dictionary))
                 {
-                    dictionary = parseCOSDictionary();
+                    trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey()));
                 }
-                catch (IOException exception)
+                // info dictionary
+                else if (isInfo(dictionary))
                 {
-                    LOG.debug("Skipped object " + entry.getKey()
-                            + ", either it's corrupt or not a dictionary");
-                    continue;
+                    trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey()));
                 }
+                // encryption dictionary, if existing, is lost
+                // We can't run "Algorithm 2" from PDF specification because of missing ID
             }
-            // document catalog
-            if (isCatalog(dictionary))
+        }
+        trailerWasRebuild = true;
+        return trailer;
+    }
+
+    private COSDictionary retrieveCOSDictionary(COSObjectKey key, Long offset) throws IOException
+    {
+        COSDictionary dictionary = null;
+        // handle compressed objects
+        if (offset < 0)
+        {
+            COSObject compressedObject = document.getObjectFromPool(key);
+            if (compressedObject.getObject() == null)
             {
-                trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey()));
+                parseObjectStream((int) -offset);
             }
-            // info dictionary
-            else if (!dictionary.containsKey(COSName.PARENT)
-                  && !dictionary.containsKey(COSName.A)
-                  && !dictionary.containsKey(COSName.DEST)
-                    && (dictionary.containsKey(COSName.MOD_DATE)
-                            || dictionary.containsKey(COSName.TITLE)
-                            || dictionary.containsKey(COSName.AUTHOR)
-                            || dictionary.containsKey(COSName.SUBJECT)
-                            || dictionary.containsKey(COSName.KEYWORDS)
-                            || dictionary.containsKey(COSName.CREATOR)
-                            || dictionary.containsKey(COSName.PRODUCER)
-                            || dictionary.containsKey(COSName.CREATION_DATE)))
+            COSBase baseObject = compressedObject.getObject();
+            if (baseObject instanceof COSDictionary)
             {
-                trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey()));
+                dictionary = (COSDictionary) baseObject;
             }
-            // encryption dictionary, if existing, is lost
-            // We can't run "Algorithm 2" from PDF specification because of missing ID
         }
-        trailerWasRebuild = true;
-        return trailer;
+        else
+        {
+            source.seek(offset);
+            readObjectNumber();
+            readGenerationNumber();
+            readExpectedString(OBJ_MARKER, true);
+            if (source.peek() != '<')
+            {
+                return null;
+            }
+            try
+            {
+                dictionary = parseCOSDictionary();
+            }
+            catch (IOException exception)
+            {
+                LOG.debug("Skipped object " + key + ", either it's corrupt or not a dictionary");
+            }
+        }
+        return dictionary;
     }
 
     /**
@@ -2091,7 +2186,7 @@ public class COSParser extends BaseParse
      * Tell if the dictionary is a PDF catalog. Override this for an FDF catalog.
      * 
      * @param dictionary
-     * @return
+     * @return true if the given dictionary is a root dictionary
      */
     protected boolean isCatalog(COSDictionary dictionary)
     {
@@ -2099,8 +2194,33 @@ public class COSParser extends BaseParse
     }
 
     /**
-     * This will parse the startxref section from the stream.
-     * The startxref value is ignored.
+     * Tell if the dictionary is an info dictionary.
+     * 
+     * @param dictionary the dictionary to be checked
+     * @return true if the given dictionary is an info dictionary
+     */
+    private boolean isInfo(COSDictionary dictionary)
+    {
+        boolean excluded = dictionary.containsKey(COSName.PARENT)
+                || dictionary.containsKey(COSName.A)
+                || dictionary.containsKey(COSName.DEST);
+        if (excluded)
+        {
+            return false;
+        }
+        // at least one of these entries has to be present
+        return dictionary.containsKey(COSName.MOD_DATE)
+                || dictionary.containsKey(COSName.TITLE)
+                || dictionary.containsKey(COSName.AUTHOR)
+                || dictionary.containsKey(COSName.SUBJECT)
+                || dictionary.containsKey(COSName.KEYWORDS)
+                || dictionary.containsKey(COSName.CREATOR)
+                || dictionary.containsKey(COSName.PRODUCER)
+                || dictionary.containsKey(COSName.CREATION_DATE);
+    }
+
+    /**
+     * This will parse the startxref section from the stream. The startxref value is ignored.
      *
      * @return the startxref value or -1 on parsing error
      * @throws IOException If an IO error occurs.