You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2017/10/01 10:52:21 UTC

svn commit: r1810261 - /pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Author: lehmi
Date: Sun Oct  1 10:52:20 2017
New Revision: 1810261

URL: http://svn.apache.org/viewvc?rev=1810261&view=rev
Log:
PDFBOX-3936: improve end-of-object detection if the "endobj" keyword is cut off in the middle

Modified:
    pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1810261&r1=1810260&r2=1810261&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Oct  1 10:52:20 2017
@@ -1414,8 +1414,8 @@ public class COSParser extends BaseParse
             int lastGenID = Integer.MIN_VALUE;
             long lastObjOffset = Long.MIN_VALUE;
             char[] objString = " obj".toCharArray();
-            char[] endobjString = "endobj".toCharArray();
-            boolean endobjFound = false;
+            char[] endobjString = "endo".toCharArray();
+            boolean endOfObjFound = false;
             do
             {
                 source.seek(currentOffset);
@@ -1457,20 +1457,23 @@ public class COSParser extends BaseParse
                                 lastGenID = genID;
                                 lastObjOffset = tempOffset + 1;
                                 currentOffset += objString.length - 1;
-                                endobjFound = false;
+                                endOfObjFound = false;
                             }
                         }
                     }
                 }
+                // check for "endo" as abbreviation for "endobj", as the pdf may be cut off
+                // in the middle of the keyword, see PDFBOX-3936.
+                // We could possibly implement a more intelligent algorithm if necessary
                 else if (isString(endobjString))
                 {
-                    endobjFound = true;
+                    endOfObjFound = true;
                     currentOffset += endobjString.length - 1;
                 }
                 currentOffset++;
             }
             while (currentOffset < lastEOFMarker && !source.isEOF());
-            if ((lastEOFMarker < Long.MAX_VALUE || endobjFound) && lastObjOffset > 0)
+            if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0)
             {
                 // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
                 // the last object id has to be added here so that it can't get lost as there isn't any subsequent