You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ad...@apache.org on 2010/08/25 01:11:01 UTC

svn commit: r988757 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java

Author: adam
Date: Tue Aug 24 23:11:01 2010
New Revision: 988757

URL: http://svn.apache.org/viewvc?rev=988757&view=rev
Log:
PDFBOX-803: Improved handling of erroneous data between endstream and endobj lines

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=988757&r1=988756&r2=988757&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Tue Aug 24 23:11:01 2010
@@ -543,7 +543,8 @@ public class PDFParser extends BaseParse
                     // the combination of a dict and the stream/endstream forms a complete stream object
                     throw new IOException("stream not preceded by dictionary");
                 }
-                endObjectKey = readString();
+                skipSpaces();
+                endObjectKey = readLine();
             }
             
             COSObjectKey key = new COSObjectKey( number, genNum );
@@ -563,18 +564,27 @@ public class PDFParser extends BaseParse
             
             if( !endObjectKey.equals( "endobj" ) )
             {
-                               if (endObjectKey.startsWith( "endobj" ) ) 
-                               {
-                                       /*
-                                         * Some PDF files don't contain a new line after endobj so we 
-                                         * need to make sure that the next object number is getting read separately
-                                         * and not part of the endobj keyword. Ex. Some files would have "endobj28"
-                                         * instead of "endobj"
-                                         */
-                                        pdfSource.unread( endObjectKey.substring( 6 ).getBytes() );
-                                    } 
-                                    else if( !pdfSource.isEOF() )                
-                                    {
+                if (endObjectKey.startsWith( "endobj" ) ) 
+                {
+                    /*
+                     * Some PDF files don't contain a new line after endobj so we 
+                     * need to make sure that the next object number is getting read separately
+                     * and not part of the endobj keyword. Ex. Some files would have "endobj28"
+                     * instead of "endobj"
+                     */
+                    pdfSource.unread( endObjectKey.substring( 6 ).getBytes() );
+                } 
+                else if(endObjectKey.trim().endsWith("endobj"))
+                {
+                    /*
+                     * Some PDF files contain junk (like ">> ", in the case of a PDF
+                     * I found which was created by Exstream Dialogue Version 5.0.039)
+                     * in which case we ignore the data before endobj and just move on
+                     */
+                    log.warn("expected='endobj' actual='" + endObjectKey + "' ");
+                }
+                else if( !pdfSource.isEOF() )
+                {
                     try
                     {
                         //It is possible that the endobj  is missing, there