You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ad...@apache.org on 2011/01/20 18:42:18 UTC

svn commit: r1061410 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java

Author: adam
Date: Thu Jan 20 17:42:17 2011
New Revision: 1061410

URL: http://svn.apache.org/viewvc?rev=1061410&view=rev
Log:
PDFBOX-917: Read non-conforming PDFs (attached) without throwing java.io.IOException: expected='endobj' org.apache.pdfbox.io.PushBackInputStream

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=1061410&r1=1061409&r2=1061410&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Thu Jan 20 17:42:17 2011
@@ -17,9 +17,8 @@
 package org.apache.pdfbox.pdfparser;
 
 import java.io.File;
-import java.io.InputStream;
 import java.io.IOException;
-
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -34,11 +33,8 @@ import org.apache.pdfbox.cos.COSInteger;
 import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.exceptions.WrappedIOException;
 import org.apache.pdfbox.io.RandomAccess;
-
 import org.apache.pdfbox.pdmodel.PDDocument;
-
 import org.apache.pdfbox.pdmodel.fdf.FDFDocument;
-
 import org.apache.pdfbox.persistence.util.COSObjectKey;
 
 /**
@@ -125,6 +121,18 @@ public class PDFParser extends BaseParse
     }
 
     /**
+     * Returns true if parsing should be continued. By default, forceParsing is returned. 
+     * This can be overridden to add application specific handling (for example to stop 
+     * parsing when the number of exceptions thrown exceed a certain number).
+     * 
+     * @param e The exception if vailable. Can be null if there is no exception available
+     */
+    protected boolean isContinueOnError(Exception e)
+    {
+        return forceParsing;
+    }
+    
+    /**
      * This will parse the stream and populate the COSDocument object.  This will close
      * the stream when it is done parsing.
      *
@@ -173,7 +181,7 @@ public class PDFParser extends BaseParse
                     }
                     catch(IOException e)
                     {
-                        if(forceParsing)
+                        if(isContinueOnError(e))
                         {
                             /*
                              * Warning is sent to the PDFBox.log and to the Console that
@@ -508,7 +516,11 @@ public class PDFParser extends BaseParse
                 //" genNumber=" + genNum + " key='" + objectKey + "'" );
                 if( !objectKey.equals( "obj" ) )
                 {
-                    throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
+                    if (!isContinueOnError(null) || !objectKey.equals("o")) {
+                        throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
+                    }
+                    //assume that "o" was meant to be "obj" (this is a workaround for 
+                    // PDFBOX-773 attached PDF Andersens_Fairy_Tales.pdf). 
                 }
             }
             else
@@ -577,38 +589,10 @@ public class PDFParser extends BaseParse
                 }
                 else if( !pdfSource.isEOF() )
                 {
-                    try
-                    {
-                        //It is possible that the endobj  is missing, there
-                        //are several PDFs out there that do that so skip it and move on.
-                        Float.parseFloat( endObjectKey );
-                        pdfSource.unread( SPACE_BYTE );
-                        pdfSource.unread( endObjectKey.getBytes() );
-                    }
-                    catch( NumberFormatException e )
-                    {
-                        //we will try again incase there was some garbage which
-                        //some writers will leave behind.
-                        String secondEndObjectKey = readString();
-                        if( !secondEndObjectKey.equals( "endobj" ) )
-                        {
-                            if( isClosing() )
-                            {
-                                //found a case with 17506.pdf object 41 that was like this
-                                //41 0 obj [/Pattern /DeviceGray] ] endobj
-                                //notice the second array close, here we are reading it
-                                //and ignoring and attempting to continue
-                                pdfSource.read();
-                            }
-                            skipSpaces();
-                            String thirdPossibleEndObj = readString();
-                            if( !thirdPossibleEndObj.equals( "endobj" ) )
-                            {
-                                throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
-                                    "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
-                            }
-                        }
-                    }
+                    //It is possible that the endobj is missing, there
+                    //are several PDFs out there that do that so. Unread
+                    //and assume that endobj was missing
+                    pdfSource.unread( endObjectKey.getBytes() );
                 }
             }
             skipSpaces();