You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2014/09/14 12:58:22 UTC
svn commit: r1624832 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: pdfparser/BaseParser.java pdfparser/PDFParser.java pdmodel/PDDocument.java

Author: lehmi
Date: Sun Sep 14 10:58:21 2014
New Revision: 1624832

URL: http://svn.apache.org/r1624832
Log:
PDFBOX-2301: enable the usage of scratch files within the old parser

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1624832&r1=1624831&r2=1624832&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Sep 14 10:58:21 2014
@@ -126,7 +126,7 @@ public abstract class BaseParser
     /**
      * Default value of the {@link #forceParsing} flag.
      */
-    protected static final boolean FORCE_PARSING =
+    public static final boolean FORCE_PARSING =
         Boolean.getBoolean("org.apache.pdfbox.forceParsing");
 
     /**

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=1624832&r1=1624831&r2=1624832&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Sun Sep 14 10:58:21 2014
@@ -76,6 +76,8 @@ public class PDFParser extends BaseParse
      */
     private File tempDirectory = null;
 
+    private final boolean useScratchFile;
+
     /**
      * Constructor.
      *
@@ -99,7 +101,23 @@ public class PDFParser extends BaseParse
      */
     public PDFParser(InputStream input, boolean force) throws IOException 
     {
+        this(input, force, false);
+    }
+
+    /**
+     * Constructor that allows control over the usage of scratch files.
+     * Also enables the parser to skip corrupt objects to try and force parsing.
+     * @param input The input stream that contains the PDF document.
+     * @param force When true, the parser will skip corrupt pdf objects and
+     * will continue parsing at the next object in the file
+     * @param useScratchFiles enables the usage of a scratch file if set to true
+     *
+     * @throws IOException If there is an error initializing the stream.
+     */
+    public PDFParser(InputStream input, boolean force, boolean useScratchFiles) throws IOException 
+    {
         super(input, force);
+        useScratchFile = useScratchFiles;
     }
 
     /**
@@ -141,11 +159,15 @@ public class PDFParser extends BaseParse
         {
             if( tempDirectory != null )
             {
-                document = new COSDocument( tempDirectory, false, true );
+                document = new COSDocument( tempDirectory, forceParsing, true );
+            }
+            else if(useScratchFile)
+            {
+                document = new COSDocument( null, forceParsing, true );
             }
             else
             {
-                document = new COSDocument();
+                document = new COSDocument(forceParsing);
             }
             setDocument( document );
 

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java?rev=1624832&r1=1624831&r2=1624832&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java Sun Sep 14 10:58:21 2014
@@ -933,6 +933,23 @@ public class PDDocument implements Close
     }
 
     /**
+     * This will load a document from a URL. Allows for skipping corrupt pdf objects.
+     * 
+     * @param url The url to load the PDF from.
+     * @param force When true, the parser will skip corrupt pdf objects and will continue parsing at the next object in
+     *            the file
+     * @param useScratchFiles enables the usage of a scratch file if set to true
+     * 
+     * @return The document that was loaded.
+     * 
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public static PDDocument load(URL url, boolean force, boolean useScratchFiles) throws IOException
+    {
+        return load(url.openStream(), force, useScratchFiles);
+    }
+
+    /**
      * This will load a document from a file.
      * 
      * @param filename The name of the file to load.
@@ -963,6 +980,23 @@ public class PDDocument implements Close
     }
 
     /**
+     * This will load a document from a file. Allows for skipping corrupt pdf objects
+     * 
+     * @param filename The name of the file to load.
+     * @param force When true, the parser will skip corrupt pdf objects and will continue parsing at the next object in
+     *            the file
+     * @param useScratchFiles enables the usage of a scratch file if set to true
+     * 
+     * @return The document that was loaded.
+     * 
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public static PDDocument load(String filename, boolean force, boolean useScratchFiles) throws IOException
+    {
+        return load(new File(filename), force, useScratchFiles);
+    }
+
+    /**
      * This will load a document from a file.
      * 
      * @param file The name of the file to load.
@@ -973,7 +1007,7 @@ public class PDDocument implements Close
      */
     public static PDDocument load(File file) throws IOException
     {
-        return load(file, false);
+        return load(file, BaseParser.FORCE_PARSING, false);
     }
 
     /**
@@ -989,7 +1023,24 @@ public class PDDocument implements Close
      */
     public static PDDocument load(File file, boolean force) throws IOException
     {
-        PDFParser parser = new PDFParser(new FileInputStream(file), force);
+        return load(file, force , false);
+    }
+
+    /**
+     * This will load a document from a file. Allows for skipping corrupt pdf objects
+     *
+     * @param file The file to load.
+     * @param force When true, the parser will skip corrupt pdf objects and will continue parsing at the next object in
+     *            the file
+     * @param useScratchFiles enables the usage of a scratch file if set to true
+     *
+     * @return The document that was loaded.
+     *
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public static PDDocument load(File file, boolean force, boolean useScratchFiles) throws IOException
+    {
+        PDFParser parser = new PDFParser(new FileInputStream(file), force, useScratchFiles);
         parser.parse();
         PDDocument doc = parser.getPDDocument();
         doc.incrementalFile = file;
@@ -1007,9 +1058,7 @@ public class PDDocument implements Close
      */
     public static PDDocument load(InputStream input) throws IOException
     {
-        PDFParser parser = new PDFParser(input);
-        parser.parse();
-        return parser.getPDDocument();
+        return load(input, BaseParser.FORCE_PARSING, false);
     }
 
     /**
@@ -1025,11 +1074,27 @@ public class PDDocument implements Close
      */
     public static PDDocument load(InputStream input, boolean force) throws IOException
     {
-        PDFParser parser = new PDFParser(input, force);
+        return load(input, force, false);
+    }
+
+    /**
+     * This will load a document from an input stream. Allows for skipping corrupt pdf objects
+     * 
+     * @param input The stream that contains the document.
+     * @param force When true, the parser will skip corrupt pdf objects and will continue parsing at the next object in
+     *            the file
+     * @param useScratchFiles enables the usage of a scratch file if set to true
+     * 
+     * @return The document that was loaded.
+     * 
+     * @throws IOException If there is an error reading from the stream.
+     */
+    public static PDDocument load(InputStream input, boolean force, boolean useScratchFiles) throws IOException
+    {
+        PDFParser parser = new PDFParser(input, force, useScratchFiles);
         parser.parse();
         return parser.getPDDocument();
     }
-
     /**
      * Parses PDF with non sequential parser.
      *