You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2014/09/14 12:58:22 UTC
svn commit: r1624832 - in
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox:
pdfparser/BaseParser.java pdfparser/PDFParser.java pdmodel/PDDocument.java
Author: lehmi
Date: Sun Sep 14 10:58:21 2014
New Revision: 1624832
URL: http://svn.apache.org/r1624832
Log:
PDFBOX-2301: enable the usage of scratch files within the old parser
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1624832&r1=1624831&r2=1624832&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sun Sep 14 10:58:21 2014
@@ -126,7 +126,7 @@ public abstract class BaseParser
/**
* Default value of the {@link #forceParsing} flag.
*/
- protected static final boolean FORCE_PARSING =
+ public static final boolean FORCE_PARSING =
Boolean.getBoolean("org.apache.pdfbox.forceParsing");
/**
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java?rev=1624832&r1=1624831&r2=1624832&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java Sun Sep 14 10:58:21 2014
@@ -76,6 +76,8 @@ public class PDFParser extends BaseParse
*/
private File tempDirectory = null;
+ private final boolean useScratchFile;
+
/**
* Constructor.
*
@@ -99,7 +101,23 @@ public class PDFParser extends BaseParse
*/
public PDFParser(InputStream input, boolean force) throws IOException
{
+ this(input, force, false);
+ }
+
+ /**
+ * Constructor to allow control over the usage of a scratch file.
+ * Also enables the parser to skip corrupt objects to try and force parsing.
+ * @param input The input stream that contains the PDF document.
+ * @param force When true, the parser will skip corrupt pdf objects and
+ * will continue parsing at the next object in the file
+ * @param useScratchFiles enables the usage of a scratch file if set to true
+ *
+ * @throws IOException If there is an error initializing the stream.
+ */
+ public PDFParser(InputStream input, boolean force, boolean useScratchFiles) throws IOException
+ {
super(input, force);
+ useScratchFile = useScratchFiles;
}
/**
@@ -141,11 +159,15 @@ public class PDFParser extends BaseParse
{
if( tempDirectory != null )
{
- document = new COSDocument( tempDirectory, false, true );
+ document = new COSDocument( tempDirectory, forceParsing, true );
+ }
+ else if(useScratchFile)
+ {
+ document = new COSDocument( null, forceParsing, true );
}
else
{
- document = new COSDocument();
+ document = new COSDocument(forceParsing);
}
setDocument( document );
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java?rev=1624832&r1=1624831&r2=1624832&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java Sun Sep 14 10:58:21 2014
@@ -933,6 +933,23 @@ public class PDDocument implements Close
}
/**
+ * This will load a document from a URL. Allows for skipping corrupt pdf objects.
+ *
+ * @param url The url to load the PDF from.
+ * @param force When true, the parser will skip corrupt pdf objects and will continue parsing at the next object in
+ * the file
+ * @param useScratchFiles enables the usage of a scratch file if set to true
+ *
+ * @return The document that was loaded.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ public static PDDocument load(URL url, boolean force, boolean useScratchFiles) throws IOException
+ {
+ return load(url.openStream(), force, useScratchFiles);
+ }
+
+ /**
* This will load a document from a file.
*
* @param filename The name of the file to load.
@@ -963,6 +980,23 @@ public class PDDocument implements Close
}
/**
+ * This will load a document from a file. Allows for skipping corrupt pdf objects
+ *
+ * @param filename The name of the file to load.
+ * @param force When true, the parser will skip corrupt pdf objects and will continue parsing at the next object in
+ * the file
+ * @param useScratchFiles enables the usage of a scratch file if set to true
+ *
+ * @return The document that was loaded.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ public static PDDocument load(String filename, boolean force, boolean useScratchFiles) throws IOException
+ {
+ return load(new File(filename), force, useScratchFiles);
+ }
+
+ /**
* This will load a document from a file.
*
* @param file The name of the file to load.
@@ -973,7 +1007,7 @@ public class PDDocument implements Close
*/
public static PDDocument load(File file) throws IOException
{
- return load(file, false);
+ return load(file, BaseParser.FORCE_PARSING, false);
}
/**
@@ -989,7 +1023,24 @@ public class PDDocument implements Close
*/
public static PDDocument load(File file, boolean force) throws IOException
{
- PDFParser parser = new PDFParser(new FileInputStream(file), force);
+ return load(file, force , false);
+ }
+
+ /**
+ * This will load a document from a file. Allows for skipping corrupt pdf objects
+ *
+ * @param file The file to load.
+ * @param force When true, the parser will skip corrupt pdf objects and will continue parsing at the next object in
+ * the file
+ * @param useScratchFiles enables the usage of a scratch file if set to true
+ *
+ * @return The document that was loaded.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ public static PDDocument load(File file, boolean force, boolean useScratchFiles) throws IOException
+ {
+ PDFParser parser = new PDFParser(new FileInputStream(file), force, useScratchFiles);
parser.parse();
PDDocument doc = parser.getPDDocument();
doc.incrementalFile = file;
@@ -1007,9 +1058,7 @@ public class PDDocument implements Close
*/
public static PDDocument load(InputStream input) throws IOException
{
- PDFParser parser = new PDFParser(input);
- parser.parse();
- return parser.getPDDocument();
+ return load(input, BaseParser.FORCE_PARSING, false);
}
/**
@@ -1025,11 +1074,27 @@ public class PDDocument implements Close
*/
public static PDDocument load(InputStream input, boolean force) throws IOException
{
- PDFParser parser = new PDFParser(input, force);
+ return load(input, force, false);
+ }
+
+ /**
+ * This will load a document from an input stream. Allows for skipping corrupt pdf objects
+ *
+ * @param input The stream that contains the document.
+ * @param force When true, the parser will skip corrupt pdf objects and will continue parsing at the next object in
+ * the file
+ * @param useScratchFiles enables the usage of a scratch file if set to true
+ *
+ * @return The document that was loaded.
+ *
+ * @throws IOException If there is an error reading from the stream.
+ */
+ public static PDDocument load(InputStream input, boolean force, boolean useScratchFiles) throws IOException
+ {
+ PDFParser parser = new PDFParser(input, force, useScratchFiles);
parser.parse();
return parser.getPDDocument();
}
-
/**
* Parses PDF with non sequential parser.
*