You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2014/09/13 19:51:50 UTC
svn commit: r1624771 - in /pdfbox/trunk/pdfbox/src:
main/java/org/apache/pdfbox/cos/ main/java/org/apache/pdfbox/pdfparser/
main/java/org/apache/pdfbox/pdmodel/ test/java/org/apache/pdfbox/pdfparser/
Author: lehmi
Date: Sat Sep 13 17:51:50 2014
New Revision: 1624771
URL: http://svn.apache.org/r1624771
Log:
PDFBOX-2301: added the optional usage of scratch files to the nonsequential parser
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java Sat Sep 13 17:51:50 2014
@@ -28,6 +28,7 @@ import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.NonSequentialPDFParser;
import org.apache.pdfbox.pdfparser.PDFObjectStreamParser;
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.SignatureInterface;
@@ -86,6 +87,10 @@ public class COSDocument extends COSBase
private boolean isXRefStream;
+ private final File scratchDirectory;
+
+ private final boolean useScratchFiles;
+
/**
* Flag to skip malformed or otherwise unparseable input where possible.
*/
@@ -103,7 +108,22 @@ public class COSDocument extends COSBase
*/
public COSDocument(boolean forceParsingValue)
{
- forceParsing = forceParsingValue;
+ this(null, forceParsingValue, false);
+ }
+
+ /**
+ * Constructor that uses the system default temporary directory and lets
+ * the caller decide whether newly created PDF streams should be backed
+ * by scratch files.
+ *
+ * @param forceParsingValue flag to skip malformed or otherwise unparseable
+ * document content where possible
+ * @param useScratchFiles <code>true</code> to request temporary scratch
+ * files for stream storage
+ */
+ public COSDocument(boolean forceParsingValue, boolean useScratchFiles)
+ {
+ this(null, forceParsingValue, useScratchFiles);
}
/**
@@ -115,11 +135,27 @@ public class COSDocument extends COSBase
* or <code>null</code> to use the system default
* @param forceParsingValue flag to skip malformed or otherwise unparseable
* document content where possible
- * @throws IOException if something went wrong
*/
- public COSDocument(File scratchDir, boolean forceParsingValue) throws IOException
+ public COSDocument(File scratchDir, boolean forceParsingValue)
+ {
+ this(scratchDir, forceParsingValue, false);
+ }
+
+ /**
+ * Constructor that can use temporary files in the given directory for
+ * storage of the PDF streams. Each temporary file is registered for
+ * deletion on JVM exit when it is created.
+ * @param scratchDir directory for the temporary files,
+ * or <code>null</code> to use the system default
+ * @param forceParsingValue flag to skip malformed or otherwise unparseable
+ * document content where possible
+ * @param useScratchFiles <code>true</code> to store streams in scratch files
+ */
+ public COSDocument(File scratchDir, boolean forceParsingValue, boolean useScratchFiles)
{
forceParsing = forceParsingValue;
+ scratchDirectory = scratchDir;
+ this.useScratchFiles = useScratchFiles;
}
/**
@@ -127,7 +163,7 @@ public class COSDocument extends COSBase
*/
public COSDocument()
{
- this(false);
+ this(false, false);
}
/**
@@ -140,7 +176,7 @@ public class COSDocument extends COSBase
*/
public COSDocument(File scratchDir) throws IOException
{
- this(scratchDir, false);
+ this(scratchDir, false, false);
}
/**
@@ -150,7 +186,19 @@ public class COSDocument extends COSBase
*/
public COSStream createCOSStream()
{
- return new COSStream( );
+ RandomAccessFile scratchFile = null;
+ if (useScratchFiles)
+ {
+ scratchFile = createScratchFile();
+ }
+ if (scratchFile != null)
+ {
+ return new COSStream( scratchFile );
+ }
+ else
+ {
+ return new COSStream( );
+ }
}
/**
@@ -162,9 +210,36 @@ public class COSDocument extends COSBase
*/
public COSStream createCOSStream(COSDictionary dictionary)
{
- return new COSStream( dictionary );
+ RandomAccessFile scratchFile = null;
+ if (useScratchFiles)
+ {
+ scratchFile = createScratchFile();
+ }
+ if (scratchFile != null)
+ {
+ return new COSStream( dictionary, scratchFile );
+ }
+ else
+ {
+ return new COSStream( dictionary );
+ }
}
+ private RandomAccessFile createScratchFile()
+ {
+ RandomAccessFile buffer = null;
+ try
+ {
+ File scratchFile = File.createTempFile("PDFBox", null, scratchDirectory);
+ scratchFile.deleteOnExit(); // requests deletion on normal JVM termination
+ buffer = new RandomAccessFile(scratchFile, "rw");
+ }
+ catch (IOException exception)
+ {
+ LOG.error("Can't create temp file, using memory buffer instead", exception);
+ }
+ return buffer; // null tells the callers to create the stream without a scratch file
+ }
/**
* This will get the first dictionary object by type.
*
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java Sat Sep 13 17:51:50 2014
@@ -89,10 +89,11 @@ public class COSStream extends COSDictio
* Constructor. Creates a new stream with an empty dictionary.
*
*/
- protected COSStream( RandomAccessBuffer randomBuffer )
+ protected COSStream( RandomAccess randomBuffer )
{
super();
buffer = randomBuffer;
+
}
/**
@@ -101,7 +102,7 @@ public class COSStream extends COSDictio
* @param dictionary The dictionary that is associated with this stream.
*
*/
- protected COSStream( COSDictionary dictionary, RandomAccessBuffer randomBuffer )
+ protected COSStream( COSDictionary dictionary, RandomAccess randomBuffer )
{
super( dictionary );
buffer = randomBuffer;
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sat Sep 13 17:51:50 2014
@@ -203,6 +203,24 @@ public abstract class BaseParser
}
/**
+ * Returns a new instance of a COSStream created by the current document.
+ *
+ * @param dictionary the dictionary belonging to the stream
+ * @return the new COSStream, or null if no document has been set
+ */
+ protected final COSStream createCOSStream(COSDictionary dictionary)
+ {
+ if (document != null)
+ {
+ return document.createCOSStream(dictionary);
+ }
+ else
+ {
+ return null;
+ }
+ }
+
+ /**
* Set the document for this stream.
*
* @param doc The current document.
@@ -410,7 +428,7 @@ public abstract class BaseParser
*/
protected COSStream parseCOSStream( COSDictionary dic ) throws IOException
{
- COSStream stream = new COSStream( dic );
+ COSStream stream = createCOSStream( dic );
OutputStream out = null;
try
{
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java Sat Sep 13 17:51:50 2014
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
+
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
@@ -31,6 +32,7 @@ import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.cos.COSUnread;
import org.apache.pdfbox.io.RandomAccess;
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sat Sep 13 17:51:50 2014
@@ -152,7 +152,19 @@ public class NonSequentialPDFParser exte
*/
public NonSequentialPDFParser(String filename) throws IOException
{
- this(new File(filename), null);
+ this(new File(filename), null, false);
+ }
+
+ /**
+ * Constructs parser for given file, optionally using scratch files.
+ *
+ * @param filename the filename of the pdf to be parsed
+ * @param useScratchFiles flag handed to the COSDocument created for parsing
+ * @throws IOException If something went wrong.
+ */
+ public NonSequentialPDFParser(String filename, boolean useScratchFiles) throws IOException
+ {
+ this(new File(filename), null, useScratchFiles);
}
/**
@@ -163,47 +175,56 @@ public class NonSequentialPDFParser exte
*
* @throws IOException If something went wrong.
*/
+ public NonSequentialPDFParser(File file) throws IOException
+ {
+ this(file, "", false);
+ }
+
/**
* Constructs parser for given file using given buffer for temporary
* storage.
*
* @param file the pdf to be parsed
- * @param raBuf the buffer to be used for parsing
*
* @throws IOException If something went wrong.
*/
- public NonSequentialPDFParser(File file) throws IOException
+ public NonSequentialPDFParser(File file, boolean useScratchFiles) throws IOException
{
- this(file, "");
+ this(file, "", useScratchFiles);
}
/**
* Constructs parser for given file using given buffer for temporary storage.
*
* @param file the pdf to be parsed
- * @param raBuf the buffer to be used for parsing
+ * @param decryptionPassword password to be used for decryption
*
* @throws IOException If something went wrong.
*/
+ public NonSequentialPDFParser(File file, String decryptionPassword)
+ throws IOException
+ {
+ this (file, decryptionPassword, false);
+ }
+
/**
* Constructs parser for given file using given buffer for temporary storage.
*
* @param file the pdf to be parsed
- * @param raBuf the buffer to be used for parsing
* @param decryptionPassword password to be used for decryption
*
* @throws IOException If something went wrong.
*/
- public NonSequentialPDFParser(File file, String decryptionPassword)
+ public NonSequentialPDFParser(File file, String decryptionPassword, boolean useScratchFiles)
throws IOException
{
super(EMPTY_INPUT_STREAM, false);
pdfFile = file;
raStream = new RandomAccessBufferedFileInputStream(pdfFile);
- init(file, decryptionPassword);
+ init(file, decryptionPassword, useScratchFiles);
}
- private void init(File file, String decryptionPassword) throws IOException
+ private void init(File file, String decryptionPassword, boolean useScratchFiles) throws IOException
{
String eofLookupRangeStr = System.getProperty(SYSPROP_EOFLOOKUPRANGE);
if (eofLookupRangeStr != null)
@@ -218,11 +239,8 @@ public class NonSequentialPDFParser exte
+ " does not contain an integer value, but: '" + eofLookupRangeStr + "'");
}
}
-
- setDocument(new COSDocument(false));
-
+ setDocument(new COSDocument(false, useScratchFiles));
pdfSource = new PushBackInputStream(raStream, 4096);
-
password = decryptionPassword;
}
@@ -234,7 +252,18 @@ public class NonSequentialPDFParser exte
*/
public NonSequentialPDFParser(InputStream input) throws IOException
{
- this(input, "");
+ this(input, "", false);
+ }
+
+ /**
+ * Constructor that parses the given stream with an empty password.
+ * @param input input stream representing the pdf.
+ * @param useScratchFiles flag handed to the COSDocument created for parsing
+ * @throws IOException If something went wrong.
+ */
+ public NonSequentialPDFParser(InputStream input, boolean useScratchFiles) throws IOException
+ {
+ this(input, "", useScratchFiles);
}
/**
@@ -247,10 +276,23 @@ public class NonSequentialPDFParser exte
public NonSequentialPDFParser(InputStream input, String decryptionPassword)
throws IOException
{
+ this(input, decryptionPassword, false);
+ }
+
+ /**
+ * Constructor.
+ * @param input input stream representing the pdf.
+ * @param decryptionPassword password to be used for decryption.
+ * @param useScratchFiles flag handed to the COSDocument created for parsing
+ * @throws IOException If something went wrong.
+ */
+ public NonSequentialPDFParser(InputStream input, String decryptionPassword, boolean useScratchFiles)
+ throws IOException
+ {
super(EMPTY_INPUT_STREAM, false);
pdfFile = createTmpFile(input);
raStream = new RandomAccessBufferedFileInputStream(pdfFile);
- init(pdfFile, decryptionPassword);
+ init(pdfFile, decryptionPassword, useScratchFiles);
}
/**
@@ -1478,7 +1520,7 @@ public class NonSequentialPDFParser exte
@Override
protected COSStream parseCOSStream(COSDictionary dic) throws IOException
{
- final COSStream stream = new COSStream(dic);
+ final COSStream stream = createCOSStream(dic);
OutputStream out = null;
try
{
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java Sat Sep 13 17:51:50 2014
@@ -1041,7 +1041,21 @@ public class PDDocument implements Close
*/
public static PDDocument loadNonSeq(File file) throws IOException
{
- return loadNonSeq(file, "");
+ return loadNonSeq(file, "", false);
+ }
+
+ /**
+ * Parses PDF with non sequential parser.
+ *
+ * @param file file to be loaded
+ * @param useScratchFiles flag passed on to the non sequential parser
+ * @return loaded document
+ *
+ * @throws IOException in case of a file reading or parsing error
+ */
+ public static PDDocument loadNonSeq(File file, boolean useScratchFiles) throws IOException
+ {
+ return loadNonSeq(file, "", useScratchFiles);
}
/**
@@ -1056,7 +1070,22 @@ public class PDDocument implements Close
*/
public static PDDocument loadNonSeq(File file, String password) throws IOException
{
- NonSequentialPDFParser parser = new NonSequentialPDFParser(file, password);
+ return loadNonSeq(file, password, false);
+ }
+
+ /**
+ * Parses PDF with non sequential parser.
+ *
+ * @param file file to be loaded
+ * @param password password to be used for decryption
+ * @param useScratchFiles flag passed on to the non sequential parser
+ * @return loaded document
+ *
+ * @throws IOException in case of a file reading or parsing error
+ */
+ public static PDDocument loadNonSeq(File file, String password, boolean useScratchFiles) throws IOException
+ {
+ NonSequentialPDFParser parser = new NonSequentialPDFParser(file, password, useScratchFiles);
parser.parse();
return parser.getPDDocument();
}
@@ -1072,7 +1101,21 @@ public class PDDocument implements Close
*/
public static PDDocument loadNonSeq(InputStream input) throws IOException
{
- return loadNonSeq(input, "");
+ return loadNonSeq(input, "", false);
+ }
+
+ /**
+ * Parses PDF with non sequential parser.
+ *
+ * @param input stream that contains the document.
+ * @param useScratchFiles flag passed on to the non sequential parser
+ * @return loaded document
+ *
+ * @throws IOException in case of a file reading or parsing error
+ */
+ public static PDDocument loadNonSeq(InputStream input, boolean useScratchFiles) throws IOException
+ {
+ return loadNonSeq(input, "", useScratchFiles);
}
/**
@@ -1088,7 +1131,23 @@ public class PDDocument implements Close
public static PDDocument loadNonSeq(InputStream input, String password)
throws IOException
{
- NonSequentialPDFParser parser = new NonSequentialPDFParser(input, password);
+ return loadNonSeq(input, password, false);
+ }
+
+ /**
+ * Parses PDF with non sequential parser.
+ *
+ * @param input stream that contains the document.
+ * @param password password to be used for decryption
+ * @param useScratchFiles flag passed on to the non sequential parser
+ * @return loaded document
+ *
+ * @throws IOException in case of a file reading or parsing error
+ */
+ public static PDDocument loadNonSeq(InputStream input, String password, boolean useScratchFiles)
+ throws IOException
+ {
+ NonSequentialPDFParser parser = new NonSequentialPDFParser(input, password, useScratchFiles);
parser.parse();
return parser.getPDDocument();
}
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java Sat Sep 13 17:51:50 2014
@@ -74,7 +74,7 @@ public class TestNonSequentialPDFParser
}
@Test
- public void testNonSequentialPDFParserFileRandomAccess() throws IOException {
+ public void testNonSequentialPDFParserFile() throws IOException {
NonSequentialPDFParser nsp = new NonSequentialPDFParser(new File(PATH_OF_PDF));
executeParserTest(nsp);
}
@@ -85,6 +85,23 @@ public class TestNonSequentialPDFParser
executeParserTest(nsp);
}
+ @Test
+ public void testNonSequentialPDFParserStringScratchFile() throws Exception {
+ NonSequentialPDFParser nsp = new NonSequentialPDFParser(PATH_OF_PDF, true);
+ executeParserTest(nsp);
+ }
+
+ @Test
+ public void testNonSequentialPDFParserFileScratchFile() throws IOException {
+ NonSequentialPDFParser nsp = new NonSequentialPDFParser(new File(PATH_OF_PDF), true);
+ executeParserTest(nsp);
+ }
+
+ @Test
+ public void testNonSequentialPDFParserInputStreamScratchFile() throws IOException {
+ NonSequentialPDFParser nsp = new NonSequentialPDFParser(new FileInputStream(PATH_OF_PDF), true);
+ executeParserTest(nsp);
+ }
private void executeParserTest(NonSequentialPDFParser nsp) throws IOException {
nsp.parse();