You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2014/09/13 19:51:50 UTC

svn commit: r1624771 - in /pdfbox/trunk/pdfbox/src: main/java/org/apache/pdfbox/cos/ main/java/org/apache/pdfbox/pdfparser/ main/java/org/apache/pdfbox/pdmodel/ test/java/org/apache/pdfbox/pdfparser/

Author: lehmi
Date: Sat Sep 13 17:51:50 2014
New Revision: 1624771

URL: http://svn.apache.org/r1624771
Log:
PDFBOX-2301: added the optional usage of scratch files to the nonsequential parser

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSDocument.java Sat Sep 13 17:51:50 2014
@@ -28,6 +28,7 @@ import java.util.Map;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.pdfbox.io.RandomAccessFile;
 import org.apache.pdfbox.pdfparser.NonSequentialPDFParser;
 import org.apache.pdfbox.pdfparser.PDFObjectStreamParser;
 import org.apache.pdfbox.pdmodel.interactive.digitalsignature.SignatureInterface;
@@ -86,6 +87,10 @@ public class COSDocument extends COSBase
 
     private boolean isXRefStream;
     
+    private final File scratchDirectory;
+    
+    private final boolean useScratchFiles;
+    
     /**
      * Flag to skip malformed or otherwise unparseable input where possible.
      */
@@ -103,7 +108,22 @@ public class COSDocument extends COSBase
      */
     public COSDocument(boolean forceParsingValue) 
     {
-        forceParsing = forceParsingValue;
+        this(null, forceParsingValue, false);
+    }
+
+    /**
+     * Constructor that optionally stores the PDF streams in temporary
+     * scratch files instead of memory buffers. The scratch files are
+     * created in the system default temporary directory and are marked
+     * for deletion when the JVM exits.
+     *
+     * @param forceParsingValue flag to skip malformed or otherwise unparseable
+     *                     document content where possible
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     */
+    public COSDocument(boolean forceParsingValue, boolean useScratchFiles) 
+    {
+        this(null, forceParsingValue, useScratchFiles);
     }
 
     /**
@@ -115,11 +135,27 @@ public class COSDocument extends COSBase
      *                   or <code>null</code> to use the system default
      * @param forceParsingValue flag to skip malformed or otherwise unparseable
      *                     document content where possible
-     * @throws IOException if something went wrong
      */
-    public COSDocument(File scratchDir, boolean forceParsingValue) throws IOException 
+    public COSDocument(File scratchDir, boolean forceParsingValue) 
+    {
+        this(scratchDir, forceParsingValue, false);
+    }
+
+    /**
+     * Constructor that optionally stores the PDF streams in temporary scratch
+     * files in the given directory, marked for deletion when the JVM exits.
+     *
+     * @param scratchDir directory for the temporary files,
+     *                   or <code>null</code> to use the system default
+     * @param forceParsingValue flag to skip malformed or otherwise unparseable
+     *                     document content where possible
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     */
+    public COSDocument(File scratchDir, boolean forceParsingValue, boolean useScratchFiles) 
     {
         forceParsing = forceParsingValue;
+        scratchDirectory = scratchDir;
+        this.useScratchFiles = useScratchFiles;
     }
 
     /**
@@ -127,7 +163,7 @@ public class COSDocument extends COSBase
      */
     public COSDocument()
     {
-        this(false);
+        this(false, false);
     }
 
     /**
@@ -140,7 +176,7 @@ public class COSDocument extends COSBase
      */
     public COSDocument(File scratchDir) throws IOException 
     {
-        this(scratchDir, false);
+        this(scratchDir, false, false);
     }
 
     /**
@@ -150,7 +186,19 @@ public class COSDocument extends COSBase
      */
     public COSStream createCOSStream()
     {
-        return new COSStream( );
+        RandomAccessFile scratchFile = null;
+        if (useScratchFiles)
+        {
+            scratchFile = createScratchFile();
+        }
+        if (scratchFile != null)
+        {
+            return new COSStream( scratchFile );
+        }
+        else
+        {
+            return new COSStream( );
+        }
     }
 
     /**
@@ -162,9 +210,36 @@ public class COSDocument extends COSBase
      */
     public COSStream createCOSStream(COSDictionary dictionary)
     {
-        return new COSStream( dictionary );
+        RandomAccessFile scratchFile = null;
+        if (useScratchFiles)
+        {
+            scratchFile = createScratchFile();
+        }
+        if (scratchFile != null)
+        {
+            return new COSStream( dictionary, scratchFile );
+        }
+        else
+        {
+            return new COSStream( dictionary );
+        }
     }
 
+    private RandomAccessFile createScratchFile()
+    {
+        RandomAccessFile buffer = null;
+        try 
+        {
+            File scratchFile = File.createTempFile("PDFBox", null, scratchDirectory);
+            scratchFile.deleteOnExit();
+            buffer = new RandomAccessFile(scratchFile, "rw");
+        }
+        catch (IOException exception)
+        {
+            LOG.error("Can't create temp file, using memory buffer instead", exception);
+        }
+        return buffer;
+    }
     /**
      * This will get the first dictionary object by type.
      *

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/cos/COSStream.java Sat Sep 13 17:51:50 2014
@@ -89,10 +89,11 @@ public class COSStream extends COSDictio
      * Constructor.  Creates a new stream with an empty dictionary.
      *
      */
-    protected COSStream( RandomAccessBuffer randomBuffer )
+    protected COSStream( RandomAccess randomBuffer )
     {
         super();
         buffer = randomBuffer;
+        
     }
 
     /**
@@ -101,7 +102,7 @@ public class COSStream extends COSDictio
      * @param dictionary The dictionary that is associated with this stream.
      * 
      */
-    protected COSStream( COSDictionary dictionary, RandomAccessBuffer randomBuffer  )
+    protected COSStream( COSDictionary dictionary, RandomAccess randomBuffer  )
     {
         super( dictionary );
         buffer = randomBuffer;

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Sat Sep 13 17:51:50 2014
@@ -203,6 +203,24 @@ public abstract class BaseParser
     }
 
     /**
+     * Returns a new COSStream created by the current document.
+     * 
+     * @param dictionary the dictionary belonging to the stream
+     * @return the new COSStream, or <code>null</code> if no document has been set
+     */
+    protected final COSStream createCOSStream(COSDictionary dictionary)
+    {
+        if (document != null)
+        {
+            return document.createCOSStream(dictionary);
+        }
+        else
+        {
+            return null;
+        }
+    }
+    
+    /**
      * Set the document for this stream.
      *
      * @param doc The current document.
@@ -410,7 +428,7 @@ public abstract class BaseParser
      */
     protected COSStream parseCOSStream( COSDictionary dic ) throws IOException
     {
-        COSStream stream = new COSStream( dic );
+        COSStream stream = createCOSStream( dic );
         OutputStream out = null;
         try
         {

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ConformingPDFParser.java Sat Sep 13 17:51:50 2014
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
+
 import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
@@ -31,6 +32,7 @@ import org.apache.pdfbox.cos.COSInteger;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSNumber;
 import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.cos.COSUnread;
 import org.apache.pdfbox.io.RandomAccess;

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sat Sep 13 17:51:50 2014
@@ -152,7 +152,19 @@ public class NonSequentialPDFParser exte
      */
     public NonSequentialPDFParser(String filename) throws IOException
     {
-        this(new File(filename), null);
+        this(new File(filename), null, false);
+    }
+
+    /**
+     * Constructs parser for given file, optionally using scratch files.
+     * 
+     * @param filename the filename of the pdf to be parsed
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     * @throws IOException If something went wrong.
+     */
+    public NonSequentialPDFParser(String filename, boolean useScratchFiles) throws IOException
+    {
+        this(new File(filename), null, useScratchFiles);
     }
 
     /**
@@ -163,47 +175,56 @@ public class NonSequentialPDFParser exte
      * 
      * @throws IOException If something went wrong.
      */
+    public NonSequentialPDFParser(File file) throws IOException
+    {
+        this(file, "", false);
+    }
+
     /**
      * Constructs parser for given file using given buffer for temporary
      * storage.
      * 
      * @param file the pdf to be parsed
-     * @param raBuf the buffer to be used for parsing
      * 
      * @throws IOException If something went wrong.
      */
-    public NonSequentialPDFParser(File file) throws IOException
+    public NonSequentialPDFParser(File file, boolean useScratchFiles) throws IOException
     {
-        this(file,  "");
+        this(file, "", useScratchFiles);
     }
 
     /**
      * Constructs parser for given file using given buffer for temporary storage.
      * 
      * @param file the pdf to be parsed
-     * @param raBuf the buffer to be used for parsing
+     * @param decryptionPassword password to be used for decryption
      * 
      * @throws IOException If something went wrong.
      */
+    public NonSequentialPDFParser(File file, String decryptionPassword)
+            throws IOException
+    {
+        this (file, decryptionPassword, false);
+    }
+
     /**
      * Constructs parser for given file using given buffer for temporary storage.
      * 
      * @param file the pdf to be parsed
-     * @param raBuf the buffer to be used for parsing
      * @param decryptionPassword password to be used for decryption
      * 
      * @throws IOException If something went wrong.
      */
-    public NonSequentialPDFParser(File file, String decryptionPassword)
+    public NonSequentialPDFParser(File file, String decryptionPassword, boolean useScratchFiles)
             throws IOException
     {
         super(EMPTY_INPUT_STREAM, false);
         pdfFile = file;
         raStream = new RandomAccessBufferedFileInputStream(pdfFile);
-        init(file, decryptionPassword);
+        init(file, decryptionPassword, useScratchFiles);
     }
 
-    private void init(File file, String decryptionPassword) throws IOException
+    private void init(File file, String decryptionPassword, boolean useScratchFiles) throws IOException
     {
         String eofLookupRangeStr = System.getProperty(SYSPROP_EOFLOOKUPRANGE);
         if (eofLookupRangeStr != null)
@@ -218,11 +239,8 @@ public class NonSequentialPDFParser exte
                         + " does not contain an integer value, but: '" + eofLookupRangeStr + "'");
             }
         }
-
-        setDocument(new COSDocument(false));
-
+        setDocument(new COSDocument(false, useScratchFiles));
         pdfSource = new PushBackInputStream(raStream, 4096);
-
         password = decryptionPassword;
     }
 
@@ -234,7 +252,18 @@ public class NonSequentialPDFParser exte
      */
     public NonSequentialPDFParser(InputStream input) throws IOException
     {
-        this(input, "");
+        this(input, "", false);
+    }
+
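+    /**
+     * Constructor, optionally using scratch files for stream storage.
+     * @param input input stream representing the pdf.
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     * @throws IOException If something went wrong.
+     */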
+    public NonSequentialPDFParser(InputStream input, boolean useScratchFiles) throws IOException
+    {
+        this(input, "", useScratchFiles);
     }
 
     /**
@@ -247,10 +276,23 @@ public class NonSequentialPDFParser exte
     public NonSequentialPDFParser(InputStream input, String decryptionPassword)
             throws IOException
     {
+        this(input, decryptionPassword, false);
+    }
+
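+    /**
+     * Constructor, optionally using scratch files for stream storage.
+     * @param input input stream representing the pdf.
+     * @param decryptionPassword password to be used for decryption.
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     * @throws IOException If something went wrong.
+     */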
+    public NonSequentialPDFParser(InputStream input, String decryptionPassword, boolean useScratchFiles)
+            throws IOException
+    {
         super(EMPTY_INPUT_STREAM, false);
         pdfFile = createTmpFile(input);
         raStream = new RandomAccessBufferedFileInputStream(pdfFile);
-        init(pdfFile, decryptionPassword);
+        init(pdfFile, decryptionPassword, useScratchFiles);
     }
 
     /**
@@ -1478,7 +1520,7 @@ public class NonSequentialPDFParser exte
     @Override
     protected COSStream parseCOSStream(COSDictionary dic) throws IOException
     {
-        final COSStream stream = new COSStream(dic);
+        final COSStream stream = createCOSStream(dic);
         OutputStream out = null;
         try
         {

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDDocument.java Sat Sep 13 17:51:50 2014
@@ -1041,7 +1041,21 @@ public class PDDocument implements Close
      */
     public static PDDocument loadNonSeq(File file) throws IOException
     {
-        return loadNonSeq(file, "");
+        return loadNonSeq(file, "", false);
+    }
+
+    /**
+     * Parses PDF with non sequential parser.
+     * 
+     * @param file file to be loaded
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     * @return loaded document
+     * 
+     * @throws IOException in case of a file reading or parsing error
+     */
+    public static PDDocument loadNonSeq(File file, boolean useScratchFiles) throws IOException
+    {
+        return loadNonSeq(file, "", useScratchFiles);
     }
 
     /**
@@ -1056,7 +1070,22 @@ public class PDDocument implements Close
      */
     public static PDDocument loadNonSeq(File file, String password) throws IOException
     {
-        NonSequentialPDFParser parser = new NonSequentialPDFParser(file, password);
+        return loadNonSeq(file, password, false);
+    }
+
+    /**
+     * Parses PDF with non sequential parser.
+     * 
+     * @param file file to be loaded
+     * @param password password to be used for decryption
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     * @return loaded document
+     * 
+     * @throws IOException in case of a file reading or parsing error
+     */
+    public static PDDocument loadNonSeq(File file, String password, boolean useScratchFiles) throws IOException
+    {
+        NonSequentialPDFParser parser = new NonSequentialPDFParser(file, password, useScratchFiles);
         parser.parse();
         return parser.getPDDocument();
     }
@@ -1072,7 +1101,21 @@ public class PDDocument implements Close
      */
     public static PDDocument loadNonSeq(InputStream input) throws IOException
     {
-        return loadNonSeq(input, "");
+        return loadNonSeq(input, "", false);
+    }
+
+    /**
+     * Parses PDF with non sequential parser.
+     * 
+     * @param input stream that contains the document.
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     * @return loaded document
+     * 
+     * @throws IOException in case of a file reading or parsing error
+     */
+    public static PDDocument loadNonSeq(InputStream input, boolean useScratchFiles) throws IOException
+    {
+        return loadNonSeq(input, "", useScratchFiles);
     }
 
     /**
@@ -1088,7 +1131,23 @@ public class PDDocument implements Close
     public static PDDocument loadNonSeq(InputStream input, String password)
             throws IOException
     {
-        NonSequentialPDFParser parser = new NonSequentialPDFParser(input, password);
+        return loadNonSeq(input, password, false);
+    }
+
+    /**
+     * Parses PDF with non sequential parser.
+     * 
+     * @param input stream that contains the document.
+     * @param password password to be used for decryption
+     * @param useScratchFiles if true, scratch files are used for stream storage
+     * @return loaded document
+     * 
+     * @throws IOException in case of a file reading or parsing error
+     */
+    public static PDDocument loadNonSeq(InputStream input, String password, boolean useScratchFiles)
+            throws IOException
+    {
+        NonSequentialPDFParser parser = new NonSequentialPDFParser(input, password, useScratchFiles);
         parser.parse();
         return parser.getPDDocument();
     }

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java?rev=1624771&r1=1624770&r2=1624771&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestNonSequentialPDFParser.java Sat Sep 13 17:51:50 2014
@@ -74,7 +74,7 @@ public class TestNonSequentialPDFParser 
 	}
 
 	@Test
-	public void testNonSequentialPDFParserFileRandomAccess() throws IOException {
+	public void testNonSequentialPDFParserFile() throws IOException {
 		NonSequentialPDFParser nsp = new NonSequentialPDFParser(new File(PATH_OF_PDF));
 		executeParserTest(nsp);
 	}
@@ -85,6 +85,23 @@ public class TestNonSequentialPDFParser 
 		executeParserTest(nsp);
 	}
 
+    @Test
+    public void testNonSequentialPDFParserStringScratchFile() throws Exception {
+        NonSequentialPDFParser nsp = new NonSequentialPDFParser(PATH_OF_PDF, true);
+        executeParserTest(nsp);
+    }
+
+    @Test
+    public void testNonSequentialPDFParserFileScratchFile() throws IOException {
+        NonSequentialPDFParser nsp = new NonSequentialPDFParser(new File(PATH_OF_PDF), true);
+        executeParserTest(nsp);
+    }
+
+    @Test
+    public void testNonSequentialPDFParserInputStreamScratchFile() throws IOException {
+        NonSequentialPDFParser nsp = new NonSequentialPDFParser(new FileInputStream(PATH_OF_PDF), true);
+        executeParserTest(nsp);
+    }
 	
 	private void executeParserTest(NonSequentialPDFParser nsp) throws IOException {
 	  nsp.parse();