You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2012/05/16 11:18:16 UTC
svn commit: r1339077 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: ExtractImages.java ExtractText.java PDFSplit.java PDFToImage.java WriteDecodedDoc.java

Author: lehmi
Date: Wed May 16 09:18:16 2012
New Revision: 1339077

URL: http://svn.apache.org/viewvc?rev=1339077&view=rev
Log:
PDFBOX-1311: added "-nonSeq" command line parameter

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java Wed May 16 09:18:16 2012
@@ -46,6 +46,7 @@ public class ExtractImages
     private static final String PASSWORD = "-password";
     private static final String PREFIX = "-prefix";
     private static final String ADDKEY = "-addkey";
+    private static final String NONSEQ = "-nonSeq";
 
     private ExtractImages()
     {
@@ -76,6 +77,7 @@ public class ExtractImages
             String password = "";
             String prefix = null;
             boolean addKey = false;
+            boolean useNonSeqParser = false;
             for( int i=0; i<args.length; i++ )
             {
                 if( args[i].equals( PASSWORD ) )
@@ -100,6 +102,10 @@ public class ExtractImages
                 {
                     addKey = true;
                 }
+                else if( args[i].equals( NONSEQ ) )
+                {
+                    useNonSeqParser = true;
+                }
                 else
                 {
                     if( pdfFile == null )
@@ -123,20 +129,25 @@ public class ExtractImages
 
                 try
                 {
-                    document = PDDocument.load( pdfFile );
-
-                    if( document.isEncrypted() )
+                    if (useNonSeqParser)
                     {
-
-                        StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
-                        document.openProtection(spm);
-                        AccessPermission ap = document.getCurrentAccessPermission();
-
-
-                        if( ! ap.canExtractContent() )
+                        document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
+                    }
+                    else
+                    {
+                        document = PDDocument.load( pdfFile );
+    
+                        if( document.isEncrypted() )
                         {
-                            throw new IOException(
-                                "Error: You do not have permission to extract images." );
+                            StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
+                            document.openProtection(spm);
+                            AccessPermission ap = document.getCurrentAccessPermission();
+    
+                            if( ! ap.canExtractContent() )
+                            {
+                                throw new IOException(
+                                    "Error: You do not have permission to extract images." );
+                            }
                         }
                     }
 
@@ -224,6 +235,7 @@ public class ExtractImages
             "  -password  <password>        Password to decrypt document\n" +
             "  -prefix  <image-prefix>      Image prefix(default to pdf name)\n" +
             "  -addkey                      add the internal image key to the file name\n" +
+            "  -nonSeq                      Enables the new non-sequential parser\n" +
             "  <PDF file>                   The PDF document to use\n"
             );
         System.exit( 1 );

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java Wed May 16 09:18:16 2012
@@ -21,8 +21,6 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
-import java.net.MalformedURLException;
-import java.net.URL;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
@@ -47,8 +45,11 @@ public class ExtractText
     private static final String SORT = "-sort";
     private static final String IGNORE_BEADS = "-ignoreBeads";
     private static final String DEBUG = "-debug";
-    private static final String HTML = "-html";  // jjb - added simple HTML output
-    private static final String FORCE = "-force"; //enables pdfbox to skip corrupt objects
+    // jjb - added simple HTML output
+    private static final String HTML = "-html";  
+    // enables pdfbox to skip corrupt objects
+    private static final String FORCE = "-force"; 
+    private static final String NONSEQ = "-nonSeq";
 
     /*
      * debug flag
@@ -75,7 +76,13 @@ public class ExtractText
         ExtractText extractor = new ExtractText();
         extractor.startExtraction(args);
     }
-
+    /**
+     * Starts the text extraction.
+     *  
+     * @param args the commandline arguments.
+     * 
+     * @throws Exception if something went wrong.
+     */
     public void startExtraction( String[] args ) throws Exception
     {
         boolean toConsole = false;
@@ -83,6 +90,7 @@ public class ExtractText
         boolean force = false;
         boolean sort = false;
         boolean separateBeads = true;
+        boolean useNonSeqParser = false; 
         String password = "";
         String encoding = null;
         String pdfFile = null;
@@ -154,6 +162,10 @@ public class ExtractText
             {
                 force = true;
             }
+            else if( args[i].equals( NONSEQ ) )
+            {
+                useNonSeqParser = true;
+            }
             else
             {
                 if( pdfFile == null )
@@ -179,39 +191,31 @@ public class ExtractText
             try
             {
                 long startTime = startProcessing("Loading PDF "+pdfFile);
-                try
+                if( outputFile == null && pdfFile.length() >4 )
                 {
-                    //basically try to load it from a url first and if the URL
-                    //is not recognized then try to load it from the file system.
-                    URL url = new URL( pdfFile );
-                    document = PDDocument.load(url, force);
-                    String fileName = url.getFile();
-                    if( outputFile == null && fileName.length() >4 )
-                    {
-                        outputFile = new File( fileName.substring( 0, fileName.length() -4 ) + ext ).getName();
-                    }
+                    outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
                 }
-                catch( MalformedURLException e )
+                if (useNonSeqParser) 
+                {
+                    document = PDDocument.loadNonSeq(new File( pdfFile ), null, password);
+                }
+                else
                 {
                     document = PDDocument.load(pdfFile, force);
-                    if( outputFile == null && pdfFile.length() >4 )
+                    if( document.isEncrypted() )
                     {
-                        outputFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ext;
+                        StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
+                        document.openProtection( sdm );
+                        AccessPermission ap = document.getCurrentAccessPermission();
+
+                        if( ! ap.canExtractContent() )
+                        {
+                            throw new IOException( "You do not have permission to extract text" );
+                        }
                     }
                 }
                 stopProcessing("Time for loading: ", startTime);
 
-                if( document.isEncrypted() )
-                {
-                    StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
-                    document.openProtection( sdm );
-                    AccessPermission ap = document.getCurrentAccessPermission();
-
-                    if( ! ap.canExtractContent() )
-                    {
-                        throw new IOException( "You do not have permission to extract text" );
-                    }
-                }
 
                 if ((encoding == null) && (toHTML))
                 {
@@ -253,6 +257,11 @@ public class ExtractText
                 stripper.setEndPage( endPage );
 
                 startTime = startProcessing("Starting text extraction");
+                if (debug) 
+                {
+                    System.err.println("Writing to "+outputFile);
+                }
+                
                 stripper.writeText( document, output );
                 stopProcessing("Time for extraction: ", startTime);
             }
@@ -270,7 +279,8 @@ public class ExtractText
         }
     }
 
-    private long startProcessing(String message) {
+    private long startProcessing(String message) 
+    {
         if (debug) 
         {
             System.err.println(message);
@@ -278,7 +288,8 @@ public class ExtractText
         return System.currentTimeMillis();
     }
     
-    private void stopProcessing(String message, long startTime) {
+    private void stopProcessing(String message, long startTime) 
+    {
         if (debug)
         {
             long stopTime = System.currentTimeMillis();
@@ -303,6 +314,7 @@ public class ExtractText
             "  -debug                       Enables debug output about the time consumption of every stage\n" +
             "  -startPage <number>          The first page to start extraction(1 based)\n" +
             "  -endPage <number>            The last page to extract(inclusive)\n" +
+            "  -nonSeq                      Enables the new non-sequential parser\n" +
             "  <PDF file>                   The PDF document to use\n" +
             "  [Text File]                  The file to write the text to\n"
             );

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java Wed May 16 09:18:16 2012
@@ -16,9 +16,8 @@
  */
 package org.apache.pdfbox;
 
+import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.FileInputStream;
 import java.io.FileOutputStream;
 
 import java.util.List;
@@ -26,8 +25,6 @@ import java.util.List;
 import org.apache.pdfbox.exceptions.InvalidPasswordException;
 import org.apache.pdfbox.exceptions.COSVisitorException;
 
-import org.apache.pdfbox.pdfparser.PDFParser;
-
 import org.apache.pdfbox.pdmodel.PDDocument;
 
 import org.apache.pdfbox.pdfwriter.COSWriter;
@@ -47,6 +44,7 @@ public class PDFSplit
     private static final String SPLIT = "-split";
     private static final String START_PAGE = "-startPage";
     private static final String END_PAGE = "-endPage";
+    private static final String NONSEQ = "-nonSeq";
 
     private PDFSplit()
     {
@@ -70,7 +68,7 @@ public class PDFSplit
         String split = null;
         String startPage = null;
         String endPage = null;
-        
+        boolean useNonSeqParser = false;
         Splitter splitter = new Splitter();
         String pdfFile = null;
         for( int i=0; i<args.length; i++ )
@@ -111,6 +109,10 @@ public class PDFSplit
                 }
                 endPage = args[i];
             }
+            else if( args[i].equals( NONSEQ ) )
+            {
+                useNonSeqParser = true;
+            }
             else
             {
                 if( pdfFile == null )
@@ -126,33 +128,36 @@ public class PDFSplit
         }
         else
         {
-
-            InputStream input = null;
             PDDocument document = null;
             List<PDDocument> documents = null;
             try
             {
-                input = new FileInputStream( pdfFile );
-                document = parseDocument( input );
-
-                if( document.isEncrypted() )
+                if (useNonSeqParser) 
                 {
-                    try
-                    {
-                        document.decrypt( password );
-                    }
-                    catch( InvalidPasswordException e )
+                    document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
+                }
+                else
+                {
+                    document = PDDocument.load(pdfFile);
+                    if( document.isEncrypted() )
                     {
-                        if( args.length == 4 )//they supplied the wrong password
+                        try
                         {
-                            System.err.println( "Error: The supplied password is incorrect." );
-                            System.exit( 2 );
+                            document.decrypt( password );
                         }
-                        else
+                        catch( InvalidPasswordException e )
                         {
-                            //they didn't supply a password and the default of "" was wrong.
-                            System.err.println( "Error: The document is encrypted." );
-                            usage();
+                            if( args.length == 4 )//they supplied the wrong password
+                            {
+                                System.err.println( "Error: The supplied password is incorrect." );
+                                System.exit( 2 );
+                            }
+                            else
+                            {
+                                //they didn't supply a password and the default of "" was wrong.
+                                System.err.println( "Error: The document is encrypted." );
+                                usage();
+                            }
                         }
                     }
                 }
@@ -199,10 +204,6 @@ public class PDFSplit
             }
             finally
             {
-                if( input != null )
-                {
-                    input.close();
-                }
                 if( document != null )
                 {
                     document.close();
@@ -240,22 +241,6 @@ public class PDFSplit
     }
 
     /**
-     * This will parse a document.
-     *
-     * @param input The input stream for the document.
-     *
-     * @return The document.
-     *
-     * @throws IOException If there is an error parsing the document.
-     */
-    private static PDDocument parseDocument( InputStream input )throws IOException
-    {
-        PDFParser parser = new PDFParser( input );
-        parser.parse();
-        return parser.getPDDocument();
-    }
-
-    /**
      * This will print the usage requirements and exit.
      */
     private static void usage()
@@ -265,6 +250,7 @@ public class PDFSplit
             "  -split     <integer>   split after this many pages (default 1, if startPage and endPage are unset)\n"+
             "  -startPage <integer>   start page\n" +
             "  -endPage   <integer>   end page\n" +
+            "  -nonSeq                Enables the new non-sequential parser\n" +
             "  <PDF file>             The PDF document to use\n"
             );
         System.exit( 1 );

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java Wed May 16 09:18:16 2012
@@ -19,6 +19,7 @@ package org.apache.pdfbox;
 import java.awt.HeadlessException;
 import java.awt.Toolkit;
 import java.awt.image.BufferedImage;
+import java.io.File;
 import java.util.List;
 
 import javax.imageio.ImageIO;
@@ -46,6 +47,7 @@ public class PDFToImage
     private static final String COLOR = "-color";
     private static final String RESOLUTION = "-resolution";
     private static final String CROPBOX = "-cropbox";
+    private static final String NONSEQ = "-nonSeq";
 
     /**
      * private constructor.
@@ -64,6 +66,7 @@ public class PDFToImage
      */
     public static void main( String[] args ) throws Exception
     {
+        boolean useNonSeqParser = false; 
         String password = "";
         String pdfFile = null;
         String outputPrefix = null;
@@ -144,6 +147,10 @@ public class PDFToImage
                 i++;
                 cropBoxUpperRightY = Float.valueOf(args[i]).floatValue();
             }
+            else if( args[i].equals( NONSEQ ) )
+            {
+                useNonSeqParser = true;
+            }
             else
             {
                 if( pdfFile == null )
@@ -166,28 +173,32 @@ public class PDFToImage
             PDDocument document = null;
             try
             {
-                document = PDDocument.load( pdfFile );
-
-
-                //document.print();
-                if( document.isEncrypted() )
+                if (useNonSeqParser)
                 {
-                    try
-                    {
-                        document.decrypt( password );
-                    }
-                    catch( InvalidPasswordException e )
+                    document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
+                }
+                else
+                {
+                    document = PDDocument.load( pdfFile );
+                    if( document.isEncrypted() )
                     {
-                        if( args.length == 4 )//they supplied the wrong password
+                        try
                         {
-                            System.err.println( "Error: The supplied password is incorrect." );
-                            System.exit( 2 );
+                            document.decrypt( password );
                         }
-                        else
+                        catch( InvalidPasswordException e )
                         {
-                            //they didn't supply a password and the default of "" was wrong.
-                            System.err.println( "Error: The document is encrypted." );
-                            usage();
+                            if( args.length == 4 )//they supplied the wrong password
+                            {
+                                System.err.println( "Error: The supplied password is incorrect." );
+                                System.exit( 2 );
+                            }
+                            else
+                            {
+                                //they didn't supply a password and the default of "" was wrong.
+                                System.err.println( "Error: The document is encrypted." );
+                                usage();
+                            }
                         }
                     }
                 }
@@ -267,6 +278,7 @@ public class PDFToImage
             "  -color <string>                The color depth (valid: bilevel, indexed, gray, rgb, rgba)\n" +
             "  -resolution <number>           The bitmap resolution in dpi\n" +
             "  -cropbox <number> <number> <number> <number> The page area to export\n" +
+            "  -nonSeq                        Enables the new non-sequential parser\n" +
             "  <PDF file>                     The PDF document to use\n"
             );
         System.exit( 1 );

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java Wed May 16 09:18:16 2012
@@ -16,6 +16,7 @@
  */
 package org.apache.pdfbox;
 
+import java.io.File;
 import java.io.IOException;
 
 import java.util.Iterator;
@@ -39,6 +40,7 @@ public class WriteDecodedDoc
 {
 
     private static final String PASSWORD = "-password";
+    private static final String NONSEQ = "-nonSeq";
 
     /**
      * Constructor.
@@ -61,7 +63,7 @@ public class WriteDecodedDoc
      */
     public void doIt(String in, String out) throws IOException, COSVisitorException
     {
-        doIt(in, out, "");
+        doIt(in, out, "", false);
     }
     
     /**
@@ -70,42 +72,51 @@ public class WriteDecodedDoc
      * @param in The filename used for input.
      * @param out The filename used for output.
      * @param password The password to open the document.
+     * @param useNonSeqParser use the non sequential parser
      *
      * @throws IOException If there is an error parsing the document.
      * @throws COSVisitorException If there is an error while copying the document.
      */
-    public void doIt(String in, String out, String password) throws IOException, COSVisitorException
+    public void doIt(String in, String out, String password, boolean useNonSeqParser) 
+    throws IOException, COSVisitorException
     {
         PDDocument doc = null;
         try
         {
-            doc = PDDocument.load( in );
-            if( doc.isEncrypted() )
+            if (useNonSeqParser) 
             {
-                try
-                {
-                    doc.decrypt( password );
-                    doc.setAllSecurityToBeRemoved(true);
-                }
-                catch( InvalidPasswordException e )
+                doc = PDDocument.loadNonSeq(new File(in), null, password);
+                doc.setAllSecurityToBeRemoved(true);
+            }
+            else
+            {
+                doc = PDDocument.load( in );
+                if( doc.isEncrypted() )
                 {
-                    if (password.trim().length() == 0)
+                    try
                     {
-                        System.err.println( "Password needed!!" );
+                        doc.decrypt( password );
+                        doc.setAllSecurityToBeRemoved(true);
                     }
-                    else
+                    catch( InvalidPasswordException e )
                     {
-                        System.err.println( "Wrong password!!" );
+                        if (password.trim().length() == 0)
+                        {
+                            System.err.println( "Password needed!!" );
+                        }
+                        else
+                        {
+                            System.err.println( "Wrong password!!" );
+                        }
+                        return;
+                    }
+                    catch( org.apache.pdfbox.exceptions.CryptographyException e )
+                    {
+                        e.printStackTrace();
+                        return;
                     }
-                    return;
-                }
-                catch( org.apache.pdfbox.exceptions.CryptographyException e )
-                {
-                    e.printStackTrace();
-                    return;
                 }
             }
-
             for (Iterator<COSObject> i = doc.getDocument().getObjects().iterator(); i.hasNext();)
             {
                 COSBase base = ((COSObject) i.next()).getObject();
@@ -139,6 +150,7 @@ public class WriteDecodedDoc
     {
         WriteDecodedDoc app = new WriteDecodedDoc();
         String password = "";
+        boolean useNonSeqParser = false;
         String pdfFile = null;
         String outputFile = null;
         for( int i=0; i<args.length; i++ )
@@ -153,6 +165,11 @@ public class WriteDecodedDoc
                 password = args[i];
             }
             else
+                if( args[i].equals( NONSEQ ) )
+                {
+                    useNonSeqParser = true;
+                }
+                else
             {
                 if( pdfFile == null )
                 {
@@ -176,7 +193,7 @@ public class WriteDecodedDoc
                 {
                     outputFile = calculateOutputFilename(pdfFile);
                 }
-                app.doIt(pdfFile, outputFile, password);
+                app.doIt(pdfFile, outputFile, password, useNonSeqParser);
             }
             catch (Exception e)
             {
@@ -208,6 +225,7 @@ public class WriteDecodedDoc
         System.err.println(
                 "usage: java -jar pdfbox-app-x.y.z.jar WriteDecodedDoc [OPTIONS] <input-file> [output-file]\n" +
                 "  -password <password>      Password to decrypt the document\n" +
+                "  -nonSeq                   Enables the new non-sequential parser\n" +
                 "  <input-file>              The PDF document to be decompressed\n" +
                 "  [output-file]             The filename for the decompressed pdf\n"
                 );