You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2012/05/16 11:18:16 UTC
svn commit: r1339077 - in
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: ExtractImages.java
ExtractText.java PDFSplit.java PDFToImage.java WriteDecodedDoc.java
Author: lehmi
Date: Wed May 16 09:18:16 2012
New Revision: 1339077
URL: http://svn.apache.org/viewvc?rev=1339077&view=rev
Log:
PDFBOX-1311: added "-nonSeq" command line parameter
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractImages.java Wed May 16 09:18:16 2012
@@ -46,6 +46,7 @@ public class ExtractImages
private static final String PASSWORD = "-password";
private static final String PREFIX = "-prefix";
private static final String ADDKEY = "-addkey";
+ private static final String NONSEQ = "-nonSeq";
private ExtractImages()
{
@@ -76,6 +77,7 @@ public class ExtractImages
String password = "";
String prefix = null;
boolean addKey = false;
+ boolean useNonSeqParser = false;
for( int i=0; i<args.length; i++ )
{
if( args[i].equals( PASSWORD ) )
@@ -100,6 +102,10 @@ public class ExtractImages
{
addKey = true;
}
+ else if( args[i].equals( NONSEQ ) )
+ {
+ useNonSeqParser = true;
+ }
else
{
if( pdfFile == null )
@@ -123,20 +129,25 @@ public class ExtractImages
try
{
- document = PDDocument.load( pdfFile );
-
- if( document.isEncrypted() )
+ if (useNonSeqParser)
{
-
- StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
- document.openProtection(spm);
- AccessPermission ap = document.getCurrentAccessPermission();
-
-
- if( ! ap.canExtractContent() )
+ document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
+ }
+ else
+ {
+ document = PDDocument.load( pdfFile );
+
+ if( document.isEncrypted() )
{
- throw new IOException(
- "Error: You do not have permission to extract images." );
+ StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
+ document.openProtection(spm);
+ AccessPermission ap = document.getCurrentAccessPermission();
+
+ if( ! ap.canExtractContent() )
+ {
+ throw new IOException(
+ "Error: You do not have permission to extract images." );
+ }
}
}
@@ -224,6 +235,7 @@ public class ExtractImages
" -password <password> Password to decrypt document\n" +
" -prefix <image-prefix> Image prefix(default to pdf name)\n" +
" -addkey add the internal image key to the file name\n" +
+ " -nonSeq Enables the new non-sequential parser\n" +
" <PDF file> The PDF document to use\n"
);
System.exit( 1 );
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java Wed May 16 09:18:16 2012
@@ -21,8 +21,6 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
-import java.net.MalformedURLException;
-import java.net.URL;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
@@ -47,8 +45,11 @@ public class ExtractText
private static final String SORT = "-sort";
private static final String IGNORE_BEADS = "-ignoreBeads";
private static final String DEBUG = "-debug";
- private static final String HTML = "-html"; // jjb - added simple HTML output
- private static final String FORCE = "-force"; //enables pdfbox to skip corrupt objects
+ // jjb - added simple HTML output
+ private static final String HTML = "-html";
+ // enables pdfbox to skip corrupt objects
+ private static final String FORCE = "-force";
+ private static final String NONSEQ = "-nonSeq";
/*
* debug flag
@@ -75,7 +76,13 @@ public class ExtractText
ExtractText extractor = new ExtractText();
extractor.startExtraction(args);
}
-
+ /**
+ * Starts the text extraction.
+ *
+ * @param args the commandline arguments.
+ *
+ * @throws Exception if something went wrong.
+ */
public void startExtraction( String[] args ) throws Exception
{
boolean toConsole = false;
@@ -83,6 +90,7 @@ public class ExtractText
boolean force = false;
boolean sort = false;
boolean separateBeads = true;
+ boolean useNonSeqParser = false;
String password = "";
String encoding = null;
String pdfFile = null;
@@ -154,6 +162,10 @@ public class ExtractText
{
force = true;
}
+ else if( args[i].equals( NONSEQ ) )
+ {
+ useNonSeqParser = true;
+ }
else
{
if( pdfFile == null )
@@ -179,39 +191,31 @@ public class ExtractText
try
{
long startTime = startProcessing("Loading PDF "+pdfFile);
- try
+ if( outputFile == null && pdfFile.length() >4 )
{
- //basically try to load it from a url first and if the URL
- //is not recognized then try to load it from the file system.
- URL url = new URL( pdfFile );
- document = PDDocument.load(url, force);
- String fileName = url.getFile();
- if( outputFile == null && fileName.length() >4 )
- {
- outputFile = new File( fileName.substring( 0, fileName.length() -4 ) + ext ).getName();
- }
+ outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
}
- catch( MalformedURLException e )
+ if (useNonSeqParser)
+ {
+ document = PDDocument.loadNonSeq(new File( pdfFile ), null, password);
+ }
+ else
{
document = PDDocument.load(pdfFile, force);
- if( outputFile == null && pdfFile.length() >4 )
+ if( document.isEncrypted() )
{
- outputFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ext;
+ StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
+ document.openProtection( sdm );
+ AccessPermission ap = document.getCurrentAccessPermission();
+
+ if( ! ap.canExtractContent() )
+ {
+ throw new IOException( "You do not have permission to extract text" );
+ }
}
}
stopProcessing("Time for loading: ", startTime);
- if( document.isEncrypted() )
- {
- StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
- document.openProtection( sdm );
- AccessPermission ap = document.getCurrentAccessPermission();
-
- if( ! ap.canExtractContent() )
- {
- throw new IOException( "You do not have permission to extract text" );
- }
- }
if ((encoding == null) && (toHTML))
{
@@ -253,6 +257,11 @@ public class ExtractText
stripper.setEndPage( endPage );
startTime = startProcessing("Starting text extraction");
+ if (debug)
+ {
+ System.err.println("Writing to "+outputFile);
+ }
+
stripper.writeText( document, output );
stopProcessing("Time for extraction: ", startTime);
}
@@ -270,7 +279,8 @@ public class ExtractText
}
}
- private long startProcessing(String message) {
+ private long startProcessing(String message)
+ {
if (debug)
{
System.err.println(message);
@@ -278,7 +288,8 @@ public class ExtractText
return System.currentTimeMillis();
}
- private void stopProcessing(String message, long startTime) {
+ private void stopProcessing(String message, long startTime)
+ {
if (debug)
{
long stopTime = System.currentTimeMillis();
@@ -303,6 +314,7 @@ public class ExtractText
" -debug Enables debug output about the time consumption of every stage\n" +
" -startPage <number> The first page to start extraction(1 based)\n" +
" -endPage <number> The last page to extract(inclusive)\n" +
+ " -nonSeq Enables the new non-sequential parser\n" +
" <PDF file> The PDF document to use\n" +
" [Text File] The file to write the text to\n"
);
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFSplit.java Wed May 16 09:18:16 2012
@@ -16,9 +16,8 @@
*/
package org.apache.pdfbox;
+import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.List;
@@ -26,8 +25,6 @@ import java.util.List;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.exceptions.COSVisitorException;
-import org.apache.pdfbox.pdfparser.PDFParser;
-
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdfwriter.COSWriter;
@@ -47,6 +44,7 @@ public class PDFSplit
private static final String SPLIT = "-split";
private static final String START_PAGE = "-startPage";
private static final String END_PAGE = "-endPage";
+ private static final String NONSEQ = "-nonSeq";
private PDFSplit()
{
@@ -70,7 +68,7 @@ public class PDFSplit
String split = null;
String startPage = null;
String endPage = null;
-
+ boolean useNonSeqParser = false;
Splitter splitter = new Splitter();
String pdfFile = null;
for( int i=0; i<args.length; i++ )
@@ -111,6 +109,10 @@ public class PDFSplit
}
endPage = args[i];
}
+ else if( args[i].equals( NONSEQ ) )
+ {
+ useNonSeqParser = true;
+ }
else
{
if( pdfFile == null )
@@ -126,33 +128,36 @@ public class PDFSplit
}
else
{
-
- InputStream input = null;
PDDocument document = null;
List<PDDocument> documents = null;
try
{
- input = new FileInputStream( pdfFile );
- document = parseDocument( input );
-
- if( document.isEncrypted() )
+ if (useNonSeqParser)
{
- try
- {
- document.decrypt( password );
- }
- catch( InvalidPasswordException e )
+ document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
+ }
+ else
+ {
+ document = PDDocument.load(pdfFile);
+ if( document.isEncrypted() )
{
- if( args.length == 4 )//they supplied the wrong password
+ try
{
- System.err.println( "Error: The supplied password is incorrect." );
- System.exit( 2 );
+ document.decrypt( password );
}
- else
+ catch( InvalidPasswordException e )
{
- //they didn't supply a password and the default of "" was wrong.
- System.err.println( "Error: The document is encrypted." );
- usage();
+ if( args.length == 4 )//they supplied the wrong password
+ {
+ System.err.println( "Error: The supplied password is incorrect." );
+ System.exit( 2 );
+ }
+ else
+ {
+ //they didn't supply a password and the default of "" was wrong.
+ System.err.println( "Error: The document is encrypted." );
+ usage();
+ }
}
}
}
@@ -199,10 +204,6 @@ public class PDFSplit
}
finally
{
- if( input != null )
- {
- input.close();
- }
if( document != null )
{
document.close();
@@ -240,22 +241,6 @@ public class PDFSplit
}
/**
- * This will parse a document.
- *
- * @param input The input stream for the document.
- *
- * @return The document.
- *
- * @throws IOException If there is an error parsing the document.
- */
- private static PDDocument parseDocument( InputStream input )throws IOException
- {
- PDFParser parser = new PDFParser( input );
- parser.parse();
- return parser.getPDDocument();
- }
-
- /**
* This will print the usage requirements and exit.
*/
private static void usage()
@@ -265,6 +250,7 @@ public class PDFSplit
" -split <integer> split after this many pages (default 1, if startPage and endPage are unset)\n"+
" -startPage <integer> start page\n" +
" -endPage <integer> end page\n" +
+ " -nonSeq Enables the new non-sequential parser\n" +
" <PDF file> The PDF document to use\n"
);
System.exit( 1 );
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/PDFToImage.java Wed May 16 09:18:16 2012
@@ -19,6 +19,7 @@ package org.apache.pdfbox;
import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
+import java.io.File;
import java.util.List;
import javax.imageio.ImageIO;
@@ -46,6 +47,7 @@ public class PDFToImage
private static final String COLOR = "-color";
private static final String RESOLUTION = "-resolution";
private static final String CROPBOX = "-cropbox";
+ private static final String NONSEQ = "-nonSeq";
/**
* private constructor.
@@ -64,6 +66,7 @@ public class PDFToImage
*/
public static void main( String[] args ) throws Exception
{
+ boolean useNonSeqParser = false;
String password = "";
String pdfFile = null;
String outputPrefix = null;
@@ -144,6 +147,10 @@ public class PDFToImage
i++;
cropBoxUpperRightY = Float.valueOf(args[i]).floatValue();
}
+ else if( args[i].equals( NONSEQ ) )
+ {
+ useNonSeqParser = true;
+ }
else
{
if( pdfFile == null )
@@ -166,28 +173,32 @@ public class PDFToImage
PDDocument document = null;
try
{
- document = PDDocument.load( pdfFile );
-
-
- //document.print();
- if( document.isEncrypted() )
+ if (useNonSeqParser)
{
- try
- {
- document.decrypt( password );
- }
- catch( InvalidPasswordException e )
+ document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
+ }
+ else
+ {
+ document = PDDocument.load( pdfFile );
+ if( document.isEncrypted() )
{
- if( args.length == 4 )//they supplied the wrong password
+ try
{
- System.err.println( "Error: The supplied password is incorrect." );
- System.exit( 2 );
+ document.decrypt( password );
}
- else
+ catch( InvalidPasswordException e )
{
- //they didn't supply a password and the default of "" was wrong.
- System.err.println( "Error: The document is encrypted." );
- usage();
+ if( args.length == 4 )//they supplied the wrong password
+ {
+ System.err.println( "Error: The supplied password is incorrect." );
+ System.exit( 2 );
+ }
+ else
+ {
+ //they didn't supply a password and the default of "" was wrong.
+ System.err.println( "Error: The document is encrypted." );
+ usage();
+ }
}
}
}
@@ -267,6 +278,7 @@ public class PDFToImage
" -color <string> The color depth (valid: bilevel, indexed, gray, rgb, rgba)\n" +
" -resolution <number> The bitmap resolution in dpi\n" +
" -cropbox <number> <number> <number> <number> The page area to export\n" +
+ " -nonSeq Enables the new non-sequential parser\n" +
" <PDF file> The PDF document to use\n"
);
System.exit( 1 );
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java?rev=1339077&r1=1339076&r2=1339077&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/WriteDecodedDoc.java Wed May 16 09:18:16 2012
@@ -16,6 +16,7 @@
*/
package org.apache.pdfbox;
+import java.io.File;
import java.io.IOException;
import java.util.Iterator;
@@ -39,6 +40,7 @@ public class WriteDecodedDoc
{
private static final String PASSWORD = "-password";
+ private static final String NONSEQ = "-nonSeq";
/**
* Constructor.
@@ -61,7 +63,7 @@ public class WriteDecodedDoc
*/
public void doIt(String in, String out) throws IOException, COSVisitorException
{
- doIt(in, out, "");
+ doIt(in, out, "", false);
}
/**
@@ -70,42 +72,51 @@ public class WriteDecodedDoc
* @param in The filename used for input.
* @param out The filename used for output.
* @param password The password to open the document.
+ * @param useNonSeqParser use the non sequential parser
*
* @throws IOException If there is an error parsing the document.
* @throws COSVisitorException If there is an error while copying the document.
*/
- public void doIt(String in, String out, String password) throws IOException, COSVisitorException
+ public void doIt(String in, String out, String password, boolean useNonSeqParser)
+ throws IOException, COSVisitorException
{
PDDocument doc = null;
try
{
- doc = PDDocument.load( in );
- if( doc.isEncrypted() )
+ if (useNonSeqParser)
{
- try
- {
- doc.decrypt( password );
- doc.setAllSecurityToBeRemoved(true);
- }
- catch( InvalidPasswordException e )
+ doc = PDDocument.loadNonSeq(new File(in), null, password);
+ doc.setAllSecurityToBeRemoved(true);
+ }
+ else
+ {
+ doc = PDDocument.load( in );
+ if( doc.isEncrypted() )
{
- if (password.trim().length() == 0)
+ try
{
- System.err.println( "Password needed!!" );
+ doc.decrypt( password );
+ doc.setAllSecurityToBeRemoved(true);
}
- else
+ catch( InvalidPasswordException e )
{
- System.err.println( "Wrong password!!" );
+ if (password.trim().length() == 0)
+ {
+ System.err.println( "Password needed!!" );
+ }
+ else
+ {
+ System.err.println( "Wrong password!!" );
+ }
+ return;
+ }
+ catch( org.apache.pdfbox.exceptions.CryptographyException e )
+ {
+ e.printStackTrace();
+ return;
}
- return;
- }
- catch( org.apache.pdfbox.exceptions.CryptographyException e )
- {
- e.printStackTrace();
- return;
}
}
-
for (Iterator<COSObject> i = doc.getDocument().getObjects().iterator(); i.hasNext();)
{
COSBase base = ((COSObject) i.next()).getObject();
@@ -139,6 +150,7 @@ public class WriteDecodedDoc
{
WriteDecodedDoc app = new WriteDecodedDoc();
String password = "";
+ boolean useNonSeqParser = false;
String pdfFile = null;
String outputFile = null;
for( int i=0; i<args.length; i++ )
@@ -153,6 +165,11 @@ public class WriteDecodedDoc
password = args[i];
}
else
+ if( args[i].equals( NONSEQ ) )
+ {
+ useNonSeqParser = true;
+ }
+ else
{
if( pdfFile == null )
{
@@ -176,7 +193,7 @@ public class WriteDecodedDoc
{
outputFile = calculateOutputFilename(pdfFile);
}
- app.doIt(pdfFile, outputFile, password);
+ app.doIt(pdfFile, outputFile, password, useNonSeqParser);
}
catch (Exception e)
{
@@ -208,6 +225,7 @@ public class WriteDecodedDoc
System.err.println(
"usage: java -jar pdfbox-app-x.y.z.jar WriteDecodedDoc [OPTIONS] <input-file> [output-file]\n" +
" -password <password> Password to decrypt the document\n" +
+ " -nonSeq Enables the new non-sequential parser\n" +
" <input-file> The PDF document to be decompressed\n" +
" [output-file] The filename for the decompressed pdf\n"
);