You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ms...@apache.org on 2020/12/06 17:49:29 UTC
svn commit: r1884160 - in /pdfbox/trunk/tools/src:
main/java/org/apache/pdfbox/tools/ExtractText.java
test/java/org/apache/pdfbox/tools/TestExtractText.java
Author: msahyoun
Date: Sun Dec 6 17:49:29 2020
New Revision: 1884160
URL: http://svn.apache.org/viewvc?rev=1884160&view=rev
Log:
PDFBOX-2602: use picocli for command line parsing
Modified:
pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestExtractText.java
Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java?rev=1884160&r1=1884159&r2=1884160&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Sun Dec 6 17:49:29 2020
@@ -21,16 +21,20 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
+import java.io.PrintStream;
import java.io.Writer;
+import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
+import java.util.concurrent.Callable;
+
+import org.apache.commons.io.FilenameUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
@@ -44,6 +48,11 @@ import org.apache.pdfbox.text.PDFTextStr
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
+import picocli.CommandLine;
+import picocli.CommandLine.Command;
+import picocli.CommandLine.Option;
+import picocli.CommandLine.Parameters;
+
/**
* This is the main program that simply parses the pdf document and transforms it
* into text.
@@ -51,274 +60,212 @@ import org.apache.pdfbox.util.Matrix;
* @author Ben Litchfield
* @author Tilman Hausherr
*/
-public final class ExtractText
+@Command(name = "ExtractText", description = "Extract all text from the given PDF document.")
+public final class ExtractText implements Callable<Integer>
{
private static final Log LOG = LogFactory.getLog(ExtractText.class);
- @SuppressWarnings({"squid:S2068"})
- private static final String PASSWORD = "-password";
- private static final String ENCODING = "-encoding";
- private static final String CONSOLE = "-console";
- private static final String START_PAGE = "-startPage";
- private static final String END_PAGE = "-endPage";
- private static final String SORT = "-sort";
- private static final String IGNORE_BEADS = "-ignoreBeads";
- private static final String DEBUG = "-debug";
- private static final String HTML = "-html";
- private static final String ALWAYSNEXT = "-alwaysNext";
- private static final String ROTATION_MAGIC = "-rotationMagic";
private static final String STD_ENCODING = "UTF-8";
- /*
- * debug flag
- */
- private boolean debugOutput = false;
+ // Expected for CLI app to write to System.out/System.err
+ @SuppressWarnings("squid:S106")
+ private static final PrintStream SYSOUT = System.out;
+ @SuppressWarnings("squid:S106")
+ private static final PrintStream SYSERR = System.err;
- /**
- * private constructor.
- */
- private ExtractText()
- {
- //static class
- }
+ @Option(names = "-alwaysNext", description = "Process next page (if applicable) despite IOException " +
+ "(ignored when -html)")
+ private boolean alwaysNext = false;
+
+ @Option(names = "-console", description = "Send text to console instead of file")
+ private boolean toConsole = false;
+
+ @Option(names = "-debug", description = "Enables debug output about the time consumption of every stage")
+ private boolean debug = false;
+
+ @Option(names = "-encoding", description = "UTF-8 or ISO-8859-1, UTF-16BE, UTF-16LE, etc. (default: ${DEFAULT-VALUE})")
+ private String encoding = STD_ENCODING;
+
+ @Option(names = "-endPage", description = "The last page to extract (1 based, inclusive)")
+ private int endPage = Integer.MAX_VALUE;
+
+ @Option(names = "-html", description = "Output in HTML format instead of raw text")
+ private boolean toHTML = false;
+
+ @Option(names = "-ignoreBeads", description = "Disables the separation by beads")
+ private boolean ignoreBeads = false;
+
+ @Option(names = {"-h", "--help"}, usageHelp = true, description = "display this help message")
+ boolean usageHelpRequested;
+
+ @Option(names = "-password", description = "the password for the PDF or certificate in keystore.")
+ private String password = "";
+
+ @Option(names = "-rotationMagic", description = "Analyze each page for rotated/skewed text, rotate to 0° " +
+ "and extract separately (slower, and ignored when -html)" )
+ private boolean rotationMagic = false;
+
+ @Option(names = "-sort", description = "Sort the text before writing of every stage")
+ private boolean sort = false;
+
+ @Option(names = "-startPage", description = "The first page to start extraction (1 based)")
+ private int startPage = 1;
+
+ @Parameters(paramLabel = "inputfile", index = "0", arity = "1", description = "the PDF file to decrypt.")
+ private File infile;
+
+ @Parameters(paramLabel = "outputfile", index = "1", arity = "0..1", description = "the decrypted PDF file.")
+ private File outfile;
/**
* Infamous main method.
*
* @param args Command line arguments, should be one and a reference to a file.
- *
- * @throws IOException if there is an error reading the document or extracting the text.
*/
- public static void main( String[] args ) throws IOException
+ public static void main( String[] args )
{
// suppress the Dock icon on OS X
System.setProperty("apple.awt.UIElement", "true");
- ExtractText extractor = new ExtractText();
- extractor.startExtraction(args);
+ int exitCode = new CommandLine(new ExtractText()).execute(args);
+ System.exit(exitCode);
}
+
+ /*
+ * For testing, as Maven Surefire doesn't support testing methods which
+ * call System.exit
+ */
+ static int test (String[] args)
+ {
+ return new CommandLine(new ExtractText()).execute(args);
+ }
+
/**
* Starts the text extraction.
*
- * @param args the commandline arguments.
- * @throws IOException if there is an error reading the document or extracting the text.
*/
- public void startExtraction( String[] args ) throws IOException
+ public Integer call()
{
- boolean toConsole = false;
- boolean toHTML = false;
- boolean sort = false;
- boolean separateBeads = true;
- boolean alwaysNext = false;
- boolean rotationMagic = false;
- @SuppressWarnings({"squid:S2068"})
- String password = "";
- String encoding = STD_ENCODING;
- String pdfFile = null;
- String outputFile = null;
- // Defaults to text files
- String ext = ".txt";
- int startPage = 1;
- int endPage = Integer.MAX_VALUE;
- for (int i = 0; i < args.length; i++)
- {
- switch (args[i])
- {
- case PASSWORD:
- i++;
- if (i >= args.length)
- {
- usage();
- }
- password = args[i];
- break;
- case ENCODING:
- i++;
- if (i >= args.length)
- {
- usage();
- }
- encoding = args[i];
- break;
- case START_PAGE:
- i++;
- if (i >= args.length)
- {
- usage();
- }
- startPage = Integer.parseInt(args[i]);
- break;
- case HTML:
- toHTML = true;
- ext = ".html";
- break;
- case SORT:
- sort = true;
- break;
- case IGNORE_BEADS:
- separateBeads = false;
- break;
- case DEBUG:
- debugOutput = true;
- break;
- case ALWAYSNEXT:
- alwaysNext = true;
- break;
- case ROTATION_MAGIC:
- rotationMagic = true;
- break;
- case END_PAGE:
- i++;
- if (i >= args.length)
- {
- usage();
- }
- endPage = Integer.parseInt(args[i]);
- break;
- case CONSOLE:
- toConsole = true;
- break;
- default:
- if (pdfFile == null)
- {
- pdfFile = args[i];
- }
- else
- {
- outputFile = args[i];
- }
- break;
- }
- }
+ // set file extension
+ String ext = toHTML ? ".html" : ".txt";
- if( pdfFile == null )
+ if (outfile == null)
{
- usage();
+ String outPath = FilenameUtils.removeExtension(infile.getAbsolutePath()) + ext;
+ outfile = new File(outPath);
}
- else
+
+ try (PDDocument document = Loader.loadPDF(infile, password);
+ Writer output = toConsole ? new OutputStreamWriter( SYSOUT, encoding ) : new OutputStreamWriter( new FileOutputStream( outfile ), encoding ))
{
+ long startTime = startProcessing("Loading PDF " + infile);
- Writer output = null;
- PDDocument document = null;
- try
+ AccessPermission ap = document.getCurrentAccessPermission();
+ if( ! ap.canExtractContent() )
{
- long startTime = startProcessing("Loading PDF "+pdfFile);
- if( outputFile == null && pdfFile.length() >4 )
- {
- outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
- }
- document = Loader.loadPDF(new File(pdfFile), password);
-
- AccessPermission ap = document.getCurrentAccessPermission();
- if( ! ap.canExtractContent() )
- {
- throw new IOException( "You do not have permission to extract text" );
- }
-
- stopProcessing("Time for loading: ", startTime);
+ SYSERR.println( "You do not have permission to extract text");
+ return 1;
+ }
+
+ stopProcessing("Time for loading: ", startTime);
- if( toConsole )
- {
- output = new OutputStreamWriter( System.out, encoding );
- }
- else
- {
- if (toHTML && !STD_ENCODING.equals(encoding))
- {
- encoding = STD_ENCODING;
- System.out.println("The encoding parameter is ignored when writing html output.");
- }
- output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
- }
- startTime = startProcessing("Starting text extraction");
- if (debugOutput)
- {
- System.err.println("Writing to " + outputFile);
- }
+ if (toHTML && !STD_ENCODING.equals(encoding))
+ {
+ encoding = STD_ENCODING;
+ SYSOUT.println("The encoding parameter is ignored when writing html output.");
+ }
- PDFTextStripper stripper;
- if(toHTML)
- {
- // HTML stripper can't work page by page because of startDocument() callback
- stripper = new PDFText2HTML();
- stripper.setSortByPosition(sort);
- stripper.setShouldSeparateByBeads(separateBeads);
- stripper.setStartPage(startPage);
- stripper.setEndPage(endPage);
+ startTime = startProcessing("Starting text extraction");
- // Extract text for main document:
- stripper.writeText(document, output);
+ if (debug)
+ {
+ SYSERR.println("Writing to " + outfile.getAbsolutePath());
+ }
+
+ PDFTextStripper stripper;
+ if(toHTML)
+ {
+ // HTML stripper can't work page by page because of startDocument() callback
+ stripper = new PDFText2HTML();
+ stripper.setSortByPosition(sort);
+ stripper.setShouldSeparateByBeads(!ignoreBeads);
+ stripper.setStartPage(startPage);
+ stripper.setEndPage(endPage);
+
+ // Extract text for main document:
+ stripper.writeText(document, output);
+ }
+ else
+ {
+ if (rotationMagic)
+ {
+ stripper = new FilteredTextStripper();
}
else
{
- if (rotationMagic)
- {
- stripper = new FilteredTextStripper();
- }
- else
- {
- stripper = new PDFTextStripper();
- }
- stripper.setSortByPosition(sort);
- stripper.setShouldSeparateByBeads(separateBeads);
-
- // Extract text for main document:
- extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
- stripper, document, output, rotationMagic, alwaysNext);
+ stripper = new PDFTextStripper();
}
+ stripper.setSortByPosition(sort);
+ stripper.setShouldSeparateByBeads(!ignoreBeads);
- // ... also for any embedded PDFs:
- PDDocumentCatalog catalog = document.getDocumentCatalog();
- PDDocumentNameDictionary names = catalog.getNames();
- if (names != null)
+ // Extract text for main document:
+ extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
+ stripper, document, output, rotationMagic, alwaysNext);
+ }
+
+ // ... also for any embedded PDFs:
+ PDDocumentCatalog catalog = document.getDocumentCatalog();
+ PDDocumentNameDictionary names = catalog.getNames();
+ if (names != null)
+ {
+ PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
+ if (embeddedFiles != null)
{
- PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
- if (embeddedFiles != null)
+ Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
+ if (embeddedFileNames != null)
{
- Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
- if (embeddedFileNames != null)
+ for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet())
{
- for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet())
+ if (debug)
{
- if (debugOutput)
+ SYSERR.println("Processing embedded file " + ent.getKey() + ":");
+ }
+ PDComplexFileSpecification spec = ent.getValue();
+ PDEmbeddedFile file = spec.getEmbeddedFile();
+ if (file != null && "application/pdf".equals(file.getSubtype()))
+ {
+ if (debug)
{
- System.err.println("Processing embedded file " + ent.getKey() + ":");
+ SYSERR.println(" is PDF (size=" + file.getSize() + ")");
}
- PDComplexFileSpecification spec = ent.getValue();
- PDEmbeddedFile file = spec.getEmbeddedFile();
- if (file != null && "application/pdf".equals(file.getSubtype()))
+ try (InputStream fis = file.createInputStream();
+ PDDocument subDoc = Loader.loadPDF(fis))
{
- if (debugOutput)
+ if (toHTML)
{
- System.err.println(" is PDF (size=" + file.getSize() + ")");
+ // will not really work because of HTML header + footer
+ stripper.writeText( subDoc, output );
}
- try (InputStream fis = file.createInputStream();
- PDDocument subDoc = Loader.loadPDF(fis))
+ else
{
- if (toHTML)
- {
- // will not really work because of HTML header + footer
- stripper.writeText( subDoc, output );
- }
- else
- {
- extractPages(1, subDoc.getNumberOfPages(),
- stripper, subDoc, output, rotationMagic, alwaysNext);
- }
- }
- }
- }
- }
+ extractPages(1, subDoc.getNumberOfPages(),
+ stripper, subDoc, output, rotationMagic, alwaysNext);
+ }
+ }
+ }
+ }
}
}
- stopProcessing("Time for extraction: ", startTime);
- }
- finally
- {
- IOUtils.closeQuietly(output);
- IOUtils.closeQuietly(document);
}
+ stopProcessing("Time for extraction: ", startTime);
}
+ catch (IOException ioe)
+ {
+ SYSERR.println( "Error extracting test for document: " + ioe.getMessage());
+ return 4;
+ }
+
+ return 0;
}
private void extractPages(int startPage, int endPage,
@@ -376,20 +323,20 @@ public final class ExtractText
private long startProcessing(String message)
{
- if (debugOutput)
+ if (debug)
{
- System.err.println(message);
+ SYSERR.println(message);
}
return System.currentTimeMillis();
}
private void stopProcessing(String message, long startTime)
{
- if (debugOutput)
+ if (debug)
{
long stopTime = System.currentTimeMillis();
float elapsedTime = ((float)(stopTime - startTime))/1000;
- System.err.println(message + elapsedTime + " seconds");
+ SYSERR.println(message + elapsedTime + " seconds");
}
}
@@ -400,36 +347,6 @@ public final class ExtractText
m.concatenate(text.getFont().getFontMatrix());
return (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
}
-
- /**
- * This will print the usage requirements and exit.
- */
- private static void usage()
- {
- String message = "Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [options] <inputfile> [output-text-file]\n"
- + "\nOptions:\n"
- + " -password <password> : Password to decrypt document\n"
- + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE,\n"
- + " UTF-16LE, etc.\n"
- + " -console : Send text to console instead of file\n"
- + " -html : Output in HTML format instead of raw text\n"
- + " -sort : Sort the text before writing\n"
- + " -ignoreBeads : Disables the separation by beads\n"
- + " -debug : Enables debug output about the time consumption\n"
- + " of every stage\n"
- + " -alwaysNext : Process next page (if applicable) despite\n"
- + " IOException (ignored when -html)\n"
- + " -rotationMagic : Analyze each page for rotated/skewed text,\n"
- + " rotate to 0° and extract separately\n"
- + " (slower, and ignored when -html)\n"
- + " -startPage <number> : The first page to start extraction (1 based)\n"
- + " -endPage <number> : The last page to extract (1 based, inclusive)\n"
- + " <inputfile> : The PDF document to use\n"
- + " [output-text-file] : The file to write the text to";
-
- System.err.println(message);
- System.exit( 1 );
- }
}
/**
Modified: pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestExtractText.java?rev=1884160&r1=1884159&r2=1884160&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestExtractText.java (original)
+++ pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestExtractText.java Sun Dec 6 17:49:29 2020
@@ -43,8 +43,8 @@ class TestExtractText
System.setOut(new PrintStream(outBytes));
try
{
- ExtractText.main(new String[]{"src/test/resources/org/apache/pdfbox/testPDFPackage.pdf",
- "-console", "-encoding UTF-8"});
+ ExtractText.test(new String[]{"src/test/resources/org/apache/pdfbox/testPDFPackage.pdf",
+ "-console", "-encoding", "UTF-8"});
}
finally
{