You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2023/05/18 17:30:38 UTC

svn commit: r1909914 - in /pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools: ExtractXMP.java PDFBox.java

Author: tilman
Date: Thu May 18 17:30:38 2023
New Revision: 1909914

URL: http://svn.apache.org/viewvc?rev=1909914&view=rev
Log:
PDFBOX-5598: create a command line utility to extract document XMP content or page XMP content

Added:
    pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractXMP.java   (with props)
Modified:
    pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFBox.java

Added: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractXMP.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractXMP.java?rev=1909914&view=auto
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractXMP.java (added)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractXMP.java Thu May 18 17:30:38 2023
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import java.util.concurrent.Callable;
+
+import org.apache.commons.io.FilenameUtils;
+
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.common.PDMetadata;
+
+import picocli.CommandLine;
+
+/**
+ * Extract the XMP metadata from the document or from a page.
+ *
+ * @author Tilman Hausherr
+ */
+@CommandLine.Command(name = "extractxmp", header = "Extracts the xmp stream from a PDF document", versionProvider = Version.class, mixinStandardHelpOptions = true)
+public class ExtractXMP implements Callable<Integer>
+{
+    // Expected for CLI app to write to System.out/System.err
+    @SuppressWarnings("squid:S106")
+    private static final PrintStream SYSOUT = System.out;
+    @SuppressWarnings("squid:S106")
+    private static final PrintStream SYSERR = System.err;
+
+    @CommandLine.Option(names = "-page", description = "extract the XMP information from a specific page (1 based)")
+    private int page = 0;
+
+    @CommandLine.Option(names = "-password", description = "the password for the PDF or certificate in keystore.", arity = "0..1", interactive = true)    
+    private String password = "";
+
+    @CommandLine.Option(names = "-console", description = "Send text to console instead of file")
+    private boolean toConsole = false;
+
+    @CommandLine.Option(names = {"-i", "--input"}, description = "the PDF file", required = true)
+    private File infile;
+
+    @CommandLine.Option(names = {"-o", "--output"}, description = "the exported text file")
+    private File outfile;
+
+    /**
+     * Infamous main method.
+     *
+     * @param args Command line arguments, should be one and a reference to a file.
+     */
+    public static void main(String[] args)
+    {
+        // suppress the Dock icon on OS X
+        System.setProperty("apple.awt.UIElement", "true");
+
+        int exitCode = new CommandLine(new ExtractText()).execute(args);
+        System.exit(exitCode);
+    }
+
+    /**
+     * Starts the xmp extraction.
+     */
+    @Override
+    public Integer call()
+    {
+        if (outfile == null)
+        {
+            String outPath = FilenameUtils.removeExtension(infile.getAbsolutePath()) + ".xml";
+            outfile = new File(outPath);
+        }
+        
+        try (PDDocument document = Loader.loadPDF(infile, password))
+        {
+            PDDocumentCatalog catalog = document.getDocumentCatalog();
+            PDMetadata meta;
+            if (page == 0)
+            {
+                meta = catalog.getMetadata();
+            }
+            else
+            {
+                if (page > document.getNumberOfPages())
+                {
+                    SYSERR.println("Page " + page + " doesn't exist");
+                    return 1;
+                }
+                meta = document.getPage(page - 1).getMetadata();
+            }
+            if (meta == null)
+            {
+                SYSERR.println("No XMP metadata available");
+                return 1;
+            }
+            try (PrintStream ps = toConsole ? SYSOUT : new PrintStream(outfile))
+            {
+                ps.write(meta.toByteArray());
+            }
+        }
+        catch (IOException ioe)
+        {
+            SYSERR.println( "Error extracting text for document [" + ioe.getClass().getSimpleName() + "]: " + ioe.getMessage());
+            return 4;
+        }
+
+        return 0;
+    }    
+}

Propchange: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractXMP.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFBox.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFBox.java?rev=1909914&r1=1909913&r2=1909914&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFBox.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFBox.java Thu May 18 17:30:38 2023
@@ -58,6 +58,7 @@ public final class PDFBox implements Run
         commandLine.addSubcommand("encrypt", Encrypt.class);
         commandLine.addSubcommand("decode", WriteDecodedDoc.class);
         commandLine.addSubcommand("export:images", ExtractImages.class);
+        commandLine.addSubcommand("export:xmp", ExtractXMP.class);
         commandLine.addSubcommand("export:text", ExtractText.class);
         commandLine.addSubcommand("export:fdf", ExportFDF.class);
         commandLine.addSubcommand("export:xfdf", ExportXFDF.class);