You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2014/09/05 19:40:30 UTC
svn commit: r1622746 - in /pdfbox/trunk: pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImage.java tools/src/main/java/org/apache/pdfbox/tools/ExtractImages.java

Author: jahewson
Date: Fri Sep  5 17:40:30 2014
New Revision: 1622746

URL: http://svn.apache.org/r1622746
Log:
PDFBOX-2313: Only extract images which are used in the content stream

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImage.java
    pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractImages.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImage.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImage.java?rev=1622746&r1=1622745&r2=1622746&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImage.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImage.java Fri Sep  5 17:40:30 2014
@@ -132,4 +132,9 @@ public interface PDImage extends COSObje
      * Sets the Interpolate flag, true for high-quality image scaling.
      */
     public void setInterpolate(boolean value);
+
+    /**
+     * Returns the suffix for this image type, e.g. "jpg"
+     */
+    public String getSuffix();
 }

Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractImages.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractImages.java?rev=1622746&r1=1622745&r2=1622746&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractImages.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractImages.java Fri Sep  5 17:40:30 2014
@@ -16,17 +16,15 @@
  */
 package org.apache.pdfbox.tools;
 
+import java.awt.geom.Point2D;
 import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashSet;
-import java.util.Iterator;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.io.IOUtils;
@@ -37,64 +35,60 @@ import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
-import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
 import org.apache.pdfbox.pdmodel.graphics.image.TIFFInputStream;
 import org.apache.pdfbox.util.ImageIOUtil;
+import org.apache.pdfbox.util.PDFGraphicsStreamEngine;
 
 /**
- * This will read a read pdf and extract images. <br/><br/>
+ * Extracts the images from a PDF file.
  *
- * usage: java org.apache.pdfbox.tools.ExtractImages &lt;pdffile&gt; &lt;password&gt; [imageprefix]
+ * <p>usage: java org.apache.pdfbox.tools.ExtractImages [OPTIONS] &lt;PDF file&gt;
  *
- * @author  <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
- * @version $Revision: 1.7 $
+ * @author Ben Litchfield
  */
 public class ExtractImages
 {
-    private int imageCounter = 1;
-    private Set<COSStream> seen = new HashSet<COSStream>();
-
     private static final String PASSWORD = "-password";
     private static final String PREFIX = "-prefix";
-    private static final String ADDKEY = "-addkey";
     private static final String NONSEQ = "-nonSeq";
     private static final String DIRECTJPEG = "-directJPEG";
 
-    private static final List<String> DCT_FILTERS = new ArrayList<String>();
+    private static final List<String> JPEG = Arrays.asList(
+            COSName.DCT_DECODE.getName(),
+            COSName.DCT_DECODE_ABBREVIATION.getName());
 
-    static
-    {
-        DCT_FILTERS.add( COSName.DCT_DECODE.getName() );
-        DCT_FILTERS.add( COSName.DCT_DECODE_ABBREVIATION.getName() );
-    }
+    private boolean directJPEG;
+    private String prefix;
+
+    private Set<COSStream> seen = new HashSet<COSStream>();
+    private int imageCounter = 1;
 
     private ExtractImages()
     {
     }
 
     /**
-     * This is the entry point for the application.
+     * Entry point for the application.
      *
      * @param args The command-line arguments.
-     *
      * @throws Exception If there is an error decrypting the document.
      */
-    public static void main( String[] args ) throws Exception
+    public static void main(String[] args) throws Exception
     {
         // suppress the Dock icon on OS X
         System.setProperty("apple.awt.UIElement", "true");
 
         ExtractImages extractor = new ExtractImages();
-        extractor.extractImages( args );
+        extractor.run(args);
     }
 
-    private void extractImages( String[] args ) throws Exception
+    private void run(String[] args) throws Exception
     {
-        if( args.length < 1 || args.length > 4 )
+        if (args.length < 1 || args.length > 4)
         {
             usage();
         }
@@ -102,166 +96,230 @@ public class ExtractImages
         {
             String pdfFile = null;
             String password = "";
-            String prefix = null;
-            boolean addKey = false;
             boolean useNonSeqParser = false;
-            boolean directJPEG = false;
-            for( int i=0; i<args.length; i++ )
+            for(int i = 0; i < args.length; i++)
             {
-                if( args[i].equals( PASSWORD ) )
+                if (args[i].equals(PASSWORD))
                 {
                     i++;
-                    if( i >= args.length )
+                    if (i >= args.length)
                     {
                         usage();
                     }
                     password = args[i];
                 }
-                else if( args[i].equals( PREFIX ) )
+                else if (args[i].equals(PREFIX))
                 {
                     i++;
-                    if( i >= args.length )
+                    if (i >= args.length)
                     {
                         usage();
                     }
                     prefix = args[i];
                 }
-                else if( args[i].equals( ADDKEY ) )
-                {
-                    addKey = true;
-                }
-                else if( args[i].equals( NONSEQ ) )
+                else if (args[i].equals(NONSEQ))
                 {
                     useNonSeqParser = true;
                 }
-                else if( args[i].equals( DIRECTJPEG ) )
+                else if (args[i].equals(DIRECTJPEG))
                 {
                     directJPEG = true;
                 }
                 else
                 {
-                    if( pdfFile == null )
+                    if (pdfFile == null)
                     {
                         pdfFile = args[i];
                     }
                 }
             }
-            if(pdfFile == null)
+            if (pdfFile == null)
             {
                 usage();
             }
             else
             {
-                if( prefix == null && pdfFile.length() >4 )
+                if (prefix == null && pdfFile.length() >4)
                 {
-                    prefix = pdfFile.substring( 0, pdfFile.length() -4 );
+                    prefix = pdfFile.substring(0, pdfFile.length() -4);
                 }
 
-                PDDocument document = null;
+                extract(pdfFile, password, useNonSeqParser);
+            }
+        }
+    }
 
-                try
-                {
-                    if (useNonSeqParser)
-                    {
-                        document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
-                    }
-                    else
-                    {
-                        document = PDDocument.load( pdfFile );
-    
-                        if( document.isEncrypted() )
-                        {
-                            StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
-                            document.openProtection(spm);
-                        }
-                    }
-                    AccessPermission ap = document.getCurrentAccessPermission();
-                    if( ! ap.canExtractContent() )
-                    {
-                        throw new IOException(
-                            "Error: You do not have permission to extract images." );
-                    }
+    /**
+     * Print the usage requirements and exit.
+     */
+    private static void usage()
+    {
+        System.err.println("Usage: java org.apache.pdfbox.tools.ExtractImages [OPTIONS] <PDF file>\n" +
+                "  -password  <password>        Password to decrypt document\n" +
+                "  -prefix  <image-prefix>      Image prefix(default to pdf name)\n" +
+                "  -nonSeq                      Enables the new non-sequential parser\n" +
+                "  -directJPEG                  Forces the direct extraction of JPEG images regardless of colorspace\n" +
+                "  <PDF file>                   The PDF document to use\n");
+        System.exit(1);
+    }
 
-                    List pages = document.getDocumentCatalog().getAllPages();
-                    Iterator iter = pages.iterator();
-                    while( iter.hasNext() )
-                    {
-                        PDPage page = (PDPage)iter.next();
-                        PDResources resources = page.getResources();
-                        // extract all XObjectImages which are part of the page resources
-                        processResources(resources, prefix, addKey, directJPEG);
-                    }
-                }
-                finally
+    private void extract(String pdfFile, String password, boolean useNonSeq) throws IOException
+    {
+        PDDocument document = null;
+        try
+        {
+            if (useNonSeq)
+            {
+                document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
+            }
+            else
+            {
+                document = PDDocument.load(pdfFile);
+
+                if (document.isEncrypted())
                 {
-                    if( document != null )
-                    {
-                        document.close();
-                    }
+                    StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
+                    document.openProtection(spm);
                 }
             }
+            AccessPermission ap = document.getCurrentAccessPermission();
+            if (! ap.canExtractContent())
+            {
+                throw new IOException("You do not have permission to extract images");
+            }
+
+            for (int i = 0; i < document.getNumberOfPages(); i++) // todo: ITERATOR would be much better
+            {
+                PDPage page = document.getPage(i);
+                ImageGraphicsEngine extractor = new ImageGraphicsEngine(page);
+                extractor.run();
+            }
+        }
+        finally
+        {
+            if (document != null)
+            {
+                document.close();
+            }
         }
     }
 
-    private void processResources(PDResources resources, String prefix, 
-            boolean addKey, boolean directJPEG) throws IOException
+    private class ImageGraphicsEngine extends PDFGraphicsStreamEngine
     {
-        if (resources == null)
+        protected ImageGraphicsEngine(PDPage page) throws IOException
         {
-            return;
+            super(page);
         }
-        Map<String, PDXObject> xobjects = resources.getXObjects();
-        if( xobjects != null )
+
+        public void run() throws IOException
         {
-            Iterator<String> xobjectIter = xobjects.keySet().iterator();
-            while( xobjectIter.hasNext() )
+            PDPage page = getPage();
+            if (page.getContents() != null)
             {
-                String key = xobjectIter.next();
-                PDXObject xobject = xobjects.get( key );
-                // write the images
-                if (xobject instanceof PDImageXObject)
-                {
-                    if (seen.contains(xobject.getCOSStream()))
-                    {
-                        // skip duplicate image
-                        continue;
-                    }
-                    seen.add(xobject.getCOSStream());
-
-                    PDImageXObject image = (PDImageXObject)xobject;
-                    String name = null;
-                    if (addKey) 
-                    {
-                        name = prefix + "-" + imageCounter + "_" + key;
-                    }
-                    else 
-                    {
-                        name = prefix + "-" + imageCounter;
-                    }
-                    imageCounter++;
+                PDResources resources = page.findResources();
+                processStream(resources, page.getContents().getStream(), page.findCropBox());
+            }
+            else
+            {
+                initStream(page.findCropBox());
+            }
+        }
 
-                    System.out.println( "Writing image:" + name );
-                    write2file( image, name, directJPEG );
-                }
-                // maybe there are more images embedded in a form object
-                else if (xobject instanceof PDFormXObject)
+        @Override
+        public void drawImage(PDImage pdImage) throws IOException
+        {
+            if (pdImage instanceof PDImageXObject)
+            {
+                PDImageXObject xobject = (PDImageXObject)pdImage;
+                if (seen.contains(xobject.getCOSStream()))
                 {
-                    PDFormXObject xObjectForm = (PDFormXObject)xobject;
-                    PDResources formResources = xObjectForm.getResources();
-                    processResources(formResources, prefix, addKey, directJPEG);
+                    // skip duplicate image
+                    return;
                 }
+                seen.add(xobject.getCOSStream());
             }
+
+            // save image
+            String name = prefix + "-" + imageCounter;
+            imageCounter++;
+
+            System.out.println("Writing image: " + name);
+            write2file(pdImage, name, directJPEG);
+        }
+
+        @Override
+        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
+                throws IOException
+        {
+
+        }
+
+        @Override
+        public void clip(int windingRule) throws IOException
+        {
+
+        }
+
+        @Override
+        public void moveTo(float x, float y) throws IOException
+        {
+
+        }
+
+        @Override
+        public void lineTo(float x, float y) throws IOException
+        {
+
+        }
+
+        @Override
+        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
+                throws IOException
+        {
+
+        }
+
+        @Override
+        public Point2D getCurrentPoint() throws IOException
+        {
+            return new Point2D.Float(0, 0);
+        }
+
+        @Override
+        public void closePath() throws IOException
+        {
+
+        }
+
+        @Override
+        public void endPath() throws IOException
+        {
+
+        }
+
+        @Override
+        public void strokePath() throws IOException
+        {
+
+        }
+
+        @Override
+        public void fillPath(int windingRule) throws IOException
+        {
+
+        }
+
+        @Override
+        public void fillAndStrokePath(int windingRule) throws IOException
+        {
+
+        }
+
+        @Override
+        public void shadingFill(COSName shadingName) throws IOException
+        {
+
         }
-        resources.clearCache();
-    }
-    
-    // get and write the unmodified JPEG stream
-    private void writeJpeg2OutputStream(PDImageXObject ximage, OutputStream out)
-            throws IOException
-    {
-        InputStream data = ximage.getPDStream().getPartiallyFilteredStream(DCT_FILTERS);        
-        IOUtils.copy(data, out);
-        IOUtils.closeQuietly(data);
     }
 
     /**
@@ -270,48 +328,45 @@ public class ExtractImages
      * @param filename the filename
      * @throws IOException When somethings wrong with the corresponding file.
      */
-    private void write2file(PDImageXObject xobj, String filename, boolean directJPEG) throws IOException
+    private void write2file(PDImage pdImage, String filename, boolean directJPEG) throws IOException
     {
-        if (xobj.getSuffix() == null || xobj.getSuffix().isEmpty())
+        String suffix = pdImage.getSuffix();
+        if (suffix == null)
         {
-            System.err.println ("image has no suffix, skipped");
-            System.err.println ("filter(s): " + xobj.getCOSStream().getFilters());
-            return;
+            suffix = "png";
         }
 
         FileOutputStream out = null;
         try
         {
-            out = new FileOutputStream(filename + "." + xobj.getSuffix());
-            BufferedImage image = xobj.getImage();
+            out = new FileOutputStream(filename + "." + suffix);
+            BufferedImage image = pdImage.getImage();
             if (image != null)
             {
-                if ("tiff".equals(xobj.getSuffix()))
+                if ("tiff".equals(suffix))
                 {
-                    TIFFInputStream.writeToOutputStream(xobj, out);
+                    TIFFInputStream.writeToOutputStream(pdImage, out);
                 }
-                else if ("jpg".equals(xobj.getSuffix()))
+                else if ("jpg".equals(suffix))
                 {
-                    String colorSpaceName = xobj.getColorSpace().getName();
-                    if (directJPEG ||
-                            PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
-                            PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName))
+                    String colorSpaceName = pdImage.getColorSpace().getName();
+                    if (directJPEG || PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
+                                      PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName))
                     {
-                        // directJPEG option, RGB or Gray colorspace:
-                        // get and write the unmodified JPEG stream
-                        writeJpeg2OutputStream(xobj, out);
+                        // directJPEG option, or RGB/Gray colorspace: get and write the unmodified JPEG stream
+                        InputStream data = pdImage.getStream().getPartiallyFilteredStream(JPEG);
+                        IOUtils.copy(data, out);
+                        IOUtils.closeQuietly(data);
                     }
                     else
                     {
-                        // CMYK and other "unusual" colorspaces
-                        // create BufferedImage with correct colors and then save into a 
-                        // JPEG (some quality loss)
-                        ImageIOUtil.writeImage(xobj.getImage(), xobj.getSuffix(), out);
+                        // for CMYK and other "unusual" colorspaces, the JPEG will be converted
+                        ImageIOUtil.writeImage(image, suffix, out);
                     }
                 }
                 else 
                 {
-                    ImageIOUtil.writeImage(image, xobj.getSuffix(), out);
+                    ImageIOUtil.writeImage(image, suffix, out);
                 }
             }
             out.flush();
@@ -324,21 +379,4 @@ public class ExtractImages
             }
         }
     }
-
-    /**
-     * This will print the usage requirements and exit.
-     */
-    private static void usage()
-    {
-        System.err.println( "Usage: java org.apache.pdfbox.tools.ExtractImages [OPTIONS] <PDF file>\n" +
-            "  -password  <password>        Password to decrypt document\n" +
-            "  -prefix  <image-prefix>      Image prefix(default to pdf name)\n" +
-            "  -addkey                      add the internal image key to the file name\n" +
-            "  -nonSeq                      Enables the new non-sequential parser\n" +
-            "  -directJPEG                  Forces the direct extraction of JPEG images regardless of colorspace\n" +
-            "  <PDF file>                   The PDF document to use\n"
-            );
-        System.exit( 1 );
-    }
-
 }