You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2016/01/05 08:42:04 UTC

svn commit: r1723009 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox: pdmodel/graphics/image/ util/filetypedetector/

Author: tilman
Date: Tue Jan  5 07:42:03 2016
New Revision: 1723009

URL: http://svn.apache.org/viewvc?rev=1723009&view=rev
Log:
PDFBOX-3163: added creating PDImageXObject by content as suggested by Benjamin Gamard and Maciej Woźniak, with Apache 2 licensed content detection code from Drew Noakes

Added:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/ByteTrie.java   (with props)
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileType.java   (with props)
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileTypeDetector.java   (with props)
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImageXObject.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImageXObject.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImageXObject.java?rev=1723009&r1=1723008&r2=1723009&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImageXObject.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImageXObject.java Tue Jan  5 07:42:03 2016
@@ -21,6 +21,7 @@ import java.awt.Paint;
 import java.awt.RenderingHints;
 import java.awt.image.BufferedImage;
 import java.awt.image.WritableRaster;
+import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -44,6 +45,8 @@ import org.apache.pdfbox.pdmodel.common.
 import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
+import org.apache.pdfbox.util.filetypedetector.FileType;
+import org.apache.pdfbox.util.filetypedetector.FileTypeDetector;
 
 /**
  * An Image XObject.
@@ -145,7 +148,7 @@ public final class PDImageXObject extend
     }
     
     /**
-     * Create a PDImageXObject from an image file, see {@link #createFromFile(File, PDDocument)} for
+     * Create a PDImageXObject from an image file, see {@link #createFromFileByExtension(File, PDDocument)} for
      * more details.
      *
      * @param imagePath the image file path.
@@ -156,7 +159,7 @@ public final class PDImageXObject extend
      */
     public static PDImageXObject createFromFile(String imagePath, PDDocument doc) throws IOException
     {
-        return createFromFile(new File(imagePath), doc);
+        return createFromFileByExtension(new File(imagePath), doc);
     }
 
     /**
@@ -174,13 +177,13 @@ public final class PDImageXObject extend
      * PDImageXObject.
      * @throws IllegalArgumentException if the image type is not supported.
      */
-    public static PDImageXObject createFromFile(File file, PDDocument doc) throws IOException
+    public static PDImageXObject createFromFileByExtension(File file, PDDocument doc) throws IOException
     {
         String name = file.getName();
         int dot = file.getName().lastIndexOf('.');
         if (dot == -1)
         {
-            throw new IOException("Image type not supported: " + name);
+            throw new IllegalArgumentException("Image type not supported: " + name);
         }
         String ext = name.substring(dot + 1).toLowerCase();
         if ("jpg".equals(ext) || "jpeg".equals(ext))
@@ -196,7 +199,63 @@ public final class PDImageXObject extend
             BufferedImage bim = ImageIO.read(file);
             return LosslessFactory.createFromImage(doc, bim);
         }
-        throw new IOException("Image type not supported: " + name);
+        throw new IllegalArgumentException("Image type not supported: " + name);
+    }
+
+    /**
+     * Create a PDImageXObject from an image file. The file format is determined by the file
+     * content, not by the file name. The following formats are supported: JPEG, TIFF, GIF, BMP
+     * and PNG. This is a convenience method that calls {@link JPEGFactory#createFromStream},
+     * {@link CCITTFactory#createFromFile} or {@link ImageIO#read} combined with
+     * {@link LosslessFactory#createFromImage}. (The latter can also be used to create a
+     * PDImageXObject from a BufferedImage).
+     *
+     * @param file the image file.
+     * @param doc the document that shall use this PDImageXObject.
+     * @return a PDImageXObject.
+     * @throws IOException if there is an error when reading the file or creating the
+     * PDImageXObject.
+     * @throws IllegalArgumentException if the image type is not supported.
+     */
+    public static PDImageXObject createFromFileByContent(File file, PDDocument doc) throws IOException
+    {
+        // step 1: sniff the file type from the leading bytes of the file
+        FileInputStream fileInputStream = null;
+        BufferedInputStream bufferedInputStream = null;
+        FileType fileType = null;
+        try
+        {
+            fileInputStream = new FileInputStream(file);
+            bufferedInputStream = new BufferedInputStream(fileInputStream);
+            fileType = FileTypeDetector.detectFileType(bufferedInputStream);
+        }
+        catch (IOException e)
+        {
+            throw new IOException("Could not determine file type: " + file.getName(), e);
+        }
+        finally
+        {
+            // the detection streams are only needed for sniffing; the factories below
+            // open the file again on their own
+            IOUtils.closeQuietly(fileInputStream);
+            IOUtils.closeQuietly(bufferedInputStream);
+        }
+        if (fileType == null)
+        {
+            throw new IllegalArgumentException("Image type not supported: " + file.getName());
+        }
+
+        // step 2: dispatch to the factory that handles the detected type
+        if (fileType.equals(FileType.JPEG))
+        {
+            // this stream is opened here, so it must also be closed here; the factory is
+            // not relied upon to do it
+            FileInputStream jpegInputStream = new FileInputStream(file);
+            try
+            {
+                return JPEGFactory.createFromStream(doc, jpegInputStream);
+            }
+            finally
+            {
+                IOUtils.closeQuietly(jpegInputStream);
+            }
+        }
+        if (fileType.equals(FileType.TIFF))
+        {
+            return CCITTFactory.createFromFile(doc, file);
+        }
+        if (fileType.equals(FileType.BMP) || fileType.equals(FileType.GIF) || fileType.equals(FileType.PNG))
+        {
+            BufferedImage bim = ImageIO.read(file);
+            return LosslessFactory.createFromImage(doc, bim);
+        }
+        throw new IllegalArgumentException("Image type not supported: " + file.getName());
+    }
 
     // repairs parameters using decode result

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/ByteTrie.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/ByteTrie.java?rev=1723009&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/ByteTrie.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/ByteTrie.java Tue Jan  5 07:42:03 2016
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util.filetypedetector;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * @author Drew Noakes
+ *
+ * code taken from https://github.com/drewnoakes/metadata-extractor
+ *
+ * 2016-01-04
+ *
+ * latest commit number 73f1a48
+ *
+ * Stores values using a prefix tree (aka 'trie', i.e. reTRIEval data structure).
+ *
+ * @param <T> the type of value to store for byte sequences
+ */
+class ByteTrie<T>
+{
+    /**
+     * One node of the trie. It maps the next byte of a sequence to a child node and can
+     * optionally carry a value.
+     */
+    static class ByteTrieNode<T>
+    {
+        private final Map<Byte, ByteTrieNode<T>> children = new HashMap<Byte, ByteTrieNode<T>>();
+        private T value = null;
+
+        /**
+         * Assigns the value of this node; a non-null value can never be replaced.
+         *
+         * @param value the value to store in this node.
+         * @throws IllegalStateException if this node already holds a non-null value.
+         */
+        public void setValue(T value)
+        {
+            if (this.value == null)
+            {
+                this.value = value;
+                return;
+            }
+            throw new IllegalStateException("Value already set for this trie node");
+        }
+
+        /**
+         * @return the value stored in this node, or <code>null</code> if there is none.
+         */
+        public T getValue()
+        {
+            return value;
+        }
+    }
+
+    private final ByteTrieNode<T> root = new ByteTrieNode<T>();
+    private int maxDepth;
+
+    /**
+     * Looks up the most specific value stored along the given byte sequence: the trie is
+     * descended byte by byte for as long as a matching child exists, and the value of the
+     * deepest visited node that has one wins.
+     *
+     * @param bytes the byte sequence to look up.
+     * @return the value of the deepest matching node that has a value; otherwise the value of
+     * the root node, which is <code>null</code> unless a default was set with
+     * {@link ByteTrie#setDefaultValue}.
+     */
+    public T find(byte[] bytes)
+    {
+        ByteTrieNode<T> current = root;
+        T best = current.getValue();
+        for (int i = 0; i < bytes.length; i++)
+        {
+            current = current.children.get(bytes[i]);
+            if (current == null)
+            {
+                // no stored path continues with this byte
+                return best;
+            }
+            T candidate = current.getValue();
+            if (candidate != null)
+            {
+                best = candidate;
+            }
+        }
+        return best;
+    }
+
+    /**
+     * Stores the given value at the path formed by concatenating all parts, creating the
+     * missing nodes on the way.
+     *
+     * @param value the value to store at the end of the path.
+     * @param parts the byte arrays that, in the given order, make up the path.
+     * @throws IllegalStateException if a value is already stored at that path.
+     */
+    public void addPath(T value, byte[]... parts)
+    {
+        ByteTrieNode<T> current = root;
+        int length = 0;
+        for (byte[] part : parts)
+        {
+            for (int i = 0; i < part.length; i++)
+            {
+                Byte key = part[i];
+                ByteTrieNode<T> next = current.children.get(key);
+                if (next == null)
+                {
+                    next = new ByteTrieNode<T>();
+                    current.children.put(key, next);
+                }
+                current = next;
+            }
+            length += part.length;
+        }
+        current.setValue(value);
+        if (length > maxDepth)
+        {
+            maxDepth = length;
+        }
+    }
+
+    /**
+     * Sets the value that {@link ByteTrie#find(byte[])} returns when no path matches. It is
+     * stored as the value of the root node.
+     *
+     * @param defaultValue the value to return when nothing matches.
+     * @throws IllegalStateException if the root node already holds a value.
+     */
+    public void setDefaultValue(T defaultValue)
+    {
+        root.setValue(defaultValue);
+    }
+
+    /**
+     * Gets the length, in bytes, of the longest path that has been added to this trie.
+     *
+     * @return the maximum depth stored in this trie.
+     */
+    public int getMaxDepth()
+    {
+        return maxDepth;
+    }
+}

Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/ByteTrie.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileType.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileType.java?rev=1723009&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileType.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileType.java Tue Jan  5 07:42:03 2016
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util.filetypedetector;
+
+/**
+ * Enumeration of the file formats that {@link FileTypeDetector} can report.
+ * {@link #UNKNOWN} is registered there as the default result, i.e. it is what the detector
+ * returns when none of its known signatures matches.
+ * <p>
+ * Code taken from https://github.com/drewnoakes/metadata-extractor on 2016-01-04
+ * (latest commit number 73f1a48).
+ *
+ * @author Drew Noakes
+ */
+public enum FileType
+{
+    UNKNOWN,
+    JPEG,
+    TIFF,
+    PSD,
+    PNG,
+    BMP,
+    GIF,
+    ICO,
+    PCX,
+    RIFF,
+
+    /** Sony camera raw. */
+    ARW,
+    /** Canon camera raw, version 1. */
+    CRW,
+    /** Canon camera raw, version 2. */
+    CR2,
+    /** Nikon camera raw. */
+    NEF,
+    /** Olympus camera raw. */
+    ORF,
+    /** FujiFilm camera raw. */
+    RAF,
+    /** Panasonic camera raw. */
+    RW2
+}
\ No newline at end of file

Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileType.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileTypeDetector.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileTypeDetector.java?rev=1723009&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileTypeDetector.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileTypeDetector.java Tue Jan  5 07:42:03 2016
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util.filetypedetector;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+/**
+ * @author Drew Noakes
+ *
+ * code taken from https://github.com/drewnoakes/metadata-extractor
+ *
+ * 2016-01-04
+ *
+ * latest commit number 73f1a48
+ *
+ * Examines a file's first bytes and estimates the file's type.
+ */
+public class FileTypeDetector
+{
+    private static final ByteTrie<FileType> root;
+
+    static
+    {
+        // All textual signatures are plain ASCII. Encode them with a fixed charset so that the
+        // registered magic numbers do not depend on the platform default encoding.
+        final Charset ascii = Charset.forName("US-ASCII");
+
+        root = new ByteTrie<FileType>();
+        root.setDefaultValue(FileType.UNKNOWN);
+
+        // https://en.wikipedia.org/wiki/List_of_file_signatures
+
+        root.addPath(FileType.JPEG, new byte[]{(byte)0xff, (byte)0xd8});
+        root.addPath(FileType.TIFF, "II".getBytes(ascii), new byte[]{0x2a, 0x00});
+        root.addPath(FileType.TIFF, "MM".getBytes(ascii), new byte[]{0x00, 0x2a});
+        root.addPath(FileType.PSD, "8BPS".getBytes(ascii));
+        root.addPath(FileType.PNG, new byte[]{(byte)0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52});
+        // TODO technically there are other very rare magic numbers for OS/2 BMP files...
+        root.addPath(FileType.BMP, "BM".getBytes(ascii));
+        root.addPath(FileType.GIF, "GIF87a".getBytes(ascii));
+        root.addPath(FileType.GIF, "GIF89a".getBytes(ascii));
+        root.addPath(FileType.ICO, new byte[]{0x00, 0x00, 0x01, 0x00});
+        // multiple PCX versions, explicitly listed
+        root.addPath(FileType.PCX, new byte[]{0x0A, 0x00, 0x01});
+        root.addPath(FileType.PCX, new byte[]{0x0A, 0x02, 0x01});
+        root.addPath(FileType.PCX, new byte[]{0x0A, 0x03, 0x01});
+        root.addPath(FileType.PCX, new byte[]{0x0A, 0x05, 0x01});
+        root.addPath(FileType.RIFF, "RIFF".getBytes(ascii));
+
+        // camera raw formats; several of them extend one of the TIFF signatures above, so the
+        // longer (more specific) match wins in the trie lookup
+        root.addPath(FileType.ARW, "II".getBytes(ascii), new byte[]{0x2a, 0x00, 0x08, 0x00});
+        root.addPath(FileType.CRW, "II".getBytes(ascii), new byte[]{0x1a, 0x00, 0x00, 0x00}, "HEAPCCDR".getBytes(ascii));
+        root.addPath(FileType.CR2, "II".getBytes(ascii), new byte[]{0x2a, 0x00, 0x10, 0x00, 0x00, 0x00, 0x43, 0x52});
+        root.addPath(FileType.NEF, "MM".getBytes(ascii), new byte[]{0x00, 0x2a, 0x00, 0x00, 0x00, (byte)0x80, 0x00});
+        root.addPath(FileType.ORF, "IIRO".getBytes(ascii), new byte[]{(byte)0x08, 0x00});
+        root.addPath(FileType.ORF, "IIRS".getBytes(ascii), new byte[]{(byte)0x08, 0x00});
+        root.addPath(FileType.RAF, "FUJIFILMCCD-RAW".getBytes(ascii));
+        root.addPath(FileType.RW2, "II".getBytes(ascii), new byte[]{0x55, 0x00});
+    }
+
+    /**
+     * Utility class with static members only.
+     *
+     * @throws UnsupportedOperationException always, this class is not meant to be instantiated.
+     */
+    private FileTypeDetector()
+    {
+        throw new UnsupportedOperationException("Not intended for instantiation");
+    }
+
+    /**
+     * Examines a file's first bytes and estimates the file's type.
+     * <p>
+     * Requires a {@link BufferedInputStream} in order to mark and reset the stream to the position
+     * at which it was provided to this method once completed.
+     * <p>
+     * At most as many bytes as the longest known signature are read. If the stream holds fewer
+     * bytes than that, only the bytes that are actually available take part in the match.
+     *
+     * @param inputStream the stream to examine; it is reset to its initial position before this
+     * method returns or reports an empty stream.
+     * @return the detected file type, or {@link FileType#UNKNOWN} if no signature matches.
+     * @throws IOException if an IO error occurred or the input stream contains no data at all.
+     */
+    public static FileType detectFileType(final BufferedInputStream inputStream) throws IOException
+    {
+        if (!inputStream.markSupported())
+        {
+            throw new IOException("Stream must support mark/reset");
+        }
+
+        int maxByteCount = root.getMaxDepth();
+
+        inputStream.mark(maxByteCount);
+
+        // a single read() call may deliver fewer bytes than requested, so keep reading until
+        // the buffer is full or the stream ends
+        byte[] bytes = new byte[maxByteCount];
+        int bytesRead = 0;
+        while (bytesRead < maxByteCount)
+        {
+            int count = inputStream.read(bytes, bytesRead, maxByteCount - bytesRead);
+            if (count < 0)
+            {
+                break;
+            }
+            bytesRead += count;
+        }
+
+        inputStream.reset();
+
+        if (bytesRead == 0)
+        {
+            throw new IOException("Stream ended before file's magic number could be determined.");
+        }
+
+        if (bytesRead < maxByteCount)
+        {
+            // do not let the unused, zero-filled tail of the buffer take part in the match
+            byte[] available = new byte[bytesRead];
+            System.arraycopy(bytes, 0, available, 0, bytesRead);
+            bytes = available;
+        }
+
+        return root.find(bytes);
+    }
+}

Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/filetypedetector/FileTypeDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native