You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 00:54:49 UTC

[1/5] tika git commit: fix indentation

Repository: tika
Updated Branches:
  refs/heads/2.x 5bc597dc8 -> dd3c2a486


fix indentation


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/865c45cd
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/865c45cd
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/865c45cd

Branch: refs/heads/2.x
Commit: 865c45cd569f680899cd2ede32987b1bf3f8a86e
Parents: 5bc597d
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:11:07 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:11:07 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/io/EndianUtils.java    | 831 ++++++++++---------
 1 file changed, 421 insertions(+), 410 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/865c45cd/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 3416f55..2ab85b3 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -25,418 +25,429 @@ import org.apache.tika.exception.TikaException;
  * General Endian Related Utilties.
  * <p>
  * This class provides static utility methods for input/output operations
- *  on numbers in Big and Little Endian formats.
+ * on numbers in Big and Little Endian formats.
  * <p>
  * Origin of code: Based on the version in POI
  */
 public class EndianUtils {
-   /**
-    * Get a LE short value from an InputStream
-    *
-    * @param  stream the InputStream from which the short is to be read
-    * @return                              the short (16-bit) value
-    * @exception  IOException              will be propagated back to the caller
-    * @exception  BufferUnderrunException  if the stream cannot provide enough bytes
-    */
-   public static short readShortLE(InputStream stream) throws IOException, BufferUnderrunException {
-      return (short) readUShortLE(stream);
-   }
-   /**
-    * Get a BE short value from an InputStream
-    *
-    * @param  stream the InputStream from which the short is to be read
-    * @return                              the short (16-bit) value
-    * @exception  IOException              will be propagated back to the caller
-    * @exception  BufferUnderrunException  if the stream cannot provide enough bytes
-    */
-   public static short readShortBE(InputStream stream) throws IOException, BufferUnderrunException {
-      return (short) readUShortBE(stream);
-   }
-
-   public static int readUShortLE(InputStream stream) throws IOException, BufferUnderrunException {
-      int ch1 = stream.read();
-      int ch2 = stream.read();
-      if ((ch1 | ch2) < 0) {
-         throw new BufferUnderrunException();
-      }
-      return (ch2 << 8) + (ch1 << 0);
-   }
-   public static int readUShortBE(InputStream stream) throws IOException, BufferUnderrunException {
-      int ch1 = stream.read();
-      int ch2 = stream.read();
-      if ((ch1 | ch2) < 0) {
-         throw new BufferUnderrunException();
-      }
-      return (ch1 << 8) + (ch2 << 0);
-   }
-
-
-   /**
-    * Get a LE unsigned int value from an InputStream
-    *
-    * @param  stream the InputStream from which the int is to be read
-    * @return                              the int (32-bit) value
-    * @exception  IOException              will be propagated back to the caller
-    * @exception  BufferUnderrunException  if the stream cannot provide enough bytes
-    */
-   public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException {
-      int ch1 = stream.read();
-      int ch2 = stream.read();
-      int ch3 = stream.read();
-      int ch4 = stream.read();
-      if ((ch1 | ch2 | ch3 | ch4) < 0) {
-         throw new BufferUnderrunException();
-      }
-      return ((ch4 << 24) + (ch3<<16) + (ch2 << 8) + (ch1 << 0)) & 0x00FFFFFFFFl;
-   }
-
-   /**
-    * Get a LE int value from an InputStream
-    *
-    * @param  stream the InputStream from which the int is to be read
-    * @return                              the int (32-bit) value
-    * @exception  IOException              will be propagated back to the caller
-    * @exception  BufferUnderrunException  if the stream cannot provide enough bytes
-    */
-   public static int readIntLE(InputStream stream) throws IOException, BufferUnderrunException {
-      int ch1 = stream.read();
-      int ch2 = stream.read();
-      int ch3 = stream.read();
-      int ch4 = stream.read();
-      if ((ch1 | ch2 | ch3 | ch4) < 0) {
-         throw new BufferUnderrunException();
-      }
-      return (ch4 << 24) + (ch3<<16) + (ch2 << 8) + (ch1 << 0);
-   }
-   /**
-    * Get a BE int value from an InputStream
-    *
-    * @param  stream the InputStream from which the int is to be read
-    * @return                              the int (32-bit) value
-    * @exception  IOException              will be propagated back to the caller
-    * @exception  BufferUnderrunException  if the stream cannot provide enough bytes
-    */
-   public static int readIntBE(InputStream stream) throws IOException, BufferUnderrunException {
-      int ch1 = stream.read();
-      int ch2 = stream.read();
-      int ch3 = stream.read();
-      int ch4 = stream.read();
-      if ((ch1 | ch2 | ch3 | ch4) < 0) {
-         throw new BufferUnderrunException();
-      }
-      return (ch1 << 24) + (ch2<<16) + (ch3 << 8) + (ch4 << 0);
-   }
-
-   /**
-    * Get a LE long value from an InputStream
-    *
-    * @param  stream the InputStream from which the long is to be read
-    * @return                              the long (64-bit) value
-    * @exception  IOException              will be propagated back to the caller
-    * @exception  BufferUnderrunException  if the stream cannot provide enough bytes
-    */
-   public static long readLongLE(InputStream stream) throws IOException, BufferUnderrunException {
-      int ch1 = stream.read();
-      int ch2 = stream.read();
-      int ch3 = stream.read();
-      int ch4 = stream.read();
-      int ch5 = stream.read();
-      int ch6 = stream.read();
-      int ch7 = stream.read();
-      int ch8 = stream.read();
-      if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
-         throw new BufferUnderrunException();
-      }
-
-      return
-      ((long)ch8 << 56) +
-      ((long)ch7 << 48) +
-      ((long)ch6 << 40) +
-      ((long)ch5 << 32) +
-      ((long)ch4 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
-      (ch3 << 16) +
-      (ch2 <<  8) +
-      (ch1 <<  0);
-   }
-   /**
-    * Get a NE long value from an InputStream
-    *
-    * @param  stream the InputStream from which the long is to be read
-    * @return                              the long (64-bit) value
-    * @exception  IOException              will be propagated back to the caller
-    * @exception  BufferUnderrunException  if the stream cannot provide enough bytes
-    */
-   public static long readLongBE(InputStream stream) throws IOException, BufferUnderrunException {
-      int ch1 = stream.read();
-      int ch2 = stream.read();
-      int ch3 = stream.read();
-      int ch4 = stream.read();
-      int ch5 = stream.read();
-      int ch6 = stream.read();
-      int ch7 = stream.read();
-      int ch8 = stream.read();
-      if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
-         throw new BufferUnderrunException();
-      }
-
-      return
-      ((long)ch1 << 56) +
-      ((long)ch2 << 48) +
-      ((long)ch3 << 40) +
-      ((long)ch4 << 32) +
-      ((long)ch5 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
-      (ch6 << 16) +
-      (ch7 <<  8) +
-      (ch8 <<  0);
-   }
-   
-   /**
-    * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian
-    *   but with the high bit on each number indicating if it continues or not
-    */
-   public static long readUE7(InputStream stream) throws IOException {
-       int i;
-       long v = 0;
-       while ((i = stream.read()) >= 0) {
-           v = v << 7;
-           if ((i & 128) == 128) {
-               // Continues
-               v += (i&127);
-           } else {
-               // Last value
-               v += i;
-               break;
-           }
-       }
-       return v;
-   }
-   
-   
-   /**
-    * Get a LE short value from the beginning of a byte array
-    *
-    *@param  data  the byte array
-    *@return       the short (16-bit) value
-    */
-   public static short getShortLE(byte[] data) {
-      return getShortLE(data, 0);
-   }
-   /**
-    * Get a LE short value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the short (16-bit) value
-    */
-   public static short getShortLE(byte[] data, int offset) {
-      return (short)getUShortLE(data, offset);
-   }
-
-   /**
-    * Get a LE unsigned short value from the beginning of a byte array
-    *
-    *@param  data  the byte array
-    *@return       the unsigned short (16-bit) value in an int
-    */
-   public static int getUShortLE(byte[] data) {
-      return getUShortLE(data, 0);
-   }
-   /**
-    * Get a LE unsigned short value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the unsigned short (16-bit) value in an integer
-    */
-   public static int getUShortLE(byte[] data, int offset) {
-      int b0 = data[offset] & 0xFF;
-      int b1 = data[offset+1] & 0xFF;
-      return (b1 << 8) + (b0 << 0);
-   }
-   
-   /**
-    * Get a BE short value from the beginning of a byte array
-    *
-    *@param  data  the byte array
-    *@return       the short (16-bit) value
-    */
-   public static short getShortBE(byte[] data) {
-      return getShortBE(data, 0);
-   }
-   /**
-    * Get a BE short value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the short (16-bit) value
-    */
-   public static short getShortBE(byte[] data, int offset) {
-      return (short)getUShortBE(data, offset);
-   }
-
-   /**
-    * Get a BE unsigned short value from the beginning of a byte array
-    *
-    *@param  data  the byte array
-    *@return       the unsigned short (16-bit) value in an int
-    */
-   public static int getUShortBE(byte[] data) {
-      return getUShortBE(data, 0);
-   }
-   /**
-    * Get a BE unsigned short value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the unsigned short (16-bit) value in an integer
-    */
-   public static int getUShortBE(byte[] data, int offset) {
-      int b0 = data[offset] & 0xFF;
-      int b1 = data[offset+1] & 0xFF;
-      return (b0 << 8) + (b1 << 0);
-   }
-
-   /**
-    * Get a LE int value from the beginning of a byte array
-    *
-    *@param  data  the byte array
-    *@return the int (32-bit) value
-    */
-   public static int getIntLE(byte[] data) {
-       return getIntLE(data, 0);
-   }
-   /**
-    * Get a LE int value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the int (32-bit) value
-    */
-   public static int getIntLE(byte[] data, int offset) {
-       int i=offset;
-       int b0 = data[i++] & 0xFF;
-       int b1 = data[i++] & 0xFF;
-       int b2 = data[i++] & 0xFF;
-       int b3 = data[i++] & 0xFF;
-       return (b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0);
-   }
-
-   /**
-    * Get a BE int value from the beginning of a byte array
-    *
-    *@param  data  the byte array
-    *@return the int (32-bit) value
-    */
-   public static int getIntBE(byte[] data) {
-       return getIntBE(data, 0);
-   }
-   /**
-    * Get a BE int value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the int (32-bit) value
-    */
-   public static int getIntBE(byte[] data, int offset) {
-       int i=offset;
-       int b0 = data[i++] & 0xFF;
-       int b1 = data[i++] & 0xFF;
-       int b2 = data[i++] & 0xFF;
-       int b3 = data[i++] & 0xFF;
-       return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
-   }
-
-   /**
-    * Get a LE unsigned int value from a byte array
-    *
-    *@param  data    the byte array
-    *@return         the unsigned int (32-bit) value in a long
-    */
-   public static long getUIntLE(byte[] data) {
-       return getUIntLE(data,0);
-   }
-   /**
-    * Get a LE unsigned int value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the unsigned int (32-bit) value in a long
-    */
-   public static long getUIntLE(byte[] data, int offset) {
-       long retNum = getIntLE(data, offset);
-       return retNum & 0x00FFFFFFFFl;
-   }
-
-   /**
-    * Get a BE unsigned int value from a byte array
-    *
-    *@param  data    the byte array
-    *@return         the unsigned int (32-bit) value in a long
-    */
-   public static long getUIntBE(byte[] data) {
-       return getUIntBE(data,0);
-   }
-   /**
-    * Get a BE unsigned int value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the unsigned int (32-bit) value in a long
-    */
-   public static long getUIntBE(byte[] data, int offset) {
-       long retNum = getIntBE(data, offset);
-       return retNum & 0x00FFFFFFFFl;
-   }
-
-   /**
-    * Get a LE long value from a byte array
-    *
-    *@param  data    the byte array
-    *@param  offset  a starting offset into the byte array
-    *@return         the long (64-bit) value
-    */
-   public static long getLongLE(byte[] data, int offset) {
-      long result = 0;
-
-      for (int j = offset + LONG_SIZE - 1; j >= offset; j--) {
-         result <<= 8;
-         result |= 0xff & data[j];
-      }
-      return result;
-   }
-   private static final int LONG_SIZE = 8;
-
-   
-   /**
-    *  Convert an 'unsigned' byte to an integer. ie, don't carry across the
-    *  sign.
-    *
-    * @param  b  Description of the Parameter
-    * @return    Description of the Return Value
-    */
-   public static int ubyteToInt(byte b) {
-      return b & 0xFF;
-   }
-
-   /**
-    * get the unsigned value of a byte.
-    * 
-    * @param data
-    *            the byte array.
-    * @param offset
-    *            a starting offset into the byte array.
-    * @return the unsigned value of the byte as a 16 bit short
-    */
-   public static short getUByte( byte[] data, int offset )
-   {
-      return (short) ( data[offset] & 0xFF );
-   }
-   
-   
-   public static class BufferUnderrunException extends TikaException {
-      private static final long serialVersionUID = 8358288231138076276L;
-      public BufferUnderrunException() {
-         super("Insufficient data left in stream for required read");
-      }
-   }
+    /**
+     * Get a LE short value from an InputStream
+     *
+     * @param stream the InputStream from which the short is to be read
+     * @return the short (16-bit) value
+     * @throws IOException             will be propagated back to the caller
+     * @throws BufferUnderrunException if the stream cannot provide enough bytes
+     */
+    public static short readShortLE(InputStream stream) throws IOException, BufferUnderrunException {
+        return (short) readUShortLE(stream);
+    }
+
+    /**
+     * Get a BE short value from an InputStream
+     *
+     * @param stream the InputStream from which the short is to be read
+     * @return the short (16-bit) value
+     * @throws IOException             will be propagated back to the caller
+     * @throws BufferUnderrunException if the stream cannot provide enough bytes
+     */
+    public static short readShortBE(InputStream stream) throws IOException, BufferUnderrunException {
+        return (short) readUShortBE(stream);
+    }
+
+    public static int readUShortLE(InputStream stream) throws IOException, BufferUnderrunException {
+        int ch1 = stream.read();
+        int ch2 = stream.read();
+        if ((ch1 | ch2) < 0) {
+            throw new BufferUnderrunException();
+        }
+        return (ch2 << 8) + (ch1 << 0);
+    }
+
+    public static int readUShortBE(InputStream stream) throws IOException, BufferUnderrunException {
+        int ch1 = stream.read();
+        int ch2 = stream.read();
+        if ((ch1 | ch2) < 0) {
+            throw new BufferUnderrunException();
+        }
+        return (ch1 << 8) + (ch2 << 0);
+    }
+
+
+    /**
+     * Get a LE unsigned int value from an InputStream
+     *
+     * @param stream the InputStream from which the int is to be read
+     * @return the unsigned int (32-bit) value in a long
+     * @throws IOException             will be propagated back to the caller
+     * @throws BufferUnderrunException if the stream cannot provide enough bytes
+     */
+    public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException {
+        int ch1 = stream.read();
+        int ch2 = stream.read();
+        int ch3 = stream.read();
+        int ch4 = stream.read();
+        if ((ch1 | ch2 | ch3 | ch4) < 0) {
+            throw new BufferUnderrunException();
+        }
+        return ((ch4 << 24) + (ch3 << 16) + (ch2 << 8) + (ch1 << 0)) & 0x00FFFFFFFFl;
+    }
+
+    /**
+     * Get a LE int value from an InputStream
+     *
+     * @param stream the InputStream from which the int is to be read
+     * @return the int (32-bit) value
+     * @throws IOException             will be propagated back to the caller
+     * @throws BufferUnderrunException if the stream cannot provide enough bytes
+     */
+    public static int readIntLE(InputStream stream) throws IOException, BufferUnderrunException {
+        int ch1 = stream.read();
+        int ch2 = stream.read();
+        int ch3 = stream.read();
+        int ch4 = stream.read();
+        if ((ch1 | ch2 | ch3 | ch4) < 0) {
+            throw new BufferUnderrunException();
+        }
+        return (ch4 << 24) + (ch3 << 16) + (ch2 << 8) + (ch1 << 0);
+    }
+
+    /**
+     * Get a BE int value from an InputStream
+     *
+     * @param stream the InputStream from which the int is to be read
+     * @return the int (32-bit) value
+     * @throws IOException             will be propagated back to the caller
+     * @throws BufferUnderrunException if the stream cannot provide enough bytes
+     */
+    public static int readIntBE(InputStream stream) throws IOException, BufferUnderrunException {
+        int ch1 = stream.read();
+        int ch2 = stream.read();
+        int ch3 = stream.read();
+        int ch4 = stream.read();
+        if ((ch1 | ch2 | ch3 | ch4) < 0) {
+            throw new BufferUnderrunException();
+        }
+        return (ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0);
+    }
+
+    /**
+     * Get a LE long value from an InputStream
+     *
+     * @param stream the InputStream from which the long is to be read
+     * @return the long (64-bit) value
+     * @throws IOException             will be propagated back to the caller
+     * @throws BufferUnderrunException if the stream cannot provide enough bytes
+     */
+    public static long readLongLE(InputStream stream) throws IOException, BufferUnderrunException {
+        int ch1 = stream.read();
+        int ch2 = stream.read();
+        int ch3 = stream.read();
+        int ch4 = stream.read();
+        int ch5 = stream.read();
+        int ch6 = stream.read();
+        int ch7 = stream.read();
+        int ch8 = stream.read();
+        if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
+            throw new BufferUnderrunException();
+        }
+
+        return
+                ((long) ch8 << 56) +
+                        ((long) ch7 << 48) +
+                        ((long) ch6 << 40) +
+                        ((long) ch5 << 32) +
+                        ((long) ch4 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
+                        (ch3 << 16) +
+                        (ch2 << 8) +
+                        (ch1 << 0);
+    }
+
+    /**
+     * Get a BE long value from an InputStream
+     *
+     * @param stream the InputStream from which the long is to be read
+     * @return the long (64-bit) value
+     * @throws IOException             will be propagated back to the caller
+     * @throws BufferUnderrunException if the stream cannot provide enough bytes
+     */
+    public static long readLongBE(InputStream stream) throws IOException, BufferUnderrunException {
+        int ch1 = stream.read();
+        int ch2 = stream.read();
+        int ch3 = stream.read();
+        int ch4 = stream.read();
+        int ch5 = stream.read();
+        int ch6 = stream.read();
+        int ch7 = stream.read();
+        int ch8 = stream.read();
+        if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
+            throw new BufferUnderrunException();
+        }
+
+        return
+                ((long) ch1 << 56) +
+                        ((long) ch2 << 48) +
+                        ((long) ch3 << 40) +
+                        ((long) ch4 << 32) +
+                        ((long) ch5 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
+                        (ch6 << 16) +
+                        (ch7 << 8) +
+                        (ch8 << 0);
+    }
+
+    /**
+     * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian
+     * but with the high bit on each number indicating if it continues or not
+     */
+    public static long readUE7(InputStream stream) throws IOException {
+        int i;
+        long v = 0;
+        while ((i = stream.read()) >= 0) {
+            v = v << 7;
+            if ((i & 128) == 128) {
+                // Continues
+                v += (i & 127);
+            } else {
+                // Last value
+                v += i;
+                break;
+            }
+        }
+        return v;
+    }
+
+
+    /**
+     * Get a LE short value from the beginning of a byte array
+     *
+     * @param data the byte array
+     * @return the short (16-bit) value
+     */
+    public static short getShortLE(byte[] data) {
+        return getShortLE(data, 0);
+    }
+
+    /**
+     * Get a LE short value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the short (16-bit) value
+     */
+    public static short getShortLE(byte[] data, int offset) {
+        return (short) getUShortLE(data, offset);
+    }
+
+    /**
+     * Get a LE unsigned short value from the beginning of a byte array
+     *
+     * @param data the byte array
+     * @return the unsigned short (16-bit) value in an int
+     */
+    public static int getUShortLE(byte[] data) {
+        return getUShortLE(data, 0);
+    }
+
+    /**
+     * Get a LE unsigned short value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the unsigned short (16-bit) value in an integer
+     */
+    public static int getUShortLE(byte[] data, int offset) {
+        int b0 = data[offset] & 0xFF;
+        int b1 = data[offset + 1] & 0xFF;
+        return (b1 << 8) + (b0 << 0);
+    }
+
+    /**
+     * Get a BE short value from the beginning of a byte array
+     *
+     * @param data the byte array
+     * @return the short (16-bit) value
+     */
+    public static short getShortBE(byte[] data) {
+        return getShortBE(data, 0);
+    }
+
+    /**
+     * Get a BE short value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the short (16-bit) value
+     */
+    public static short getShortBE(byte[] data, int offset) {
+        return (short) getUShortBE(data, offset);
+    }
+
+    /**
+     * Get a BE unsigned short value from the beginning of a byte array
+     *
+     * @param data the byte array
+     * @return the unsigned short (16-bit) value in an int
+     */
+    public static int getUShortBE(byte[] data) {
+        return getUShortBE(data, 0);
+    }
+
+    /**
+     * Get a BE unsigned short value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the unsigned short (16-bit) value in an integer
+     */
+    public static int getUShortBE(byte[] data, int offset) {
+        int b0 = data[offset] & 0xFF;
+        int b1 = data[offset + 1] & 0xFF;
+        return (b0 << 8) + (b1 << 0);
+    }
+
+    /**
+     * Get a LE int value from the beginning of a byte array
+     *
+     * @param data the byte array
+     * @return the int (32-bit) value
+     */
+    public static int getIntLE(byte[] data) {
+        return getIntLE(data, 0);
+    }
+
+    /**
+     * Get a LE int value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the int (32-bit) value
+     */
+    public static int getIntLE(byte[] data, int offset) {
+        int i = offset;
+        int b0 = data[i++] & 0xFF;
+        int b1 = data[i++] & 0xFF;
+        int b2 = data[i++] & 0xFF;
+        int b3 = data[i++] & 0xFF;
+        return (b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0);
+    }
+
+    /**
+     * Get a BE int value from the beginning of a byte array
+     *
+     * @param data the byte array
+     * @return the int (32-bit) value
+     */
+    public static int getIntBE(byte[] data) {
+        return getIntBE(data, 0);
+    }
+
+    /**
+     * Get a BE int value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the int (32-bit) value
+     */
+    public static int getIntBE(byte[] data, int offset) {
+        int i = offset;
+        int b0 = data[i++] & 0xFF;
+        int b1 = data[i++] & 0xFF;
+        int b2 = data[i++] & 0xFF;
+        int b3 = data[i++] & 0xFF;
+        return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+    }
+
+    /**
+     * Get a LE unsigned int value from a byte array
+     *
+     * @param data the byte array
+     * @return the unsigned int (32-bit) value in a long
+     */
+    public static long getUIntLE(byte[] data) {
+        return getUIntLE(data, 0);
+    }
+
+    /**
+     * Get a LE unsigned int value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the unsigned int (32-bit) value in a long
+     */
+    public static long getUIntLE(byte[] data, int offset) {
+        long retNum = getIntLE(data, offset);
+        return retNum & 0x00FFFFFFFFl;
+    }
+
+    /**
+     * Get a BE unsigned int value from a byte array
+     *
+     * @param data the byte array
+     * @return the unsigned int (32-bit) value in a long
+     */
+    public static long getUIntBE(byte[] data) {
+        return getUIntBE(data, 0);
+    }
+
+    /**
+     * Get a BE unsigned int value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the unsigned int (32-bit) value in a long
+     */
+    public static long getUIntBE(byte[] data, int offset) {
+        long retNum = getIntBE(data, offset);
+        return retNum & 0x00FFFFFFFFl;
+    }
+
+    /**
+     * Get a LE long value from a byte array
+     *
+     * @param data   the byte array
+     * @param offset a starting offset into the byte array
+     * @return the long (64-bit) value
+     */
+    public static long getLongLE(byte[] data, int offset) {
+        long result = 0;
+
+        for (int j = offset + LONG_SIZE - 1; j >= offset; j--) {
+            result <<= 8;
+            result |= 0xff & data[j];
+        }
+        return result;
+    }
+
+    private static final int LONG_SIZE = 8;
+
+
+    /**
+     * Convert an 'unsigned' byte to an integer, i.e. don't carry across the
+     * sign.
+     *
+     * @param b the byte to interpret as unsigned
+     * @return the value of the byte as an int in the range 0 to 255
+     */
+    public static int ubyteToInt(byte b) {
+        return b & 0xFF;
+    }
+
+    /**
+     * Get the unsigned value of a byte.
+     *
+     * @param data   the byte array.
+     * @param offset a starting offset into the byte array.
+     * @return the unsigned value of the byte as a 16 bit short
+     */
+    public static short getUByte(byte[] data, int offset) {
+        return (short) (data[offset] & 0xFF);
+    }
+
+
+    public static class BufferUnderrunException extends TikaException {
+        private static final long serialVersionUID = 8358288231138076276L;
+
+        public BufferUnderrunException() {
+            super("Insufficient data left in stream for required read");
+        }
+    }
 }

[4/5] tika git commit: rm inconsistently capitalized test files

Posted by ta...@apache.org.

rm inconsistently capitalized test files


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/933af20e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/933af20e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/933af20e

Branch: refs/heads/2.x
Commit: 933af20e84cf088d80348b49eef7cbe4732a9eb3
Parents: e62f230
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:37:05 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:37:05 2016 -0400

----------------------------------------------------------------------
 .../test-documents/testExcel_embeddedPDF.xls      | Bin 38400 -> 0 bytes
 .../test-documents/testExcel_embeddedPDF.xlsx     | Bin 25602 -> 0 bytes
 .../test-documents/testPPT_EmbeddedPDF.ppt        | Bin 187392 -> 0 bytes
 .../test-documents/testPPT_EmbeddedPDF.pptx       | Bin 108637 -> 0 bytes
 4 files changed, 0 insertions(+), 0 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/933af20e/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
deleted file mode 100644
index c38f64c..0000000
Binary files a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls and /dev/null differ

http://git-wip-us.apache.org/repos/asf/tika/blob/933af20e/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
deleted file mode 100644
index 9c0d2b9..0000000
Binary files a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx and /dev/null differ

http://git-wip-us.apache.org/repos/asf/tika/blob/933af20e/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
deleted file mode 100644
index 3129be1..0000000
Binary files a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt and /dev/null differ

http://git-wip-us.apache.org/repos/asf/tika/blob/933af20e/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
deleted file mode 100644
index a96aa3c..0000000
Binary files a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx and /dev/null differ

[5/5] tika git commit: TIKA-2026 -- improve extraction of attachments for PPT, PPTX, XLSX

Posted by ta...@apache.org.

TIKA-2026 -- improve extraction of attachments for PPT, PPTX, XLSX


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dd3c2a48
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dd3c2a48
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dd3c2a48

Branch: refs/heads/2.x
Commit: dd3c2a486a41903d5ebeb4bf341be29e02af8499
Parents: 933af20
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:54:40 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:54:40 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   5 +++-
 .../microsoft/AbstractPOIFSExtractor.java       |  19 ++++++++++----
 .../tika/parser/microsoft/HSLFExtractor.java    |  18 ++++++++++---
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |   3 +--
 .../tika/parser/microsoft/ExcelParserTest.java  |  13 +++++++---
 .../parser/microsoft/PowerPointParserTest.java  |  14 ++++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  26 ++++++++++++++++---
 .../test-documents/testEXCEL_embeddedPDF.xls    | Bin 0 -> 38400 bytes
 .../test-documents/testEXCEL_embeddedPDF.xlsx   | Bin 0 -> 25602 bytes
 .../test-documents/testPPT_embeddedPDF.ppt      | Bin 0 -> 187392 bytes
 .../test-documents/testPPT_embeddedPDF.pptx     | Bin 0 -> 108637 bytes
 11 files changed, 78 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 766780f..64e1f53 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,7 +17,10 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
-  * Add parser for applefile (AppleSingle) (TIKA-2022)
+  * Improve extraction of embedded documents for PPT, PPTX and XLSX
+    (TIKA-2026).
+
+  * Add parser for applefile (AppleSingle) (TIKA-2022).
 
   * Add mime types, mime magic and/or globs for:
      * Endnote Import File (TIKA-2011)

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 1225288..739af69 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -152,6 +152,15 @@ abstract class AbstractPOIFSExtractor {
     protected void handleEmbeddedOfficeDoc(
             DirectoryEntry dir, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
+        handleEmbeddedOfficeDoc(dir, null, xhtml);
+    }
+
+    /**
+     * Handle an office document that's embedded at the POIFS level
+     */
+    protected void handleEmbeddedOfficeDoc(
+            DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
 
         // Is it an embedded OLE2 document, or an embedded OOXML document?
 
@@ -177,21 +186,21 @@ abstract class AbstractPOIFSExtractor {
         }
         POIFSDocumentType type = POIFSDocumentType.detectType(dir);
         TikaInputStream embedded = null;
-
+        String rName = (resourceName == null) ? dir.getName() : resourceName;
         try {
             if (type == POIFSDocumentType.OLE10_NATIVE) {
                 try {
                     // Try to un-wrap the OLE10Native record:
                     Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                     if (ole.getLabel() != null) {
-                        metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+                        metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
                     }
                     byte[] data = ole.getDataBuffer();
                     embedded = TikaInputStream.get(data);
                 } catch (Ole10NativeException ex) {
                     // Not a valid OLE10Native record, skip it
                 } catch (Exception e) {
-                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + rName, e);
                 }
             } else if (type == POIFSDocumentType.COMP_OBJ) {
                 try {
@@ -219,13 +228,13 @@ abstract class AbstractPOIFSExtractor {
 
                     // Record what we can do about it
                     metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
-                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
                 } catch (Exception e) {
                     throw new TikaException("Invalid embedded resource", e);
                 }
             } else {
                 metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
-                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+                metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
             }
 
             // Should we parse it?

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 656fdbb..1b34f03 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.HashSet;
 import java.util.List;
 
+import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.common.usermodel.Hyperlink;
 import org.apache.poi.hslf.model.Comment;
 import org.apache.poi.hslf.model.HeadersFooters;
@@ -40,6 +41,8 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -369,10 +372,19 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
                         String mediaType = null;
                         if ("Excel.Chart.8".equals(oleShape.getProgID())) {
                             mediaType = "application/vnd.ms-excel";
+                        } else {
+                            MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
+                            mediaType = mt.toString();
+                        }
+                        if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
+                            try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
+                                handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
+                            }
+                        } else {
+                            handleEmbeddedResource(
+                                    stream, objID, objID,
+                                    mediaType, xhtml, false);
                         }
-                        handleEmbeddedResource(
-                                stream, objID, objID,
-                                mediaType, xhtml, false);
                     }
                 }
             }

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 84e9752..cd1919d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -229,8 +229,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
 
             if (root.hasEntry("CONTENTS")
                     && root.hasEntry("\u0001Ole")
-                    && root.hasEntry("\u0001CompObj")
-                    && root.hasEntry("\u0003ObjInfo")) {
+                    && root.hasEntry("\u0001CompObj")) {
                 // TIKA-704: OLE 2.0 embedded non-Office document?
                 //TODO: original file paths can be stored underneath root
                 //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 3e98aa9..196ffa9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -16,13 +16,14 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-import java.io.InputStream;
-import java.util.Locale;
-
 import org.apache.tika.TikaTest;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
@@ -402,4 +403,10 @@ public class ExcelParserTest extends TikaTest {
         //link on textbox
 //        assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
     }
+
+    @Test
+    public void testEmbeddedPDF() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index e0eee56..32d462e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -16,11 +16,12 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import static org.junit.Assert.assertEquals;
-
 import java.io.InputStream;
+import java.util.List;
 import java.util.Locale;
 
+import static org.junit.Assert.assertEquals;
+
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -238,4 +239,13 @@ public class PowerPointParserTest extends TikaTest {
         XMLResult r = getXML("testPPT_comment.ppt");
         assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
     }
+
+    @Test
+    public void testEmbeddedPDF() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
+        assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+        assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b442d07..5159ade 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,10 +16,6 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
@@ -29,9 +25,14 @@ import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
 import org.apache.tika.TikaTest;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.io.TikaInputStream;
@@ -1209,6 +1210,23 @@ public class OOXMLParserTest extends TikaTest {
         //link on textbox
         assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
     }
+
+    @Test
+    public void testEmbeddedPDFInPPTX() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.pptx");
+        Metadata pdfMetadata1 = metadataList.get(2);
+        assertEquals("application/pdf", pdfMetadata1.get(Metadata.CONTENT_TYPE));
+        Metadata pdfMetadata2 = metadataList.get(4);
+        assertEquals("application/pdf", pdfMetadata2.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testEmbeddedPDFInXLSX() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+        Metadata pdfMetadata = metadataList.get(2);
+        assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE));
+    }
+
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls differ

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx differ

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt differ

http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx differ

[3/5] tika git commit: TIKA-2024 extract original file name/path where possible, take 1

Posted by ta...@apache.org.

TIKA-2024 extract original file name/path where possible, take 1


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e62f2305
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e62f2305
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e62f2305

Branch: refs/heads/2.x
Commit: e62f2305783763aad0a2c587f96b162ae4be1c36
Parents: c84855f
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:35:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:35:27 2016 -0400

----------------------------------------------------------------------
 .../tika/metadata/TikaCoreProperties.java       |   7 ++
 .../parser/apple/AppleSingleFileParser.java     |   4 +-
 .../parser/microsoft/JackcessExtractor.java     |   4 +-
 .../tika/parser/microsoft/OfficeParser.java     |   2 +-
 .../tika/parser/microsoft/WordExtractor.java    |  22 +++-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |   2 +
 .../microsoft/xml/AbstractXML2003Parser.java    |   6 +-
 .../tika/parser/microsoft/xml/WordMLParser.java |  53 +++++++-
 .../tika/parser/rtf/RTFObjDataParser.java       |   2 +
 .../parser/apple/AppleSingleFileParserTest.java |   3 +
 .../tika/parser/microsoft/WordParserTest.java   |  16 ++-
 .../parser/microsoft/xml/XML2003ParserTest.java |   7 +-
 .../apache/tika/parser/rtf/RTFParserTest.java   | 124 +++++++------------
 .../tika/parser/pdf/AbstractPDF2XHTML.java      |  22 ++--
 .../test-documents/testAppleSingleFile.pdf      | Bin 54926 -> 1893 bytes
 .../test-documents/testExcel_embeddedPDF.xls    | Bin 0 -> 38400 bytes
 .../test-documents/testExcel_embeddedPDF.xlsx   | Bin 0 -> 25602 bytes
 .../test-documents/testPPT_EmbeddedPDF.ppt      | Bin 0 -> 187392 bytes
 .../test-documents/testPPT_EmbeddedPDF.pptx     | Bin 0 -> 108637 bytes
 19 files changed, 170 insertions(+), 104 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 5052fbc..f4b97dd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -76,6 +76,13 @@ public interface TikaCoreProperties {
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX+"warn");
 
     /**
+     * Some file formats can store information about their original
+     * file name/location or about their attachment's original file name/location.
+     */
+    public static final Property ORIGINAL_RESOURCE_NAME =
+            Property.internalTextBag(TIKA_META_PREFIX+"origResourceName");
+
+    /**
      * This is currently used to identify Content-Type that may be
      * included within a document, such as in html documents
      * (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">)

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index 3f548ca..0f3c044 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -32,6 +32,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.EndianUtils;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
@@ -133,8 +134,7 @@ public class AppleSingleFileParser extends AbstractParser {
                 IOUtils.readFully(stream, buffer);
                 bytesRead += f.length;
                 String originalFileName = new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII);
-                //TODO: figure out correct metadata key
-                //embeddedMetadata.set(TikaCoreProperties.IDENTIFIER, originalFileName);
+                embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalFileName);
             } else if (f.entryId != DATA_FORK) {
                 IOUtils.skipFully(stream, f.length);
                 bytesRead += f.length;

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 345dd24..fb8a2c2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -301,7 +301,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
                 break;
             case SIMPLE_PACKAGE:
                 OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
-
+                //TODO: find test file that has this kind of attachment
+                //and see if getFilePath or getLocalFilePath is meaningful
+                //for TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                 handleEmbeddedResource(
                         TikaInputStream.get(spc.getStream()),
                         spc.getFileName(),//filename

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b8deb99..f5f9f3e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -143,7 +143,7 @@ public class OfficeParser extends AbstractParser {
                 xhtml.element("p", publisherTextExtractor.getText());
                 break;
             case WORDDOCUMENT:
-                new WordExtractor(context).parse(root, xhtml);
+                new WordExtractor(context, metadata).parse(root, xhtml);
                 break;
             case POWERPOINT:
                 new HSLFExtractor(context).parse(root, xhtml);

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 4c950fa..8d36115 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -34,6 +34,8 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.model.FieldsDocumentPart;
 import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.SavedByEntry;
+import org.apache.poi.hwpf.model.SavedByTable;
 import org.apache.poi.hwpf.model.StyleDescription;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Field;
@@ -50,6 +52,8 @@ import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -79,8 +83,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
     private boolean curBold;
     private boolean curItalic;
 
-    public WordExtractor(ParseContext context) {
+    private final Metadata metadata;
+
+    public WordExtractor(ParseContext context, Metadata metadata) {
         super(context);
+        this.metadata = metadata;
     }
 
     private static int countParagraphs(Range... ranges) {
@@ -146,6 +153,9 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             parseWord6(root, xhtml);
             return;
         }
+
+        extractSavedByMetadata(document);
+
         org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
                 new org.apache.poi.hwpf.extractor.WordExtractor(document);
         HeaderStories headerFooter = new HeaderStories(document);
@@ -212,6 +222,16 @@ public class WordExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    private void extractSavedByMetadata(HWPFDocument document) {
+        SavedByTable savedByTable = document.getSavedByTable();
+        if (savedByTable == null) {
+            return;
+        }
+        for (SavedByEntry sbe : savedByTable.getEntries()) {
+            metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
+        }
+    }
+
     private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document,
                                     PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
             throws SAXException, IOException, TikaException {

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 17e629f..84e9752 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -232,6 +232,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
                     && root.hasEntry("\u0001CompObj")
                     && root.hasEntry("\u0003ObjInfo")) {
                 // TIKA-704: OLE 2.0 embedded non-Office document?
+                //TODO: original file paths can be stored underneath root
+                //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                 stream = TikaInputStream.get(
                         fs.createDocumentInputStream("CONTENTS"));
                 if (embeddedExtractor.shouldParseEmbedded(metadata)) {

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index a12f25e..4630219 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.parser.microsoft.xml;
 
+import java.io.IOException;
+import java.io.InputStream;
+
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -37,14 +40,13 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
-import java.io.IOException;
-import java.io.InputStream;
 
 public abstract class AbstractXML2003Parser extends AbstractParser {
 
     final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
     final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
     final static String MS_SPREADSHEET_URN = "urn:schemas-microsoft-com:office:spreadsheet";
+    final static String MS_VML_URN = "urn:schemas-microsoft-com:vml";
     final static String WORD_ML_URL = "http://schemas.microsoft.com/office/word/2003/wordml";
     final static Attributes EMPTY_ATTRS = new AttributesImpl();
 

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 28b33e4..67d13a9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -31,6 +31,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.TeeContentHandler;
@@ -186,10 +187,12 @@ public class WordMLParser extends AbstractXML2003Parser {
     private class PictHandler extends DefaultHandler {
         final StringBuilder buffer = new StringBuilder();
         final ContentHandler handler;
+        byte[] rawBytes = null;
         EmbeddedDocumentExtractor embeddedDocumentExtractor;
         boolean inPict = false;
         boolean inBin = false;
         String pictName = null;
+        String pictSource = null;
         final Base64 base64 = new Base64();
 
         public PictHandler(ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) {
@@ -210,6 +213,24 @@ public class WordMLParser extends AbstractXML2003Parser {
                         pictName = pictName.replaceFirst("wordml://", "");
                     }
                 }
+            } else if (MS_VML_URN.equals(uri)) {
+                if (localName.equals("imagedata")) {
+                    //src is an internal designator with an extension
+                    String src = attrs.getValue("", "src");
+                    //title appears to be the original file name
+                    String title = attrs.getValue(MS_OFFICE_PROPERTIES_URN, "title");
+                    if (title != null && ! title.equals("")) {
+                        if (src != null) {
+                            //take the extension from the src and append it to the title
+                            int i = src.lastIndexOf(".");
+                            if (i > -1 && i +1 < src.length()) {
+                                String ext = src.substring(i);
+                                title += ext;
+                            }
+                        }
+                        pictSource = title;
+                    }
+                }
             }
         }
 
@@ -227,6 +248,13 @@ public class WordMLParser extends AbstractXML2003Parser {
             if (!WORD_ML_URL.equals(uri)) {
                 return;
             }
+            //somewhat tricky...
+            //can't just dump bin_data at the end of the
+            //bin_data element because there may be metadata
+            //after it, if it is within a pict element
+            //<pict><binData></binData><imagedata/></pict>.
+            //However, if you aren't in a pict (say docOLEdata), then do dump binary
+            //data at the end of the bin data.
             if (PICT.equals(localName)) {
                 inPict = false;
                 AttributesImpl attrs = new AttributesImpl();
@@ -238,17 +266,29 @@ public class WordMLParser extends AbstractXML2003Parser {
                         IMG, IMG, attrs);
                 handler.endElement(
                         XHTMLContentHandler.XHTML, IMG, IMG);
+                handleEmbedded();
             } else if (BIN_DATA.equals(localName)) {
                 inBin = false;
-                byte[] bytes = base64.decode(buffer.toString());
-                if (bytes == null) {
-                    return;
+                rawBytes = base64.decode(buffer.toString());
+                //reset
+                buffer.setLength(0);
+
+                if (! inPict) {
+                    handleEmbedded();
                 }
-                try (TikaInputStream is = TikaInputStream.get(bytes)) {
+            }
+        }
+
+        private void handleEmbedded() throws SAXException {
+            if (rawBytes != null) {
+                try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
                     Metadata metadata = new Metadata();
                     if (pictName != null) {
                         metadata.set(Metadata.RESOURCE_NAME_KEY, pictName);
                     }
+                    if (pictSource != null) {
+                        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
+                    }
                     if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
                         embeddedDocumentExtractor.parseEmbedded(is,
                                 handler, metadata, false);
@@ -256,8 +296,11 @@ public class WordMLParser extends AbstractXML2003Parser {
                 } catch (IOException e) {
                     //log
                 }
-                buffer.setLength(0);
             }
+            //reset
+            pictName = null;
+            pictSource = null;
+            rawBytes = null;
         }
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index 147d2e8..6426687 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -40,6 +40,7 @@ import org.apache.tika.io.EndianUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
 
 /**
@@ -242,6 +243,7 @@ class RTFObjDataParser {
             fileNameToUse = displayName == null ? "" : displayName;
             pathToUse = ansiFilePath == null ? "" : ansiFilePath;
         }
+        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileNameToUse);
         metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
         metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);
 

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
index c80c94a..bd8156d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
@@ -25,6 +25,7 @@ import java.util.List;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.junit.Test;
 
 public class AppleSingleFileParserTest extends TikaTest {
@@ -36,5 +37,7 @@ public class AppleSingleFileParserTest extends TikaTest {
         assertContains(AppleSingleFileParser.class.getName(),
                 Arrays.asList(list.get(0).getValues("X-Parsed-By")));
         assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("fltsyllabussortie2rev1_2.pdf", list.get(1).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 138120e..9d9d372 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -16,13 +16,15 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-import java.io.InputStream;
-import java.util.Locale;
-
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
@@ -492,5 +494,13 @@ public class WordParserTest extends TikaTest {
         assertEquals("manager1", managers[0]);
         assertEquals("manager2", managers[1]);
     }
+
+    @Test
+    public void testOrigLocation() throws Exception {
+        Metadata metadata = getXML("testException2.doc").metadata;
+        List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+        assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
+        assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index 04530ce..510cd32 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -17,7 +17,6 @@
 package org.apache.tika.parser.microsoft.xml;
 
 import org.apache.tika.TikaTest;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -26,11 +25,11 @@ import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.junit.Test;
 
+import static org.junit.Assert.assertEquals;
+
 import java.util.Arrays;
 import java.util.List;
 
-import static org.junit.Assert.assertEquals;
-
 public class XML2003ParserTest extends TikaTest {
 
     @Test
@@ -80,6 +79,8 @@ public class XML2003ParserTest extends TikaTest {
         assertContains("R1 c1 R1 c2", txt);
         assertNotContained("footnoteFigure", txt);
         assertContains("footnote Figure", txt);
+
+        assertEquals("testJPEG_EXIF.jpg", list.get(7).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
         assertEquals("image/jpeg", list.get(7).get(Metadata.CONTENT_TYPE));
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index dc75be5..d80842b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -18,13 +18,13 @@ package org.apache.tika.parser.rtf;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.InputStream;
-import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.io.FilenameUtils;
@@ -381,83 +381,45 @@ public class RTFParserTest extends TikaTest {
     // TIKA-1010
     @Test
     public void testEmbeddedMonster() throws Exception {
-        Set<MediaType> skipTypes = new HashSet<MediaType>();
-        skipTypes.add(MediaType.parse("application/x-emf"));
-        skipTypes.add(MediaType.parse("application/x-msmetafile"));
-
-
-        List<String> trueNames = new ArrayList<String>();
-        trueNames.add("file_0.doc");
-        trueNames.add("Hw.txt");
-        trueNames.add("file_1.xlsx");
-        trueNames.add("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip");
-        trueNames.add("html-within-zip.zip");
-        trueNames.add("text.html");
-        trueNames.add("testHTML_utf8_\u666E\u6797\u65AF\u987F.html");
-        trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-        trueNames.add("file_2.xls");
-        trueNames.add("testMSG_\u666E\u6797\u65AF\u987F.msg");
-        trueNames.add("file_3.pdf");
-        trueNames.add("file_4.ppt");
-        trueNames.add("file_5.pptx");
-        trueNames.add("thumbnail.jpeg");
-        trueNames.add("file_6.doc");
-        trueNames.add("file_7.doc");
-        trueNames.add("file_8.docx");
-        trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-
-        List<String> trueTypes = new ArrayList<String>();
-        trueTypes.add("application/msword");
-        trueTypes.add("text/plain");
-        trueTypes.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-        trueTypes.add("application/zip");
-        trueTypes.add("application/zip");
-        trueTypes.add("text/html");
-        trueTypes.add("text/html");
-        trueTypes.add("image/jpeg");
-        trueTypes.add("application/vnd.ms-excel");
-        trueTypes.add("application/vnd.ms-outlook");
-        trueTypes.add("application/pdf");
-        trueTypes.add("application/vnd.ms-powerpoint");
-        trueTypes.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
-        trueTypes.add("image/jpeg");
-        trueTypes.add("application/msword");
-        trueTypes.add("application/msword");
-        trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
-        trueTypes.add("image/jpeg");
-
-        TrackingHandler tracker = new TrackingHandler(skipTypes);
-        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
-            ContainerExtractor ex = new ParserContainerExtractor();
-            assertEquals(true, ex.isSupported(tis));
-            ex.extract(tis, ex, tracker);
-        }
 
-        assertEquals(trueNames.size(), tracker.filenames.size());
-        assertEquals(trueTypes.size(), tracker.mediaTypes.size());
-        for (int i = 0; i < tracker.filenames.size(); i++) {
-            String expectedName = trueNames.get(i);
-            if (expectedName == null) {
-                assertNull(tracker.filenames.get(i));
-            } else {
-                assertNotNull(tracker.filenames.get(i));
-                //necessary to getName() because MSOffice extractor includes
-                //directory: _1457338524/HW.txt
-                assertEquals("filename equals ",
-                        expectedName, FilenameUtils.getName(tracker.filenames.get(i)));
-            }
-            assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString());
-        }
-
-        tracker = new TrackingHandler();
-        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
-            ContainerExtractor ex = new ParserContainerExtractor();
-            assertEquals(true, ex.isSupported(tis));
-            ex.extract(tis, ex, tracker);
+        Map<Integer, Pair> expected = new HashMap<>();
+        expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+        expected.put(3, new Pair("file_0.doc", "application/msword"));
+        expected.put(6, new Pair("file_1.xlsx",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+        expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
+        expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
+        expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+        expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
+        expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+        expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
+        expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
+        expected.put(26, new Pair("file_3.pdf", "application/pdf"));
+        expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
+        expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+        expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
+        expected.put(36, new Pair("file_6.doc", "application/msword"));
+        expected.put(39, new Pair("file_7.doc", "application/msword"));
+        expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+        expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+
+
+        List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
+        assertEquals(48, metadataList.size());
+        for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
+            Metadata metadata = metadataList.get(e.getKey());
+            Pair p = e.getValue();
+            assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
+            //necessary to getName() because MSOffice extractor includes
+            //directory: _1457338524/HW.txt
+            assertEquals("filename equals ",
+                    p.fileName, FilenameUtils.getName(
+                            metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
+
+            assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
         }
-        assertEquals(47, tracker.filenames.size());
-        assertEquals("thumbnail_26.emf", tracker.filenames.get(45));
-        assertEquals("thumbnail_27.wmf", tracker.filenames.get(46));
+        assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
+                metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
     }
 
     //TIKA-1010 test regular (not "embedded") images/picts
@@ -537,4 +499,12 @@ public class RTFParserTest extends TikaTest {
         assertEquals(2, tracker.filenames.size());
     }
 
+    private static class Pair {
+        final String fileName;
+        final String mimeType;
+        Pair(String fileName, String mimeType) {
+            this.fileName = fileName;
+            this.mimeType = mimeType;
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 9a73bde..832b06e 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -33,6 +33,8 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.TreeMap;
 
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
 import javax.xml.stream.XMLStreamException;
 import org.apache.commons.io.IOExceptionWithCause;
 import org.apache.commons.io.IOUtils;
@@ -176,7 +178,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
-    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+    private void extractMultiOSPDEmbeddedFiles(String displayName,
                                        PDComplexFileSpecification spec,
                                        EmbeddedDocumentExtractor extractor) throws IOException,
             SAXException, TikaException {
@@ -185,13 +187,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             return;
         }
         //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
     }
 
-    private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+    private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
+                                       String fileName, PDEmbeddedFile file,
                                        EmbeddedDocumentExtractor extractor)
             throws SAXException, IOException, TikaException {
 
@@ -199,8 +202,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             //skip silently
             return;
         }
-
-        fileName = (fileName == null) ? defaultName : fileName;
+        
+        fileName = (fileName == null) ? displayName : fileName;
 
         // TODO: other metadata?
         Metadata metadata = new Metadata();
@@ -209,6 +212,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
 
         if (extractor.shouldParseEmbedded(metadata)) {
             TikaInputStream stream = null;
@@ -289,7 +293,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                     PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                     try {
-                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+                        extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
                     } catch (SAXException e) {
                         throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                     } catch (TikaException e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf b/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf
index a385313..a407ded 100644
Binary files a/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf and b/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx differ

[2/5] tika git commit: TIKA-2022 - clean up -- make entries private, move more into EndianUtils

Posted by ta...@apache.org.

TIKA-2022 - clean up -- make entries private, move more into EndianUtils


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c84855f6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c84855f6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c84855f6

Branch: refs/heads/2.x
Commit: c84855f6757c714a9fdcec55ca14b628a107642e
Parents: 865c45c
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:13:01 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:13:01 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/io/EndianUtils.java    | 19 +++++++++++
 .../org/apache/tika/io/EndianUtilsTest.java     | 16 +++++++++
 .../parser/apple/AppleSingleFileParser.java     | 35 ++++++++++----------
 3 files changed, 52 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/c84855f6/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 2ab85b3..05da5e0 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -93,6 +93,25 @@ public class EndianUtils {
     }
 
     /**
+     * Get a BE unsigned int value from an InputStream
+     *
+     * @param stream the InputStream from which the int is to be read
+     * @return the unsigned int (32-bit) value, returned as a long
+     * @throws IOException             will be propagated back to the caller
+     * @throws BufferUnderrunException if the stream cannot provide enough bytes
+     */
+    public static long readUIntBE(InputStream stream) throws IOException, BufferUnderrunException {
+        int ch1 = stream.read();
+        int ch2 = stream.read();
+        int ch3 = stream.read();
+        int ch4 = stream.read();
+        if ((ch1 | ch2 | ch3 | ch4) < 0) {
+            throw new BufferUnderrunException();
+        }
+        return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)) & 0x00FFFFFFFFl;
+    }
+
+    /**
      * Get a LE int value from an InputStream
      *
      * @param stream the InputStream from which the int is to be read

http://git-wip-us.apache.org/repos/asf/tika/blob/c84855f6/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
index 8f481c3..50084d2 100644
--- a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
@@ -54,6 +54,22 @@ public class EndianUtilsTest {
         } catch (EndianUtils.BufferUnderrunException e) {
 
         }
+    }
+
+    @Test
+    public void testReadUIntBE() throws Exception {
+        byte[] data = new byte[] {(byte)0x00, (byte)0x00, (byte)0x00, (byte)0x08 };
+        assertEquals((long) 8, EndianUtils.readUIntBE(new ByteArrayInputStream(data)));
 
+        data = new byte[] {(byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xF0 };
+        assertEquals(4294967280L, EndianUtils.readUIntBE(new ByteArrayInputStream(data)));
+
+        data = new byte[] {(byte)0xFF, (byte)0xFF, (byte)0xFF  };
+        try {
+            EndianUtils.readUIntLE(new ByteArrayInputStream(data));
+            fail("Should have thrown exception");
+        } catch (EndianUtils.BufferUnderrunException e) {
+
+        }
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/c84855f6/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index ffb5759..3f548ca 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -50,21 +50,21 @@ public class AppleSingleFileParser extends AbstractParser {
     /**
      * Entry types
      */
-    public static final int DATA_FORK = 1;
-    public static final int RESOURCE_FORK = 2;
-    public static final int REAL_NAME = 3;
-    public static final int COMMENT = 4;
-    public static final int ICON_BW = 5;
-    public static final int ICON_COLOR = 6;
+    private static final int DATA_FORK = 1;
+    private static final int RESOURCE_FORK = 2;
+    private static final int REAL_NAME = 3;
+    private static final int COMMENT = 4;
+    private static final int ICON_BW = 5;
+    private static final int ICON_COLOR = 6;
     //7?!
-    public static final int FILE_DATES_INFO = 8;
-    public static final int FINDER_INFO = 9;
-    public static final int MACINTOSH_FILE_INFO = 10;
-    public static final int PRODOS_FILE_INFO = 11;
-    public static final int MSDOS_FILE_INFO = 12;
-    public static final int SHORT_NAME = 13;
-    public static final int AFP_FILE_INFO = 14;
-    public static final int DIRECTORY_ID = 15;
+    private static final int FILE_DATES_INFO = 8;
+    private static final int FINDER_INFO = 9;
+    private static final int MACINTOSH_FILE_INFO = 10;
+    private static final int PRODOS_FILE_INFO = 11;
+    private static final int MSDOS_FILE_INFO = 12;
+    private static final int SHORT_NAME = 13;
+    private static final int AFP_FILE_INFO = 14;
+    private static final int DIRECTORY_ID = 15;
 
     private static final Set<MediaType> SUPPORTED_TYPES =
             Collections.singleton(MediaType.application("applefile"));
@@ -94,7 +94,6 @@ public class AppleSingleFileParser extends AbstractParser {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
         if (contentFieldInfo != null) {
-            System.out.println(contentFieldInfo.offset + " "+bytesRead);
             long diff = contentFieldInfo.offset-bytesRead;
             IOUtils.skipFully(stream, diff);
             if (ex.shouldParseEmbedded(embeddedMetadata)) {
@@ -153,9 +152,9 @@ public class AppleSingleFileParser extends AbstractParser {
             //convert 32-bit unsigned ints to longs
             fieldInfoList.add(
                     new FieldInfo(
-                            EndianUtils.readIntBE(stream) & 0x00000000ffffffffL, //entry id
-                            EndianUtils.readIntBE(stream) & 0x00000000ffffffffL, //offset
-                            EndianUtils.readIntBE(stream) & 0x00000000ffffffffL  //length
+                            EndianUtils.readUIntBE(stream), //entry id
+                            EndianUtils.readUIntBE(stream), //offset
+                            EndianUtils.readUIntBE(stream) //length
                     )
             );
         }