You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 00:54:49 UTC
[1/5] tika git commit: fix indentation
Repository: tika
Updated Branches:
refs/heads/2.x 5bc597dc8 -> dd3c2a486
fix indentation
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/865c45cd
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/865c45cd
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/865c45cd
Branch: refs/heads/2.x
Commit: 865c45cd569f680899cd2ede32987b1bf3f8a86e
Parents: 5bc597d
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:11:07 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:11:07 2016 -0400
----------------------------------------------------------------------
.../java/org/apache/tika/io/EndianUtils.java | 831 ++++++++++---------
1 file changed, 421 insertions(+), 410 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/865c45cd/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 3416f55..2ab85b3 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -25,418 +25,429 @@ import org.apache.tika.exception.TikaException;
* General Endian Related Utilties.
* <p>
* This class provides static utility methods for input/output operations
- * on numbers in Big and Little Endian formats.
+ * on numbers in Big and Little Endian formats.
* <p>
* Origin of code: Based on the version in POI
*/
public class EndianUtils {
- /**
- * Get a LE short value from an InputStream
- *
- * @param stream the InputStream from which the short is to be read
- * @return the short (16-bit) value
- * @exception IOException will be propagated back to the caller
- * @exception BufferUnderrunException if the stream cannot provide enough bytes
- */
- public static short readShortLE(InputStream stream) throws IOException, BufferUnderrunException {
- return (short) readUShortLE(stream);
- }
- /**
- * Get a BE short value from an InputStream
- *
- * @param stream the InputStream from which the short is to be read
- * @return the short (16-bit) value
- * @exception IOException will be propagated back to the caller
- * @exception BufferUnderrunException if the stream cannot provide enough bytes
- */
- public static short readShortBE(InputStream stream) throws IOException, BufferUnderrunException {
- return (short) readUShortBE(stream);
- }
-
- public static int readUShortLE(InputStream stream) throws IOException, BufferUnderrunException {
- int ch1 = stream.read();
- int ch2 = stream.read();
- if ((ch1 | ch2) < 0) {
- throw new BufferUnderrunException();
- }
- return (ch2 << 8) + (ch1 << 0);
- }
- public static int readUShortBE(InputStream stream) throws IOException, BufferUnderrunException {
- int ch1 = stream.read();
- int ch2 = stream.read();
- if ((ch1 | ch2) < 0) {
- throw new BufferUnderrunException();
- }
- return (ch1 << 8) + (ch2 << 0);
- }
-
-
- /**
- * Get a LE unsigned int value from an InputStream
- *
- * @param stream the InputStream from which the int is to be read
- * @return the int (32-bit) value
- * @exception IOException will be propagated back to the caller
- * @exception BufferUnderrunException if the stream cannot provide enough bytes
- */
- public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException {
- int ch1 = stream.read();
- int ch2 = stream.read();
- int ch3 = stream.read();
- int ch4 = stream.read();
- if ((ch1 | ch2 | ch3 | ch4) < 0) {
- throw new BufferUnderrunException();
- }
- return ((ch4 << 24) + (ch3<<16) + (ch2 << 8) + (ch1 << 0)) & 0x00FFFFFFFFl;
- }
-
- /**
- * Get a LE int value from an InputStream
- *
- * @param stream the InputStream from which the int is to be read
- * @return the int (32-bit) value
- * @exception IOException will be propagated back to the caller
- * @exception BufferUnderrunException if the stream cannot provide enough bytes
- */
- public static int readIntLE(InputStream stream) throws IOException, BufferUnderrunException {
- int ch1 = stream.read();
- int ch2 = stream.read();
- int ch3 = stream.read();
- int ch4 = stream.read();
- if ((ch1 | ch2 | ch3 | ch4) < 0) {
- throw new BufferUnderrunException();
- }
- return (ch4 << 24) + (ch3<<16) + (ch2 << 8) + (ch1 << 0);
- }
- /**
- * Get a BE int value from an InputStream
- *
- * @param stream the InputStream from which the int is to be read
- * @return the int (32-bit) value
- * @exception IOException will be propagated back to the caller
- * @exception BufferUnderrunException if the stream cannot provide enough bytes
- */
- public static int readIntBE(InputStream stream) throws IOException, BufferUnderrunException {
- int ch1 = stream.read();
- int ch2 = stream.read();
- int ch3 = stream.read();
- int ch4 = stream.read();
- if ((ch1 | ch2 | ch3 | ch4) < 0) {
- throw new BufferUnderrunException();
- }
- return (ch1 << 24) + (ch2<<16) + (ch3 << 8) + (ch4 << 0);
- }
-
- /**
- * Get a LE long value from an InputStream
- *
- * @param stream the InputStream from which the long is to be read
- * @return the long (64-bit) value
- * @exception IOException will be propagated back to the caller
- * @exception BufferUnderrunException if the stream cannot provide enough bytes
- */
- public static long readLongLE(InputStream stream) throws IOException, BufferUnderrunException {
- int ch1 = stream.read();
- int ch2 = stream.read();
- int ch3 = stream.read();
- int ch4 = stream.read();
- int ch5 = stream.read();
- int ch6 = stream.read();
- int ch7 = stream.read();
- int ch8 = stream.read();
- if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
- throw new BufferUnderrunException();
- }
-
- return
- ((long)ch8 << 56) +
- ((long)ch7 << 48) +
- ((long)ch6 << 40) +
- ((long)ch5 << 32) +
- ((long)ch4 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
- (ch3 << 16) +
- (ch2 << 8) +
- (ch1 << 0);
- }
- /**
- * Get a NE long value from an InputStream
- *
- * @param stream the InputStream from which the long is to be read
- * @return the long (64-bit) value
- * @exception IOException will be propagated back to the caller
- * @exception BufferUnderrunException if the stream cannot provide enough bytes
- */
- public static long readLongBE(InputStream stream) throws IOException, BufferUnderrunException {
- int ch1 = stream.read();
- int ch2 = stream.read();
- int ch3 = stream.read();
- int ch4 = stream.read();
- int ch5 = stream.read();
- int ch6 = stream.read();
- int ch7 = stream.read();
- int ch8 = stream.read();
- if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
- throw new BufferUnderrunException();
- }
-
- return
- ((long)ch1 << 56) +
- ((long)ch2 << 48) +
- ((long)ch3 << 40) +
- ((long)ch4 << 32) +
- ((long)ch5 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
- (ch6 << 16) +
- (ch7 << 8) +
- (ch8 << 0);
- }
-
- /**
- * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian
- * but with the high bit on each number indicating if it continues or not
- */
- public static long readUE7(InputStream stream) throws IOException {
- int i;
- long v = 0;
- while ((i = stream.read()) >= 0) {
- v = v << 7;
- if ((i & 128) == 128) {
- // Continues
- v += (i&127);
- } else {
- // Last value
- v += i;
- break;
- }
- }
- return v;
- }
-
-
- /**
- * Get a LE short value from the beginning of a byte array
- *
- *@param data the byte array
- *@return the short (16-bit) value
- */
- public static short getShortLE(byte[] data) {
- return getShortLE(data, 0);
- }
- /**
- * Get a LE short value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the short (16-bit) value
- */
- public static short getShortLE(byte[] data, int offset) {
- return (short)getUShortLE(data, offset);
- }
-
- /**
- * Get a LE unsigned short value from the beginning of a byte array
- *
- *@param data the byte array
- *@return the unsigned short (16-bit) value in an int
- */
- public static int getUShortLE(byte[] data) {
- return getUShortLE(data, 0);
- }
- /**
- * Get a LE unsigned short value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the unsigned short (16-bit) value in an integer
- */
- public static int getUShortLE(byte[] data, int offset) {
- int b0 = data[offset] & 0xFF;
- int b1 = data[offset+1] & 0xFF;
- return (b1 << 8) + (b0 << 0);
- }
-
- /**
- * Get a BE short value from the beginning of a byte array
- *
- *@param data the byte array
- *@return the short (16-bit) value
- */
- public static short getShortBE(byte[] data) {
- return getShortBE(data, 0);
- }
- /**
- * Get a BE short value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the short (16-bit) value
- */
- public static short getShortBE(byte[] data, int offset) {
- return (short)getUShortBE(data, offset);
- }
-
- /**
- * Get a BE unsigned short value from the beginning of a byte array
- *
- *@param data the byte array
- *@return the unsigned short (16-bit) value in an int
- */
- public static int getUShortBE(byte[] data) {
- return getUShortBE(data, 0);
- }
- /**
- * Get a BE unsigned short value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the unsigned short (16-bit) value in an integer
- */
- public static int getUShortBE(byte[] data, int offset) {
- int b0 = data[offset] & 0xFF;
- int b1 = data[offset+1] & 0xFF;
- return (b0 << 8) + (b1 << 0);
- }
-
- /**
- * Get a LE int value from the beginning of a byte array
- *
- *@param data the byte array
- *@return the int (32-bit) value
- */
- public static int getIntLE(byte[] data) {
- return getIntLE(data, 0);
- }
- /**
- * Get a LE int value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the int (32-bit) value
- */
- public static int getIntLE(byte[] data, int offset) {
- int i=offset;
- int b0 = data[i++] & 0xFF;
- int b1 = data[i++] & 0xFF;
- int b2 = data[i++] & 0xFF;
- int b3 = data[i++] & 0xFF;
- return (b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0);
- }
-
- /**
- * Get a BE int value from the beginning of a byte array
- *
- *@param data the byte array
- *@return the int (32-bit) value
- */
- public static int getIntBE(byte[] data) {
- return getIntBE(data, 0);
- }
- /**
- * Get a BE int value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the int (32-bit) value
- */
- public static int getIntBE(byte[] data, int offset) {
- int i=offset;
- int b0 = data[i++] & 0xFF;
- int b1 = data[i++] & 0xFF;
- int b2 = data[i++] & 0xFF;
- int b3 = data[i++] & 0xFF;
- return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
- }
-
- /**
- * Get a LE unsigned int value from a byte array
- *
- *@param data the byte array
- *@return the unsigned int (32-bit) value in a long
- */
- public static long getUIntLE(byte[] data) {
- return getUIntLE(data,0);
- }
- /**
- * Get a LE unsigned int value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the unsigned int (32-bit) value in a long
- */
- public static long getUIntLE(byte[] data, int offset) {
- long retNum = getIntLE(data, offset);
- return retNum & 0x00FFFFFFFFl;
- }
-
- /**
- * Get a BE unsigned int value from a byte array
- *
- *@param data the byte array
- *@return the unsigned int (32-bit) value in a long
- */
- public static long getUIntBE(byte[] data) {
- return getUIntBE(data,0);
- }
- /**
- * Get a BE unsigned int value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the unsigned int (32-bit) value in a long
- */
- public static long getUIntBE(byte[] data, int offset) {
- long retNum = getIntBE(data, offset);
- return retNum & 0x00FFFFFFFFl;
- }
-
- /**
- * Get a LE long value from a byte array
- *
- *@param data the byte array
- *@param offset a starting offset into the byte array
- *@return the long (64-bit) value
- */
- public static long getLongLE(byte[] data, int offset) {
- long result = 0;
-
- for (int j = offset + LONG_SIZE - 1; j >= offset; j--) {
- result <<= 8;
- result |= 0xff & data[j];
- }
- return result;
- }
- private static final int LONG_SIZE = 8;
-
-
- /**
- * Convert an 'unsigned' byte to an integer. ie, don't carry across the
- * sign.
- *
- * @param b Description of the Parameter
- * @return Description of the Return Value
- */
- public static int ubyteToInt(byte b) {
- return b & 0xFF;
- }
-
- /**
- * get the unsigned value of a byte.
- *
- * @param data
- * the byte array.
- * @param offset
- * a starting offset into the byte array.
- * @return the unsigned value of the byte as a 16 bit short
- */
- public static short getUByte( byte[] data, int offset )
- {
- return (short) ( data[offset] & 0xFF );
- }
-
-
- public static class BufferUnderrunException extends TikaException {
- private static final long serialVersionUID = 8358288231138076276L;
- public BufferUnderrunException() {
- super("Insufficient data left in stream for required read");
- }
- }
+ /**
+ * Get a LE short value from an InputStream
+ *
+ * @param stream the InputStream from which the short is to be read
+ * @return the short (16-bit) value
+ * @throws IOException will be propagated back to the caller
+ * @throws BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static short readShortLE(InputStream stream) throws IOException, BufferUnderrunException {
+ return (short) readUShortLE(stream);
+ }
+
+ /**
+ * Get a BE short value from an InputStream
+ *
+ * @param stream the InputStream from which the short is to be read
+ * @return the short (16-bit) value
+ * @throws IOException will be propagated back to the caller
+ * @throws BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static short readShortBE(InputStream stream) throws IOException, BufferUnderrunException {
+ return (short) readUShortBE(stream);
+ }
+
+ public static int readUShortLE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ if ((ch1 | ch2) < 0) {
+ throw new BufferUnderrunException();
+ }
+ return (ch2 << 8) + (ch1 << 0);
+ }
+
+ public static int readUShortBE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ if ((ch1 | ch2) < 0) {
+ throw new BufferUnderrunException();
+ }
+ return (ch1 << 8) + (ch2 << 0);
+ }
+
+
+ /**
+ * Get a LE unsigned int value from an InputStream
+ *
+ * @param stream the InputStream from which the int is to be read
+ * @return the int (32-bit) value
+ * @throws IOException will be propagated back to the caller
+ * @throws BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ int ch3 = stream.read();
+ int ch4 = stream.read();
+ if ((ch1 | ch2 | ch3 | ch4) < 0) {
+ throw new BufferUnderrunException();
+ }
+ return ((ch4 << 24) + (ch3 << 16) + (ch2 << 8) + (ch1 << 0)) & 0x00FFFFFFFFl;
+ }
+
+ /**
+ * Get a LE int value from an InputStream
+ *
+ * @param stream the InputStream from which the int is to be read
+ * @return the int (32-bit) value
+ * @throws IOException will be propagated back to the caller
+ * @throws BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static int readIntLE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ int ch3 = stream.read();
+ int ch4 = stream.read();
+ if ((ch1 | ch2 | ch3 | ch4) < 0) {
+ throw new BufferUnderrunException();
+ }
+ return (ch4 << 24) + (ch3 << 16) + (ch2 << 8) + (ch1 << 0);
+ }
+
+ /**
+ * Get a BE int value from an InputStream
+ *
+ * @param stream the InputStream from which the int is to be read
+ * @return the int (32-bit) value
+ * @throws IOException will be propagated back to the caller
+ * @throws BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static int readIntBE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ int ch3 = stream.read();
+ int ch4 = stream.read();
+ if ((ch1 | ch2 | ch3 | ch4) < 0) {
+ throw new BufferUnderrunException();
+ }
+ return (ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0);
+ }
+
+ /**
+ * Get a LE long value from an InputStream
+ *
+ * @param stream the InputStream from which the long is to be read
+ * @return the long (64-bit) value
+ * @throws IOException will be propagated back to the caller
+ * @throws BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static long readLongLE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ int ch3 = stream.read();
+ int ch4 = stream.read();
+ int ch5 = stream.read();
+ int ch6 = stream.read();
+ int ch7 = stream.read();
+ int ch8 = stream.read();
+ if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
+ throw new BufferUnderrunException();
+ }
+
+ return
+ ((long) ch8 << 56) +
+ ((long) ch7 << 48) +
+ ((long) ch6 << 40) +
+ ((long) ch5 << 32) +
+ ((long) ch4 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
+ (ch3 << 16) +
+ (ch2 << 8) +
+ (ch1 << 0);
+ }
+
+ /**
+ * Get a NE long value from an InputStream
+ *
+ * @param stream the InputStream from which the long is to be read
+ * @return the long (64-bit) value
+ * @throws IOException will be propagated back to the caller
+ * @throws BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static long readLongBE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ int ch3 = stream.read();
+ int ch4 = stream.read();
+ int ch5 = stream.read();
+ int ch6 = stream.read();
+ int ch7 = stream.read();
+ int ch8 = stream.read();
+ if ((ch1 | ch2 | ch3 | ch4 | ch5 | ch6 | ch7 | ch8) < 0) {
+ throw new BufferUnderrunException();
+ }
+
+ return
+ ((long) ch1 << 56) +
+ ((long) ch2 << 48) +
+ ((long) ch3 << 40) +
+ ((long) ch4 << 32) +
+ ((long) ch5 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
+ (ch6 << 16) +
+ (ch7 << 8) +
+ (ch8 << 0);
+ }
+
+ /**
+ * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian
+ * but with the high bit on each number indicating if it continues or not
+ */
+ public static long readUE7(InputStream stream) throws IOException {
+ int i;
+ long v = 0;
+ while ((i = stream.read()) >= 0) {
+ v = v << 7;
+ if ((i & 128) == 128) {
+ // Continues
+ v += (i & 127);
+ } else {
+ // Last value
+ v += i;
+ break;
+ }
+ }
+ return v;
+ }
+
+
+ /**
+ * Get a LE short value from the beginning of a byte array
+ *
+ * @param data the byte array
+ * @return the short (16-bit) value
+ */
+ public static short getShortLE(byte[] data) {
+ return getShortLE(data, 0);
+ }
+
+ /**
+ * Get a LE short value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the short (16-bit) value
+ */
+ public static short getShortLE(byte[] data, int offset) {
+ return (short) getUShortLE(data, offset);
+ }
+
+ /**
+ * Get a LE unsigned short value from the beginning of a byte array
+ *
+ * @param data the byte array
+ * @return the unsigned short (16-bit) value in an int
+ */
+ public static int getUShortLE(byte[] data) {
+ return getUShortLE(data, 0);
+ }
+
+ /**
+ * Get a LE unsigned short value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the unsigned short (16-bit) value in an integer
+ */
+ public static int getUShortLE(byte[] data, int offset) {
+ int b0 = data[offset] & 0xFF;
+ int b1 = data[offset + 1] & 0xFF;
+ return (b1 << 8) + (b0 << 0);
+ }
+
+ /**
+ * Get a BE short value from the beginning of a byte array
+ *
+ * @param data the byte array
+ * @return the short (16-bit) value
+ */
+ public static short getShortBE(byte[] data) {
+ return getShortBE(data, 0);
+ }
+
+ /**
+ * Get a BE short value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the short (16-bit) value
+ */
+ public static short getShortBE(byte[] data, int offset) {
+ return (short) getUShortBE(data, offset);
+ }
+
+ /**
+ * Get a BE unsigned short value from the beginning of a byte array
+ *
+ * @param data the byte array
+ * @return the unsigned short (16-bit) value in an int
+ */
+ public static int getUShortBE(byte[] data) {
+ return getUShortBE(data, 0);
+ }
+
+ /**
+ * Get a BE unsigned short value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the unsigned short (16-bit) value in an integer
+ */
+ public static int getUShortBE(byte[] data, int offset) {
+ int b0 = data[offset] & 0xFF;
+ int b1 = data[offset + 1] & 0xFF;
+ return (b0 << 8) + (b1 << 0);
+ }
+
+ /**
+ * Get a LE int value from the beginning of a byte array
+ *
+ * @param data the byte array
+ * @return the int (32-bit) value
+ */
+ public static int getIntLE(byte[] data) {
+ return getIntLE(data, 0);
+ }
+
+ /**
+ * Get a LE int value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the int (32-bit) value
+ */
+ public static int getIntLE(byte[] data, int offset) {
+ int i = offset;
+ int b0 = data[i++] & 0xFF;
+ int b1 = data[i++] & 0xFF;
+ int b2 = data[i++] & 0xFF;
+ int b3 = data[i++] & 0xFF;
+ return (b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0);
+ }
+
+ /**
+ * Get a BE int value from the beginning of a byte array
+ *
+ * @param data the byte array
+ * @return the int (32-bit) value
+ */
+ public static int getIntBE(byte[] data) {
+ return getIntBE(data, 0);
+ }
+
+ /**
+ * Get a BE int value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the int (32-bit) value
+ */
+ public static int getIntBE(byte[] data, int offset) {
+ int i = offset;
+ int b0 = data[i++] & 0xFF;
+ int b1 = data[i++] & 0xFF;
+ int b2 = data[i++] & 0xFF;
+ int b3 = data[i++] & 0xFF;
+ return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+ }
+
+ /**
+ * Get a LE unsigned int value from a byte array
+ *
+ * @param data the byte array
+ * @return the unsigned int (32-bit) value in a long
+ */
+ public static long getUIntLE(byte[] data) {
+ return getUIntLE(data, 0);
+ }
+
+ /**
+ * Get a LE unsigned int value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the unsigned int (32-bit) value in a long
+ */
+ public static long getUIntLE(byte[] data, int offset) {
+ long retNum = getIntLE(data, offset);
+ return retNum & 0x00FFFFFFFFl;
+ }
+
+ /**
+ * Get a BE unsigned int value from a byte array
+ *
+ * @param data the byte array
+ * @return the unsigned int (32-bit) value in a long
+ */
+ public static long getUIntBE(byte[] data) {
+ return getUIntBE(data, 0);
+ }
+
+ /**
+ * Get a BE unsigned int value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the unsigned int (32-bit) value in a long
+ */
+ public static long getUIntBE(byte[] data, int offset) {
+ long retNum = getIntBE(data, offset);
+ return retNum & 0x00FFFFFFFFl;
+ }
+
+ /**
+ * Get a LE long value from a byte array
+ *
+ * @param data the byte array
+ * @param offset a starting offset into the byte array
+ * @return the long (64-bit) value
+ */
+ public static long getLongLE(byte[] data, int offset) {
+ long result = 0;
+
+ for (int j = offset + LONG_SIZE - 1; j >= offset; j--) {
+ result <<= 8;
+ result |= 0xff & data[j];
+ }
+ return result;
+ }
+
+ private static final int LONG_SIZE = 8;
+
+
+ /**
+ * Convert an 'unsigned' byte to an integer. ie, don't carry across the
+ * sign.
+ *
+ * @param b Description of the Parameter
+ * @return Description of the Return Value
+ */
+ public static int ubyteToInt(byte b) {
+ return b & 0xFF;
+ }
+
+ /**
+ * get the unsigned value of a byte.
+ *
+ * @param data the byte array.
+ * @param offset a starting offset into the byte array.
+ * @return the unsigned value of the byte as a 16 bit short
+ */
+ public static short getUByte(byte[] data, int offset) {
+ return (short) (data[offset] & 0xFF);
+ }
+
+
+ public static class BufferUnderrunException extends TikaException {
+ private static final long serialVersionUID = 8358288231138076276L;
+
+ public BufferUnderrunException() {
+ super("Insufficient data left in stream for required read");
+ }
+ }
}
[4/5] tika git commit: rm inconsistently capitalized test files
Posted by ta...@apache.org.
rm inconsistently capitalized test files
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/933af20e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/933af20e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/933af20e
Branch: refs/heads/2.x
Commit: 933af20e84cf088d80348b49eef7cbe4732a9eb3
Parents: e62f230
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:37:05 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:37:05 2016 -0400
----------------------------------------------------------------------
.../test-documents/testExcel_embeddedPDF.xls | Bin 38400 -> 0 bytes
.../test-documents/testExcel_embeddedPDF.xlsx | Bin 25602 -> 0 bytes
.../test-documents/testPPT_EmbeddedPDF.ppt | Bin 187392 -> 0 bytes
.../test-documents/testPPT_EmbeddedPDF.pptx | Bin 108637 -> 0 bytes
4 files changed, 0 insertions(+), 0 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/933af20e/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
deleted file mode 100644
index c38f64c..0000000
Binary files a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/933af20e/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
deleted file mode 100644
index 9c0d2b9..0000000
Binary files a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/933af20e/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
deleted file mode 100644
index 3129be1..0000000
Binary files a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/933af20e/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
deleted file mode 100644
index a96aa3c..0000000
Binary files a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx and /dev/null differ
[5/5] tika git commit: TIKA-2026 -- improve extraction of attachments for PPT, PPTX, XLSX
Posted by ta...@apache.org.
TIKA-2026 -- improve extraction of attachments for PPT, PPTX, XLSX
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dd3c2a48
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dd3c2a48
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dd3c2a48
Branch: refs/heads/2.x
Commit: dd3c2a486a41903d5ebeb4bf341be29e02af8499
Parents: 933af20
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:54:40 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:54:40 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 5 +++-
.../microsoft/AbstractPOIFSExtractor.java | 19 ++++++++++----
.../tika/parser/microsoft/HSLFExtractor.java | 18 ++++++++++---
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 3 +--
.../tika/parser/microsoft/ExcelParserTest.java | 13 +++++++---
.../parser/microsoft/PowerPointParserTest.java | 14 ++++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 26 ++++++++++++++++---
.../test-documents/testEXCEL_embeddedPDF.xls | Bin 0 -> 38400 bytes
.../test-documents/testEXCEL_embeddedPDF.xlsx | Bin 0 -> 25602 bytes
.../test-documents/testPPT_embeddedPDF.ppt | Bin 0 -> 187392 bytes
.../test-documents/testPPT_embeddedPDF.pptx | Bin 0 -> 108637 bytes
11 files changed, 78 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 766780f..64e1f53 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,7 +17,10 @@ Release 2.0 - ???
Release 1.14 - ???
- * Add parser for applefile (AppleSingle) (TIKA-2022)
+ * Improve extraction of embedded documents for PPT, PPTX and XLSX
+ (TIKA-2026).
+
+ * Add parser for applefile (AppleSingle) (TIKA-2022).
* Add mime types, mime magic and/or globs for:
* Endnote Import File (TIKA-2011)
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 1225288..739af69 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -152,6 +152,15 @@ abstract class AbstractPOIFSExtractor {
protected void handleEmbeddedOfficeDoc(
DirectoryEntry dir, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
+ handleEmbeddedOfficeDoc(dir, null, xhtml);
+ }
+
+ /**
+ * Handle an office document that's embedded at the POIFS level
+ */
+ protected void handleEmbeddedOfficeDoc(
+ DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
// Is it an embedded OLE2 document, or an embedded OOXML document?
@@ -177,21 +186,21 @@ abstract class AbstractPOIFSExtractor {
}
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
TikaInputStream embedded = null;
-
+ String rName = (resourceName == null) ? dir.getName() : resourceName;
try {
if (type == POIFSDocumentType.OLE10_NATIVE) {
try {
// Try to un-wrap the OLE10Native record:
Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
if (ole.getLabel() != null) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
}
byte[] data = ole.getDataBuffer();
embedded = TikaInputStream.get(data);
} catch (Ole10NativeException ex) {
// Not a valid OLE10Native record, skip it
} catch (Exception e) {
- logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+ logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + rName, e);
}
} else if (type == POIFSDocumentType.COMP_OBJ) {
try {
@@ -219,13 +228,13 @@ abstract class AbstractPOIFSExtractor {
// Record what we can do about it
metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
} catch (Exception e) {
throw new TikaException("Invalid embedded resource", e);
}
} else {
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
}
// Should we parse it?
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 656fdbb..1b34f03 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.List;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.poi.hslf.model.Comment;
import org.apache.poi.hslf.model.HeadersFooters;
@@ -40,6 +41,8 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -369,10 +372,19 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
String mediaType = null;
if ("Excel.Chart.8".equals(oleShape.getProgID())) {
mediaType = "application/vnd.ms-excel";
+ } else {
+ MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
+ mediaType = mt.toString();
+ }
+ if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
+ try(NPOIFSFileSystem npoifs = new NPOIFSFileSystem(new CloseShieldInputStream(stream))) {
+ handleEmbeddedOfficeDoc(npoifs.getRoot(), objID, xhtml);
+ }
+ } else {
+ handleEmbeddedResource(
+ stream, objID, objID,
+ mediaType, xhtml, false);
}
- handleEmbeddedResource(
- stream, objID, objID,
- mediaType, xhtml, false);
}
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 84e9752..cd1919d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -229,8 +229,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
if (root.hasEntry("CONTENTS")
&& root.hasEntry("\u0001Ole")
- && root.hasEntry("\u0001CompObj")
- && root.hasEntry("\u0003ObjInfo")) {
+ && root.hasEntry("\u0001CompObj")) {
// TIKA-704: OLE 2.0 embedded non-Office document?
//TODO: original file paths can be stored underneath root
//figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 3e98aa9..196ffa9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
-import java.io.InputStream;
-import java.util.Locale;
-
import org.apache.tika.TikaTest;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
@@ -402,4 +403,10 @@ public class ExcelParserTest extends TikaTest {
//link on textbox
// assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
}
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+ assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index e0eee56..32d462e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -16,11 +16,12 @@
*/
package org.apache.tika.parser.microsoft;
-import static org.junit.Assert.assertEquals;
-
import java.io.InputStream;
+import java.util.List;
import java.util.Locale;
+import static org.junit.Assert.assertEquals;
+
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -238,4 +239,13 @@ public class PowerPointParserTest extends TikaTest {
XMLResult r = getXML("testPPT_comment.ppt");
assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
}
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
+ assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+ assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
+ assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b442d07..5159ade 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,10 +16,6 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
@@ -29,9 +25,14 @@ import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
import java.util.HashMap;
+import java.util.List;
import java.util.Locale;
import java.util.Map;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
import org.apache.tika.TikaTest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
@@ -1209,6 +1210,23 @@ public class OOXMLParserTest extends TikaTest {
//link on textbox
assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
}
+
+ @Test
+ public void testEmbeddedPDFInPPTX() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.pptx");
+ Metadata pdfMetadata1 = metadataList.get(2);
+ assertEquals("application/pdf", pdfMetadata1.get(Metadata.CONTENT_TYPE));
+ Metadata pdfMetadata2 = metadataList.get(4);
+ assertEquals("application/pdf", pdfMetadata2.get(Metadata.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testEmbeddedPDFInXLSX() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+ Metadata pdfMetadata = metadataList.get(2);
+ assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testEXCEL_embeddedPDF.xlsx differ
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.ppt differ
http://git-wip-us.apache.org/repos/asf/tika/blob/dd3c2a48/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_embeddedPDF.pptx differ
[3/5] tika git commit: TIKA-2024 extract original file name/path where possible, take 1
Posted by ta...@apache.org.
TIKA-2024 extract original file name/path where possible, take 1
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e62f2305
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e62f2305
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e62f2305
Branch: refs/heads/2.x
Commit: e62f2305783763aad0a2c587f96b162ae4be1c36
Parents: c84855f
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:35:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:35:27 2016 -0400
----------------------------------------------------------------------
.../tika/metadata/TikaCoreProperties.java | 7 ++
.../parser/apple/AppleSingleFileParser.java | 4 +-
.../parser/microsoft/JackcessExtractor.java | 4 +-
.../tika/parser/microsoft/OfficeParser.java | 2 +-
.../tika/parser/microsoft/WordExtractor.java | 22 +++-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 2 +
.../microsoft/xml/AbstractXML2003Parser.java | 6 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 53 +++++++-
.../tika/parser/rtf/RTFObjDataParser.java | 2 +
.../parser/apple/AppleSingleFileParserTest.java | 3 +
.../tika/parser/microsoft/WordParserTest.java | 16 ++-
.../parser/microsoft/xml/XML2003ParserTest.java | 7 +-
.../apache/tika/parser/rtf/RTFParserTest.java | 124 +++++++------------
.../tika/parser/pdf/AbstractPDF2XHTML.java | 22 ++--
.../test-documents/testAppleSingleFile.pdf | Bin 54926 -> 1893 bytes
.../test-documents/testExcel_embeddedPDF.xls | Bin 0 -> 38400 bytes
.../test-documents/testExcel_embeddedPDF.xlsx | Bin 0 -> 25602 bytes
.../test-documents/testPPT_EmbeddedPDF.ppt | Bin 0 -> 187392 bytes
.../test-documents/testPPT_EmbeddedPDF.pptx | Bin 0 -> 108637 bytes
19 files changed, 170 insertions(+), 104 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 5052fbc..f4b97dd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -76,6 +76,13 @@ public interface TikaCoreProperties {
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX+"warn");
/**
+ * Some file formats can store information about their original
+ * file name/location or about their attachment's original file name/location.
+ */
+ public static final Property ORIGINAL_RESOURCE_NAME =
+ Property.internalTextBag(TIKA_META_PREFIX+"origResourceName");
+
+ /**
* This is currently used to identify Content-Type that may be
* included within a document, such as in html documents
* (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">)
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index 3f548ca..0f3c044 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -32,6 +32,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
@@ -133,8 +134,7 @@ public class AppleSingleFileParser extends AbstractParser {
IOUtils.readFully(stream, buffer);
bytesRead += f.length;
String originalFileName = new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII);
- //TODO: figure out correct metadata key
- //embeddedMetadata.set(TikaCoreProperties.IDENTIFIER, originalFileName);
+ embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalFileName);
} else if (f.entryId != DATA_FORK) {
IOUtils.skipFully(stream, f.length);
bytesRead += f.length;
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 345dd24..fb8a2c2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -301,7 +301,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
break;
case SIMPLE_PACKAGE:
OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
-
+ //TODO: find test file that has this kind of attachment
+ //and see if getFilePath or getLocalFilePath is meaningful
+ //for TikaCoreProperties.ORIGINAL_RESOURCE_NAME
handleEmbeddedResource(
TikaInputStream.get(spc.getStream()),
spc.getFileName(),//filename
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b8deb99..f5f9f3e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -143,7 +143,7 @@ public class OfficeParser extends AbstractParser {
xhtml.element("p", publisherTextExtractor.getText());
break;
case WORDDOCUMENT:
- new WordExtractor(context).parse(root, xhtml);
+ new WordExtractor(context, metadata).parse(root, xhtml);
break;
case POWERPOINT:
new HSLFExtractor(context).parse(root, xhtml);
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 4c950fa..8d36115 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -34,6 +34,8 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.SavedByEntry;
+import org.apache.poi.hwpf.model.SavedByTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Field;
@@ -50,6 +52,8 @@ import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -79,8 +83,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
private boolean curBold;
private boolean curItalic;
- public WordExtractor(ParseContext context) {
+ private final Metadata metadata;
+
+ public WordExtractor(ParseContext context, Metadata metadata) {
super(context);
+ this.metadata = metadata;
}
private static int countParagraphs(Range... ranges) {
@@ -146,6 +153,9 @@ public class WordExtractor extends AbstractPOIFSExtractor {
parseWord6(root, xhtml);
return;
}
+
+ extractSavedByMetadata(document);
+
org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
new org.apache.poi.hwpf.extractor.WordExtractor(document);
HeaderStories headerFooter = new HeaderStories(document);
@@ -212,6 +222,16 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
}
+ private void extractSavedByMetadata(HWPFDocument document) {
+ SavedByTable savedByTable = document.getSavedByTable();
+ if (savedByTable == null) {
+ return;
+ }
+ for (SavedByEntry sbe : savedByTable.getEntries()) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
+ }
+ }
+
private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document,
PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
throws SAXException, IOException, TikaException {
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 17e629f..84e9752 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -232,6 +232,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
&& root.hasEntry("\u0001CompObj")
&& root.hasEntry("\u0003ObjInfo")) {
// TIKA-704: OLE 2.0 embedded non-Office document?
+ //TODO: original file paths can be stored underneath root
+ //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
stream = TikaInputStream.get(
fs.createDocumentInputStream("CONTENTS"));
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index a12f25e..4630219 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -16,6 +16,9 @@
*/
package org.apache.tika.parser.microsoft.xml;
+import java.io.IOException;
+import java.io.InputStream;
+
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -37,14 +40,13 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import java.io.IOException;
-import java.io.InputStream;
public abstract class AbstractXML2003Parser extends AbstractParser {
final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
final static String MS_SPREADSHEET_URN = "urn:schemas-microsoft-com:office:spreadsheet";
+ final static String MS_VML_URN = "urn:schemas-microsoft-com:vml";
final static String WORD_ML_URL = "http://schemas.microsoft.com/office/word/2003/wordml";
final static Attributes EMPTY_ATTRS = new AttributesImpl();
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 28b33e4..67d13a9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -31,6 +31,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
@@ -186,10 +187,12 @@ public class WordMLParser extends AbstractXML2003Parser {
private class PictHandler extends DefaultHandler {
final StringBuilder buffer = new StringBuilder();
final ContentHandler handler;
+ byte[] rawBytes = null;
EmbeddedDocumentExtractor embeddedDocumentExtractor;
boolean inPict = false;
boolean inBin = false;
String pictName = null;
+ String pictSource = null;
final Base64 base64 = new Base64();
public PictHandler(ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) {
@@ -210,6 +213,24 @@ public class WordMLParser extends AbstractXML2003Parser {
pictName = pictName.replaceFirst("wordml://", "");
}
}
+ } else if (MS_VML_URN.equals(uri)) {
+ if (localName.equals("imagedata")) {
+ //src is an internal designator with an extension
+ String src = attrs.getValue("", "src");
+ //title appears to be the original file name
+ String title = attrs.getValue(MS_OFFICE_PROPERTIES_URN, "title");
+ if (title != null && ! title.equals("")) {
+ if (src != null) {
+ //take the extension from the src and append it to the title
+ int i = src.lastIndexOf(".");
+ if (i > -1 && i +1 < src.length()) {
+ String ext = src.substring(i);
+ title += ext;
+ }
+ }
+ pictSource = title;
+ }
+ }
}
}
@@ -227,6 +248,13 @@ public class WordMLParser extends AbstractXML2003Parser {
if (!WORD_ML_URL.equals(uri)) {
return;
}
+ //somewhat tricky...
+ //can't just dump bin_data at the end of the
+ //bin_data element because there may be metadata
+ //after it, if it is within a pict element
+ //<pict><binData></binData><imagedata/></pict>.
+ //However, if you aren't in a pict (say docOLEdata), then do dump binary
+ //data at the end of the bin data.
if (PICT.equals(localName)) {
inPict = false;
AttributesImpl attrs = new AttributesImpl();
@@ -238,17 +266,29 @@ public class WordMLParser extends AbstractXML2003Parser {
IMG, IMG, attrs);
handler.endElement(
XHTMLContentHandler.XHTML, IMG, IMG);
+ handleEmbedded();
} else if (BIN_DATA.equals(localName)) {
inBin = false;
- byte[] bytes = base64.decode(buffer.toString());
- if (bytes == null) {
- return;
+ rawBytes = base64.decode(buffer.toString());
+ //reset
+ buffer.setLength(0);
+
+ if (! inPict) {
+ handleEmbedded();
}
- try (TikaInputStream is = TikaInputStream.get(bytes)) {
+ }
+ }
+
+ private void handleEmbedded() throws SAXException {
+ if (rawBytes != null) {
+ try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
Metadata metadata = new Metadata();
if (pictName != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, pictName);
}
+ if (pictSource != null) {
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
+ }
if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
embeddedDocumentExtractor.parseEmbedded(is,
handler, metadata, false);
@@ -256,8 +296,11 @@ public class WordMLParser extends AbstractXML2003Parser {
} catch (IOException e) {
//log
}
- buffer.setLength(0);
}
+ //reset
+ pictName = null;
+ pictSource = null;
+ rawBytes = null;
}
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index 147d2e8..6426687 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -40,6 +40,7 @@ import org.apache.tika.io.EndianUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
/**
@@ -242,6 +243,7 @@ class RTFObjDataParser {
fileNameToUse = displayName == null ? "" : displayName;
pathToUse = ansiFilePath == null ? "" : ansiFilePath;
}
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileNameToUse);
metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
index c80c94a..bd8156d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
@@ -25,6 +25,7 @@ import java.util.List;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.junit.Test;
public class AppleSingleFileParserTest extends TikaTest {
@@ -36,5 +37,7 @@ public class AppleSingleFileParserTest extends TikaTest {
assertContains(AppleSingleFileParser.class.getName(),
Arrays.asList(list.get(0).getValues("X-Parsed-By")));
assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
+ assertEquals("fltsyllabussortie2rev1_2.pdf", list.get(1).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 138120e..9d9d372 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -16,13 +16,15 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
-import java.io.InputStream;
-import java.util.Locale;
-
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
@@ -492,5 +494,13 @@ public class WordParserTest extends TikaTest {
assertEquals("manager1", managers[0]);
assertEquals("manager2", managers[1]);
}
+
+ @Test
+ public void testOrigLocation() throws Exception {
+ Metadata metadata = getXML("testException2.doc").metadata;
+ List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+ assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
+ assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index 04530ce..510cd32 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -17,7 +17,6 @@
package org.apache.tika.parser.microsoft.xml;
import org.apache.tika.TikaTest;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -26,11 +25,11 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+
import java.util.Arrays;
import java.util.List;
-import static org.junit.Assert.assertEquals;
-
public class XML2003ParserTest extends TikaTest {
@Test
@@ -80,6 +79,8 @@ public class XML2003ParserTest extends TikaTest {
assertContains("R1 c1 R1 c2", txt);
assertNotContained("footnoteFigure", txt);
assertContains("footnote Figure", txt);
+
+ assertEquals("testJPEG_EXIF.jpg", list.get(7).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
assertEquals("image/jpeg", list.get(7).get(Metadata.CONTENT_TYPE));
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index dc75be5..d80842b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -18,13 +18,13 @@ package org.apache.tika.parser.rtf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.io.InputStream;
-import java.util.ArrayList;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FilenameUtils;
@@ -381,83 +381,45 @@ public class RTFParserTest extends TikaTest {
// TIKA-1010
@Test
public void testEmbeddedMonster() throws Exception {
- Set<MediaType> skipTypes = new HashSet<MediaType>();
- skipTypes.add(MediaType.parse("application/x-emf"));
- skipTypes.add(MediaType.parse("application/x-msmetafile"));
-
-
- List<String> trueNames = new ArrayList<String>();
- trueNames.add("file_0.doc");
- trueNames.add("Hw.txt");
- trueNames.add("file_1.xlsx");
- trueNames.add("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip");
- trueNames.add("html-within-zip.zip");
- trueNames.add("text.html");
- trueNames.add("testHTML_utf8_\u666E\u6797\u65AF\u987F.html");
- trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
- trueNames.add("file_2.xls");
- trueNames.add("testMSG_\u666E\u6797\u65AF\u987F.msg");
- trueNames.add("file_3.pdf");
- trueNames.add("file_4.ppt");
- trueNames.add("file_5.pptx");
- trueNames.add("thumbnail.jpeg");
- trueNames.add("file_6.doc");
- trueNames.add("file_7.doc");
- trueNames.add("file_8.docx");
- trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-
- List<String> trueTypes = new ArrayList<String>();
- trueTypes.add("application/msword");
- trueTypes.add("text/plain");
- trueTypes.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- trueTypes.add("application/zip");
- trueTypes.add("application/zip");
- trueTypes.add("text/html");
- trueTypes.add("text/html");
- trueTypes.add("image/jpeg");
- trueTypes.add("application/vnd.ms-excel");
- trueTypes.add("application/vnd.ms-outlook");
- trueTypes.add("application/pdf");
- trueTypes.add("application/vnd.ms-powerpoint");
- trueTypes.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
- trueTypes.add("image/jpeg");
- trueTypes.add("application/msword");
- trueTypes.add("application/msword");
- trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
- trueTypes.add("image/jpeg");
-
- TrackingHandler tracker = new TrackingHandler(skipTypes);
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
- }
- assertEquals(trueNames.size(), tracker.filenames.size());
- assertEquals(trueTypes.size(), tracker.mediaTypes.size());
- for (int i = 0; i < tracker.filenames.size(); i++) {
- String expectedName = trueNames.get(i);
- if (expectedName == null) {
- assertNull(tracker.filenames.get(i));
- } else {
- assertNotNull(tracker.filenames.get(i));
- //necessary to getName() because MSOffice extractor includes
- //directory: _1457338524/HW.txt
- assertEquals("filename equals ",
- expectedName, FilenameUtils.getName(tracker.filenames.get(i)));
- }
- assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString());
- }
-
- tracker = new TrackingHandler();
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
+ Map<Integer, Pair> expected = new HashMap<>();
+ expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+ expected.put(3, new Pair("file_0.doc", "application/msword"));
+ expected.put(6, new Pair("file_1.xlsx",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+ expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
+ expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
+ expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+ expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
+ expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+ expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
+ expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
+ expected.put(26, new Pair("file_3.pdf", "application/pdf"));
+ expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
+ expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+ expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
+ expected.put(36, new Pair("file_6.doc", "application/msword"));
+ expected.put(39, new Pair("file_7.doc", "application/msword"));
+ expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+ expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+
+
+ List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
+ assertEquals(48, metadataList.size());
+ for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
+ Metadata metadata = metadataList.get(e.getKey());
+ Pair p = e.getValue();
+ assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
+ //necessary to getName() because MSOffice extractor includes
+ //directory: _1457338524/HW.txt
+ assertEquals("filename equals ",
+ p.fileName, FilenameUtils.getName(
+ metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
+
+ assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
}
- assertEquals(47, tracker.filenames.size());
- assertEquals("thumbnail_26.emf", tracker.filenames.get(45));
- assertEquals("thumbnail_27.wmf", tracker.filenames.get(46));
+ assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
+ metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
}
//TIKA-1010 test regular (not "embedded") images/picts
@@ -537,4 +499,12 @@ public class RTFParserTest extends TikaTest {
assertEquals(2, tracker.filenames.size());
}
+ private static class Pair {
+ final String fileName;
+ final String mimeType;
+ Pair(String fileName, String mimeType) {
+ this.fileName = fileName;
+ this.mimeType = mimeType;
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 9a73bde..832b06e 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -33,6 +33,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
import javax.xml.stream.XMLStreamException;
import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
@@ -176,7 +178,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- private void extractMultiOSPDEmbeddedFiles(String defaultName,
+ private void extractMultiOSPDEmbeddedFiles(String displayName,
PDComplexFileSpecification spec,
EmbeddedDocumentExtractor extractor) throws IOException,
SAXException, TikaException {
@@ -185,13 +187,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return;
}
//current strategy is to pull all, not just first non-null
- extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
}
- private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+ private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
+ String fileName, PDEmbeddedFile file,
EmbeddedDocumentExtractor extractor)
throws SAXException, IOException, TikaException {
@@ -199,8 +202,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//skip silently
return;
}
-
- fileName = (fileName == null) ? defaultName : fileName;
+
+ fileName = (fileName == null) ? displayName : fileName;
// TODO: other metadata?
Metadata metadata = new Metadata();
@@ -209,6 +212,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
if (extractor.shouldParseEmbedded(metadata)) {
TikaInputStream stream = null;
@@ -289,7 +293,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
try {
- extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+ extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
} catch (SAXException e) {
throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
} catch (TikaException e) {
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf b/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf
index a385313..a407ded 100644
Binary files a/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf and b/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf differ
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx differ
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt differ
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx differ
[2/5] tika git commit: TIKA-2022 - clean up -- make entries private,
move more into EndianUtils
Posted by ta...@apache.org.
TIKA-2022 - clean up -- make entries private, move more into EndianUtils
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c84855f6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c84855f6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c84855f6
Branch: refs/heads/2.x
Commit: c84855f6757c714a9fdcec55ca14b628a107642e
Parents: 865c45c
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:13:01 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:13:01 2016 -0400
----------------------------------------------------------------------
.../java/org/apache/tika/io/EndianUtils.java | 19 +++++++++++
.../org/apache/tika/io/EndianUtilsTest.java | 16 +++++++++
.../parser/apple/AppleSingleFileParser.java | 35 ++++++++++----------
3 files changed, 52 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/c84855f6/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 2ab85b3..05da5e0 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -93,6 +93,25 @@ public class EndianUtils {
}
/**
+ * Get a BE unsigned int value from an InputStream
+ *
+ * @param stream the InputStream from which the int is to be read
+ * @return the unsigned int (32-bit) value, widened to a long
+ * @throws IOException will be propagated back to the caller
+ * @throws BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static long readUIntBE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ int ch3 = stream.read();
+ int ch4 = stream.read();
+ if ((ch1 | ch2 | ch3 | ch4) < 0) {
+ throw new BufferUnderrunException();
+ }
+ return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)) & 0x00FFFFFFFFl;
+ }
+
+ /**
* Get a LE int value from an InputStream
*
* @param stream the InputStream from which the int is to be read
http://git-wip-us.apache.org/repos/asf/tika/blob/c84855f6/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
index 8f481c3..50084d2 100644
--- a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
@@ -54,6 +54,22 @@ public class EndianUtilsTest {
} catch (EndianUtils.BufferUnderrunException e) {
}
+ }
+
+ @Test
+ public void testReadUIntBE() throws Exception {
+ byte[] data = new byte[] {(byte)0x00, (byte)0x00, (byte)0x00, (byte)0x08 };
+ assertEquals((long) 8, EndianUtils.readUIntBE(new ByteArrayInputStream(data)));
+ data = new byte[] {(byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xF0 };
+ assertEquals(4294967280L, EndianUtils.readUIntBE(new ByteArrayInputStream(data)));
+
+ data = new byte[] {(byte)0xFF, (byte)0xFF, (byte)0xFF };
+ try {
+ EndianUtils.readUIntLE(new ByteArrayInputStream(data));
+ fail("Should have thrown exception");
+ } catch (EndianUtils.BufferUnderrunException e) {
+
+ }
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/c84855f6/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index ffb5759..3f548ca 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -50,21 +50,21 @@ public class AppleSingleFileParser extends AbstractParser {
/**
* Entry types
*/
- public static final int DATA_FORK = 1;
- public static final int RESOURCE_FORK = 2;
- public static final int REAL_NAME = 3;
- public static final int COMMENT = 4;
- public static final int ICON_BW = 5;
- public static final int ICON_COLOR = 6;
+ private static final int DATA_FORK = 1;
+ private static final int RESOURCE_FORK = 2;
+ private static final int REAL_NAME = 3;
+ private static final int COMMENT = 4;
+ private static final int ICON_BW = 5;
+ private static final int ICON_COLOR = 6;
//7?!
- public static final int FILE_DATES_INFO = 8;
- public static final int FINDER_INFO = 9;
- public static final int MACINTOSH_FILE_INFO = 10;
- public static final int PRODOS_FILE_INFO = 11;
- public static final int MSDOS_FILE_INFO = 12;
- public static final int SHORT_NAME = 13;
- public static final int AFP_FILE_INFO = 14;
- public static final int DIRECTORY_ID = 15;
+ private static final int FILE_DATES_INFO = 8;
+ private static final int FINDER_INFO = 9;
+ private static final int MACINTOSH_FILE_INFO = 10;
+ private static final int PRODOS_FILE_INFO = 11;
+ private static final int MSDOS_FILE_INFO = 12;
+ private static final int SHORT_NAME = 13;
+ private static final int AFP_FILE_INFO = 14;
+ private static final int DIRECTORY_ID = 15;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("applefile"));
@@ -94,7 +94,6 @@ public class AppleSingleFileParser extends AbstractParser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
if (contentFieldInfo != null) {
- System.out.println(contentFieldInfo.offset + " "+bytesRead);
long diff = contentFieldInfo.offset-bytesRead;
IOUtils.skipFully(stream, diff);
if (ex.shouldParseEmbedded(embeddedMetadata)) {
@@ -153,9 +152,9 @@ public class AppleSingleFileParser extends AbstractParser {
//convert 32-bit unsigned ints to longs
fieldInfoList.add(
new FieldInfo(
- EndianUtils.readIntBE(stream) & 0x00000000ffffffffL, //entry id
- EndianUtils.readIntBE(stream) & 0x00000000ffffffffL, //offset
- EndianUtils.readIntBE(stream) & 0x00000000ffffffffL //length
+ EndianUtils.readUIntBE(stream), //entry id
+ EndianUtils.readUIntBE(stream), //offset
+ EndianUtils.readUIntBE(stream) //length
)
);
}