You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/10/07 23:05:22 UTC
svn commit: r1180243 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
Author: nick
Date: Fri Oct 7 21:05:22 2011
New Revision: 1180243
URL: http://svn.apache.org/viewvc?rev=1180243&view=rev
Log:
TIKA-749 Convert the DWG and PRT parsers to use the Tika endian util, rather than the POI one
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java?rev=1180243&r1=1180242&r2=1180243&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java Fri Oct 7 21:05:22 2011
@@ -169,8 +169,204 @@ public class EndianUtils {
(ch7 << 8) +
(ch8 << 0);
}
+
+
+ /**
+ * Get a LE short value from the beginning of a byte array
+ *
+ *@param data the byte array
+ *@return the short (16-bit) value
+ */
+ public static short getShortLE(byte[] data) {
+ return getShortLE(data, 0);
+ }
+ /**
+ * Get a LE short value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the short (16-bit) value
+ */
+ public static short getShortLE(byte[] data, int offset) {
+ return (short)getUShortLE(data, offset);
+ }
+
+ /**
+ * Get a LE unsigned short value from the beginning of a byte array
+ *
+ *@param data the byte array
+ *@return the unsigned short (16-bit) value in an int
+ */
+ public static int getUShortLE(byte[] data) {
+ return getUShortLE(data, 0);
+ }
+ /**
+ * Get a LE unsigned short value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the unsigned short (16-bit) value in an integer
+ */
+ public static int getUShortLE(byte[] data, int offset) {
+ int b0 = data[offset] & 0xFF;
+ int b1 = data[offset+1] & 0xFF;
+ return (b1 << 8) + (b0 << 0);
+ }
+
+ /**
+ * Get a BE short value from the beginning of a byte array
+ *
+ *@param data the byte array
+ *@return the short (16-bit) value
+ */
+ public static short getShortBE(byte[] data) {
+ return getShortBE(data, 0);
+ }
+ /**
+ * Get a BE short value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the short (16-bit) value
+ */
+ public static short getShortBE(byte[] data, int offset) {
+ return (short)getUShortBE(data, offset);
+ }
+
+ /**
+ * Get a BE unsigned short value from the beginning of a byte array
+ *
+ *@param data the byte array
+ *@return the unsigned short (16-bit) value in an int
+ */
+ public static int getUShortBE(byte[] data) {
+ return getUShortBE(data, 0);
+ }
+ /**
+ * Get a BE unsigned short value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the unsigned short (16-bit) value in an integer
+ */
+ public static int getUShortBE(byte[] data, int offset) {
+ int b0 = data[offset] & 0xFF;
+ int b1 = data[offset+1] & 0xFF;
+ return (b0 << 8) + (b1 << 0);
+ }
+
+ /**
+ * Get a LE int value from the beginning of a byte array
+ *
+ *@param data the byte array
+ *@return the int (32-bit) value
+ */
+ public static int getIntLE(byte[] data) {
+ return getIntLE(data, 0);
+ }
+ /**
+ * Get a LE int value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the int (32-bit) value
+ */
+ public static int getIntLE(byte[] data, int offset) {
+ int i=offset;
+ int b0 = data[i++] & 0xFF;
+ int b1 = data[i++] & 0xFF;
+ int b2 = data[i++] & 0xFF;
+ int b3 = data[i++] & 0xFF;
+ return (b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0);
+ }
/**
+ * Get a BE int value from the beginning of a byte array
+ *
+ *@param data the byte array
+ *@return the int (32-bit) value
+ */
+ public static int getIntBE(byte[] data) {
+ return getIntBE(data, 0);
+ }
+ /**
+ * Get a BE int value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the int (32-bit) value
+ */
+ public static int getIntBE(byte[] data, int offset) {
+ int i=offset;
+ int b0 = data[i++] & 0xFF;
+ int b1 = data[i++] & 0xFF;
+ int b2 = data[i++] & 0xFF;
+ int b3 = data[i++] & 0xFF;
+ return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+ }
+
+ /**
+ * Get a LE unsigned int value from the beginning of a byte array
+ *
+ *@param data the byte array
+ *@return the unsigned int (32-bit) value in a long
+ */
+ public static long getUIntLE(byte[] data) {
+ return getUIntLE(data,0);
+ }
+ /**
+ * Get a LE unsigned int value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the unsigned int (32-bit) value in a long
+ */
+ public static long getUIntLE(byte[] data, int offset) {
+ long retNum = getIntLE(data, offset);
+ return retNum & 0x00FFFFFFFFl;
+ }
+
+ /**
+ * Get a BE unsigned int value from the beginning of a byte array
+ *
+ *@param data the byte array
+ *@return the unsigned int (32-bit) value in a long
+ */
+ public static long getUIntBE(byte[] data) {
+ return getUIntBE(data,0);
+ }
+ /**
+ * Get a BE unsigned int value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the unsigned int (32-bit) value in a long
+ */
+ public static long getUIntBE(byte[] data, int offset) {
+ long retNum = getIntBE(data, offset);
+ return retNum & 0x00FFFFFFFFl;
+ }
+
+ /**
+ * Get a LE long value from a byte array
+ *
+ *@param data the byte array
+ *@param offset a starting offset into the byte array
+ *@return the long (64-bit) value
+ */
+ public static long getLongLE(byte[] data, int offset) {
+ long result = 0;
+
+ for (int j = offset + LONG_SIZE - 1; j >= offset; j--) {
+ result <<= 8;
+ result |= 0xff & data[j];
+ }
+ return result;
+ }
+ private static final int LONG_SIZE = 8;
+
+
+ /**
* Convert an 'unsigned' byte to an integer. ie, don't carry across the
* sign.
*
@@ -195,7 +391,9 @@ public class EndianUtils {
return (short) ( data[offset] & 0xFF );
}
+
public static class BufferUnderrunException extends TikaException {
+ private static final long serialVersionUID = 8358288231138076276L;
public BufferUnderrunException() {
super("Insufficient data left in stream for required read");
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java?rev=1180243&r1=1180242&r2=1180243&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java Fri Oct 7 21:05:22 2011
@@ -22,9 +22,9 @@ import java.util.Collections;
import java.util.Set;
import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
@@ -130,7 +130,7 @@ public class DWGParser extends AbstractP
*/
private void get2004Props(
InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, SAXException {
+ throws IOException, TikaException, SAXException {
// Standard properties
for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
String headerValue = read2004String(stream);
@@ -148,8 +148,8 @@ public class DWGParser extends AbstractP
}
}
- private String read2004String(InputStream stream) throws IOException {
- int stringLen = LittleEndian.readUShort(stream);
+ private String read2004String(InputStream stream) throws IOException, TikaException {
+ int stringLen = EndianUtils.readUShortLE(stream);
byte[] stringData = new byte[stringLen];
IOUtils.readFully(stream, stringData);
@@ -167,7 +167,7 @@ public class DWGParser extends AbstractP
*/
private void get2007and2010Props(
InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, SAXException {
+ throws IOException, TikaException, SAXException {
// Standard properties
for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
String headerValue = read2007and2010String(stream);
@@ -185,8 +185,8 @@ public class DWGParser extends AbstractP
}
}
- private String read2007and2010String(InputStream stream) throws IOException {
- int stringLen = LittleEndian.readUShort(stream);
+ private String read2007and2010String(InputStream stream) throws IOException, TikaException {
+ int stringLen = EndianUtils.readUShortLE(stream);
byte[] stringData = new byte[stringLen * 2];
IOUtils.readFully(stream, stringData);
@@ -202,11 +202,11 @@ public class DWGParser extends AbstractP
private void get2000Props(
InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, SAXException {
+ throws IOException, TikaException, SAXException {
int propCount = 0;
while(propCount < 30) {
- int propIdx = LittleEndian.readUShort(stream);
- int length = LittleEndian.readUShort(stream);
+ int propIdx = EndianUtils.readUShortLE(stream);
+ int length = EndianUtils.readUShortLE(stream);
int valueType = stream.read();
if(propIdx == 0x28) {
@@ -262,9 +262,9 @@ public class DWGParser extends AbstractP
* Grab the offset, then skip there
*/
private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
- throws IOException {
+ throws IOException, TikaException {
// The offset is stored in the header from 0x20 onwards
- long offsetToSection = LittleEndian.getLong(header, 0x20);
+ long offsetToSection = EndianUtils.getLongLE(header, 0x20);
long toSkip = offsetToSection - header.length;
if(offsetToSection == 0){
return false;
@@ -301,7 +301,7 @@ public class DWGParser extends AbstractP
}
private int skipToCustomProperties(InputStream stream)
- throws IOException {
+ throws IOException, TikaException {
// There should be 4 zero bytes next
byte[] padding = new byte[4];
IOUtils.readFully(stream, padding);
@@ -312,7 +312,7 @@ public class DWGParser extends AbstractP
IOUtils.readFully(stream, padding);
// We should now have the count
- int count = LittleEndian.readUShort(stream);
+ int count = EndianUtils.readUShortLE(stream);
// Sanity check it
if(count > 0 && count < 0x7f) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1180243&r1=1180242&r2=1180243&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java Fri Oct 7 21:05:22 2011
@@ -23,8 +23,8 @@ import java.util.Collections;
import java.util.Set;
import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.LittleEndian;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
@@ -135,7 +135,7 @@ public class PRTParser extends AbstractP
return;
}
- int length = LittleEndian.readUShort(stream);
+ int length = EndianUtils.readUShortLE(stream);
if(length <= MAX_SANE_TEXT_LENGTH) {
// Length sanity check passed
handleText(length, stream, xhtml);
@@ -146,7 +146,7 @@ public class PRTParser extends AbstractP
XHTMLContentHandler xhtml, Last5 l5)
throws IOException, SAXException, TikaException {
// Is it 8 byte zero padded?
- int maybeLength = LittleEndian.readUShort(stream);
+ int maybeLength = EndianUtils.readUShortLE(stream);
if(maybeLength == 0) {
// Check the next 6 bytes too
for(int i=0; i<6; i++) {
@@ -161,7 +161,7 @@ public class PRTParser extends AbstractP
byte[] b2 = new byte[2];
IOUtils.readFully(stream, b2);
- int length = LittleEndian.getUShort(b2);
+ int length = EndianUtils.getUShortLE(b2);
if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
// Length sanity check passed
handleText(length, stream, xhtml);