You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/10/05 15:49:02 UTC
svn commit: r1179225 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
Author: jukka
Date: Wed Oct 5 13:49:01 2011
New Revision: 1179225
URL: http://svn.apache.org/viewvc?rev=1179225&view=rev
Log:
TIKA-739: For certain DWG files, the Tika content parser outputs garbage
Also check the return value of skipToPropertyInfoSection() for the AC102x versions (AC1021 and AC1024), so that get2007and2010Props() is only called when it returns true.
Added extra DWG magic byte patterns from Wikipedia.
Also some whitespace cleanup.
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1179225&r1=1179224&r2=1179225&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Wed Oct 5 13:49:01 2011
@@ -3327,6 +3327,13 @@
<alias type="application/x-autocad"/>
<glob pattern="*.dwg"/>
<magic priority="50">
+ <match value="MC0.0" type="string" offset="0"/>
+ <match value="AC1.2" type="string" offset="0"/>
+ <match value="AC1.40" type="string" offset="0"/>
+ <match value="AC1.50" type="string" offset="0"/>
+ <match value="AC2.10" type="string" offset="0"/>
+ <match value="AC2.21" type="string" offset="0"/>
+ <match value="AC2.22" type="string" offset="0"/>
<!-- "AC" followed by four numbers -->
<match value="AC0000" type="string" offset="0"
mask="0xFFFFF0F0F0F0"/>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java?rev=1179225&r1=1179224&r2=1179225&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java Wed Oct 5 13:49:01 2011
@@ -58,6 +58,7 @@ public class DWGParser extends AbstractP
null, // Unknown?
Metadata.RELATION, // Hyperlink
};
+
/** For the 2000 file, they're indexed */
private static final String[] HEADER_2000_PROPERTIES_ENTRIES = {
null,
@@ -70,17 +71,19 @@ public class DWGParser extends AbstractP
Metadata.KEYWORDS, // 0x07
Metadata.LAST_AUTHOR, // 0x08
};
- private static final String HEADER_2000_PROPERTIES_MARKER_STR =
- "DWGPROPS COOKIE";
- private static final byte[] HEADER_2000_PROPERTIES_MARKER =
- new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
- static {
- StringUtil.putCompressedUnicode(
- HEADER_2000_PROPERTIES_MARKER_STR,
- HEADER_2000_PROPERTIES_MARKER, 0
- );
- }
-
+
+ private static final String HEADER_2000_PROPERTIES_MARKER_STR =
+ "DWGPROPS COOKIE";
+
+ private static final byte[] HEADER_2000_PROPERTIES_MARKER =
+ new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+
+ static {
+ StringUtil.putCompressedUnicode(
+ HEADER_2000_PROPERTIES_MARKER_STR,
+ HEADER_2000_PROPERTIES_MARKER, 0);
+ }
+
/**
* How far to skip after the last standard property, before
* we find any custom properties that might be there.
@@ -101,18 +104,19 @@ public class DWGParser extends AbstractP
if (version.equals("AC1015")) {
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if(skipTo2000PropertyInfoSection(stream, header)){
+ if (skipTo2000PropertyInfoSection(stream, header)) {
get2000Props(stream,metadata,xhtml);
}
} else if (version.equals("AC1018")) {
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if(skipToPropertyInfoSection(stream, header)){
+ if (skipToPropertyInfoSection(stream, header)) {
get2004Props(stream,metadata,xhtml);
}
} else if (version.equals("AC1021") || version.equals("AC1024")) {
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- skipToPropertyInfoSection(stream, header);
- get2007and2010Props(stream,metadata,xhtml);
+ if (skipToPropertyInfoSection(stream, header)) {
+ get2007and2010Props(stream,metadata,xhtml);
+ }
} else {
throw new TikaException(
"Unsupported AutoCAD drawing version: " + version);
@@ -132,7 +136,7 @@ public class DWGParser extends AbstractP
String headerValue = read2004String(stream);
handleHeader(i, headerValue, metadata, xhtml);
}
-
+
// Custom properties
int customCount = skipToCustomProperties(stream);
for (int i = 0; i < customCount; i++) {
@@ -143,6 +147,7 @@ public class DWGParser extends AbstractP
}
}
}
+
private String read2004String(InputStream stream) throws IOException {
int stringLen = LittleEndian.readUShort(stream);
@@ -168,7 +173,7 @@ public class DWGParser extends AbstractP
String headerValue = read2007and2010String(stream);
handleHeader(i, headerValue, metadata, xhtml);
}
-
+
// Custom properties
int customCount = skipToCustomProperties(stream);
for (int i = 0; i < customCount; i++) {
@@ -179,13 +184,14 @@ public class DWGParser extends AbstractP
}
}
}
+
private String read2007and2010String(InputStream stream) throws IOException {
int stringLen = LittleEndian.readUShort(stream);
byte[] stringData = new byte[stringLen * 2];
IOUtils.readFully(stream, stringData);
String value = StringUtil.getFromUnicodeLE(stringData);
-
+
// Some strings are null terminated
if(value.charAt(value.length()-1) == 0) {
value = value.substring(0, value.length()-1);
@@ -210,7 +216,7 @@ public class DWGParser extends AbstractP
// We think this means the end of properties
break;
}
-
+
byte[] value = new byte[length];
IOUtils.readFully(stream, value);
if(valueType == 0x1e) {
@@ -236,7 +242,7 @@ public class DWGParser extends AbstractP
propCount++;
}
}
-
+
private void handleHeader(
int headerNumber, String value, Metadata metadata,
XHTMLContentHandler xhtml) throws SAXException {
@@ -270,6 +276,7 @@ public class DWGParser extends AbstractP
}
return true;
}
+
/**
* We think it can be anywhere...
*/
@@ -292,6 +299,7 @@ public class DWGParser extends AbstractP
}
return false;
}
+
private int skipToCustomProperties(InputStream stream)
throws IOException {
// There should be 4 zero bytes next