You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/03/19 00:16:03 UTC

svn commit: r1083104 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/dwg/ test/java/org/apache/tika/parser/dwg/ test/resources/test-documents/

Author: nick
Date: Fri Mar 18 23:16:02 2011
New Revision: 1083104

URL: http://svn.apache.org/viewvc?rev=1083104&view=rev
Log:
TIKA-592 - Support AutoCAD DWG files from AutoCAD 2000 (version 1015), and add Custom Properties support across all versions. Also adds unit tests for various other "versions" (where the file format doesn't seem to have changed even if the product version has).

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWG2000.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2004.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2004DX.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2005.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2006.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2007.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2008.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2009.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2010.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2011.dwg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech6.dwg   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java?rev=1083104&r1=1083103&r2=1083104&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java Fri Mar 18 23:16:02 2011
@@ -18,6 +18,7 @@ package org.apache.tika.parser.dwg;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.Charset;
 import java.util.Collections;
 import java.util.Set;
 
@@ -58,6 +59,26 @@ public class DWGParser implements Parser
         null, // Unknown?
         Metadata.RELATION, // Hyperlink
     };
+    /** For the 2000 file, they're indexed */
+    private static String[] HEADER_2000_PROPERTIES_ENTRIES = {
+       null, 
+       Metadata.RELATION, // 0x01
+       Metadata.TITLE,    // 0x02
+       Metadata.SUBJECT,  // 0x03
+       Metadata.AUTHOR,   // 0x04
+       null,
+       Metadata.COMMENTS, // 0x06 
+       Metadata.KEYWORDS, // 0x07
+       Metadata.LAST_AUTHOR, // 0x08
+   };
+    private static byte[] HEADER_2000_PROPERTIES_MARKER = 
+       "DWGPROPS COOKIE".getBytes(Charset.forName("ASCII"));
+    
+    /** 
+     * How far to skip after the last standard property, before
+     *  we find any custom properties that might be there.
+     */
+    private static final int CUSTOM_PROPERTIES_SKIP = 20; 
 
     public void parse(
             InputStream stream, ContentHandler handler,
@@ -71,7 +92,12 @@ public class DWGParser implements Parser
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 
-        if (version.equals("AC1018")) {
+        if (version.equals("AC1015")) {
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            if(skipTo2000PropertyInfoSection(stream, header)){
+                get2000Props(stream,metadata,xhtml);
+            }
+        } else if (version.equals("AC1018")) {
             metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
             if(skipToPropertyInfoSection(stream, header)){
                 get2004Props(stream,metadata,xhtml);
@@ -94,21 +120,34 @@ public class DWGParser implements Parser
     private void get2004Props(
             InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
             throws IOException, SAXException {
+       // Standard properties
         for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
-            int stringLen = LittleEndian.readUShort(stream);
-
-            byte[] stringData = new byte[stringLen];
-            IOUtils.readFully(stream, stringData);
-
-            // Often but not always null terminated
-            if (stringData[stringLen-1] == 0) {
-                stringLen--;
-            }
-            String headerValue =
-                StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
-
+            String headerValue = read2004String(stream);
             handleHeader(i, headerValue, metadata, xhtml);
         }
+        
+        // Custom properties
+        int customCount = skipToCustomProperties(stream);
+        for (int i = 0; i < customCount; i++) {
+           String propName = read2004String(stream);
+           String propValue = read2004String(stream);
+           if(propName.length() > 0 && propValue.length() > 0) {
+              metadata.add(propName, propValue);
+           }
+        }
+    }
+    /** Reads a string stored as a 16 bit byte count followed by that many bytes */
+    private String read2004String(InputStream stream) throws IOException {
+       int stringLen = LittleEndian.readUShort(stream);
+       byte[] stringData = new byte[stringLen];
+       IOUtils.readFully(stream, stringData);
+
+       // Often but not always null terminated. Check the length first, so
+       //  that a zero length string doesn't index before the array start
+       if (stringLen > 0 && stringData[stringLen-1] == 0) {
+           stringLen--;
+       }
+       return StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
     }
 
     /**
@@ -117,17 +156,80 @@ public class DWGParser implements Parser
     private void get2007and2010Props(
             InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
             throws IOException, SAXException {
+        // Standard properties
         for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
-            int stringLen = LittleEndian.readUShort(stream);
-
-            byte[] stringData = new byte[stringLen * 2];
-            IOUtils.readFully(stream, stringData);
-            String headerValue = StringUtil.getFromUnicodeLE(stringData);
-
+            String headerValue = read2007and2010String(stream);
             handleHeader(i, headerValue, metadata, xhtml);
         }
+        
+        // Custom properties
+        int customCount = skipToCustomProperties(stream);
+        for (int i = 0; i < customCount; i++) {
+           String propName = read2007and2010String(stream);
+           String propValue = read2007and2010String(stream);
+           if(propName.length() > 0 && propValue.length() > 0) {
+              metadata.add(propName, propValue);
+           }
+        }
+    }
+    /** Reads a string stored as a 16 bit count followed by two bytes per count */
+    private String read2007and2010String(InputStream stream) throws IOException {
+       int stringLen = LittleEndian.readUShort(stream);
+       byte[] stringData = new byte[stringLen * 2];
+       IOUtils.readFully(stream, stringData);
+       String value = StringUtil.getFromUnicodeLE(stringData);
+
+       // Some strings are null terminated. A zero length string is possible
+       //  too, so check the length before looking at the last character
+       if(value.length() > 0 && value.charAt(value.length()-1) == 0) {
+           value = value.substring(0, value.length()-1);
+       }
+       return value;
     }
 
+    /** Reads up to 30 indexed properties of an AC1015 file, stopping early at index 90 */
+    private void get2000Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, SAXException {
+        for(int propCount = 0; propCount < 30; propCount++) {
+            int propIdx = LittleEndian.readUShort(stream);
+            int length = LittleEndian.readUShort(stream);
+            int valueType = stream.read();
+
+            if(propIdx == 0x28) {
+               // This one seems not to follow the pattern
+               length = 0x19;
+            } else if(propIdx == 90) {
+               // We think this means the end of properties
+               break;
+            }
+
+            byte[] value = new byte[length];
+            IOUtils.readFully(stream, value);
+            if(valueType != 0x1e) {
+                // Not a normal string, no idea what it is. Skip over it
+                continue;
+            }
+            String val = StringUtil.getFromCompressedUnicode(value, 0, length);
+
+            // Is it one we can look up by index?
+            if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
+               // Some table slots are null (no mapping), never add a null key
+               String key = HEADER_2000_PROPERTIES_ENTRIES[propIdx];
+               if(key != null) {
+                  metadata.add(key, val);
+               }
+               xhtml.element("p", val);
+            } else if(propIdx == 0x012c) {
+               // Looks to be a custom property, held as one name=value string
+               int splitAt = val.indexOf('=');
+               if(splitAt > -1) {
+                  metadata.add(val.substring(0, splitAt), val.substring(splitAt+1));
+               }
+            }
+        }
+    }
+    
     private void handleHeader(
             int headerNumber, String value, Metadata metadata,
             XHTMLContentHandler xhtml) throws SAXException {
@@ -135,11 +237,6 @@ public class DWGParser implements Parser
             return;
         }
 
-        // Some strings are null terminated
-        if(value.charAt(value.length()-1) == 0) {
-            value = value.substring(0, value.length()-1);
-        }
-
         String headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
         if(headerProp != null) {
             metadata.set(headerProp, value);
@@ -148,6 +245,9 @@ public class DWGParser implements Parser
         xhtml.element("p", value);
     }
 
+    /**
+     * Grab the offset, then skip there
+     */
     private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
             throws IOException {
         // The offset is stored in the header from 0x20 onwards
@@ -163,6 +263,55 @@ public class DWGParser implements Parser
         }
         return true;
     }
+    /**
+     * Scans forward for the properties marker, which we think can be anywhere.
+     * Returns true once the marker has been read, false if the stream ends first
+     */
+    private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
+            throws IOException {
+       // The marker's first byte doesn't occur again later in the marker, so
+       //  after a mismatch only the mismatching byte can begin a new match
+       int val, matched = 0;
+       while((val = stream.read()) != -1) {
+          if(val == HEADER_2000_PROPERTIES_MARKER[matched]) {
+             matched++;
+             if(matched == HEADER_2000_PROPERTIES_MARKER.length) {
+                // Bingo, found it
+                return true;
+             }
+          } else {
+             matched = (val == HEADER_2000_PROPERTIES_MARKER[0]) ? 1 : 0;
+          }
+       }
+       return false;
+    }
+    /**
+     * Skips on from the end of the standard properties to the custom ones.
+     * Returns the number of custom properties, or 0 if there don't seem
+     *  to be any (or the count found can't be trusted)
+     */
+    private int skipToCustomProperties(InputStream stream) 
+            throws IOException {
+       // There should be 4 zero bytes next
+       byte[] padding = new byte[4];
+       IOUtils.readFully(stream, padding);
+       for(byte b : padding) {
+          if(b != 0) {
+             // No padding. That probably means no custom props
+             return 0;
+          }
+       }
+
+       // Looks hopeful, skip on
+       IOUtils.readFully(stream, new byte[CUSTOM_PROPERTIES_SKIP]);
+
+       // We should now have the count
+       int count = LittleEndian.readUShort(stream);
+
+       // Sanity check it: zero means no properties, and anything
+       //  from 0x7f upwards is too high to trust
+       return (count > 0 && count < 0x7f) ? count : 0;
+    }
 
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java?rev=1083104&r1=1083103&r2=1083104&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java Fri Mar 18 23:16:02 2011
@@ -25,6 +25,11 @@ import org.apache.tika.sax.BodyContentHa
 import org.xml.sax.ContentHandler;
 
 public class DWGParserTest extends TestCase {
+    /** Runs the common property checks against the AutoCAD 2000 sample file */
+    public void testDWG2000Parser() throws Exception {
+        testParserAlt(DWGParserTest.class.getResourceAsStream(
+                "/test-documents/testDWG2000.dwg"));
+    }
 
     public void testDWG2004Parser() throws Exception {
         InputStream input = DWGParserTest.class.getResourceAsStream(
@@ -50,6 +55,18 @@ public class DWGParserTest extends TestC
         testParser(input);
     }
 
+    /** Runs the same property checks over each of the "mech" sample files */
+    public void testDWGMechParser() throws Exception {
+        String[] versions = {
+              "6", "2004", "2004DX", "2005", "2006",
+              "2007", "2008", "2009", "2010", "2011"
+        };
+        for (int i = 0; i < versions.length; i++) {
+           String resource = "/test-documents/testDWGmech"+versions[i]+".dwg";
+           testParserAlt(DWGParserTest.class.getResourceAsStream(resource));
+        }
+    }
+
     private void testParser(InputStream input) throws Exception {
         try {
             Metadata metadata = new Metadata();
@@ -101,4 +118,37 @@ public class DWGParserTest extends TestC
             input.close();
         }
     }
+
+    /**
+     * Parses the file, then checks the content type, the standard properties,
+     *  the one custom property and the text. Always closes the stream.
+     */
+    private void testParserAlt(InputStream input) throws Exception {
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DWGParser().parse(input, handler, metadata);
+
+            String[][] expected = {
+                { Metadata.CONTENT_TYPE, "image/vnd.dwg" },
+                { Metadata.TITLE, "Test Title" },
+                { Metadata.SUBJECT, "Test Subject" },
+                { Metadata.AUTHOR, "My Author" },
+                { Metadata.KEYWORDS, "My keyword1, MyKeyword2" },
+                { Metadata.COMMENTS, "This is a comment" },
+                { Metadata.LAST_AUTHOR, "bejanpol" },
+                { Metadata.RELATION, "http://mycompany/drawings" },
+                { "MyCustomProperty", "MyCustomPropertyValue" },
+            };
+            for (String[] entry : expected) {
+                assertEquals(entry[1], metadata.get(entry[0]));
+            }
+
+            String text = handler.toString();
+            assertTrue(text.contains("This is a comment"));
+            assertTrue(text.contains("mycompany"));
+        } finally {
+            input.close();
+        }
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWG2000.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWG2000.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWG2000.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2004.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2004.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2004.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2004DX.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2004DX.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2004DX.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2005.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2005.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2005.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2006.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2006.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2006.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2007.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2007.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2007.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2008.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2008.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2008.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2009.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2009.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2009.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2010.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2010.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2010.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2011.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2011.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech2011.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech6.dwg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech6.dwg?rev=1083104&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDWGmech6.dwg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream