You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/26 12:24:27 UTC

[tika] 04/04: TIKA-3731 -- expand metadata extraction for DWG AC1027 and AC1032; add prefix for custom metadata

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 079db8d8286d681dd05568b532e11fbf02f23fd0
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 08:24:05 2022 -0400

    TIKA-3731 -- expand metadata extraction for DWG AC1027 and AC1032; add prefix for custom metadata
---
 CHANGES.txt                                        |   2 ++
 .../java/org/apache/tika/parser/dwg/DWGParser.java |  10 +++++----
 .../org/apache/tika/parser/dwg/DWGParserTest.java  |  25 +++++++++++++++++----
 .../resources/test-documents/testDWG-AC1027.dwg    | Bin 0 -> 265260 bytes
 .../resources/test-documents/testDWG-AC1032.dwg    | Bin 0 -> 158593 bytes
 5 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index d78cc9351..547383cca 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,8 @@ Release 2.4.0 - ???
      https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
      for the dependencies that must be provided at run-time (TIKA-3676).
 
+   * NOTE: Added prefix "dwg-custom:" to DWG custom metadata properties (TIKA-3731).
+
    * Add initial, BETA-grade TLS encryption option for tika-server;
      configuration may change in future releases (TIKA-3719).
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 385418c7e..4519623fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -43,7 +43,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
  * lots of the low level string/int/short concepts are the same.
  */
 public class DWGParser extends AbstractParser {
-
+    public static final String DWG_CUSTOM_META_PREFIX = "dwg-custom:"; // custom metadata key prefix
     /**
      * Serial version UID
      */
@@ -115,6 +115,8 @@ public class DWGParser extends AbstractParser {
                     get2004Props(stream, metadata, xhtml);
                 }
                 break;
+            case "AC1027":
+            case "AC1032":
             case "AC1021":
             case "AC1024":
                 metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
@@ -146,7 +148,7 @@ public class DWGParser extends AbstractParser {
             String propName = read2004String(stream);
             String propValue = read2004String(stream);
             if (propName.length() > 0 && propValue.length() > 0) {
-                metadata.add(propName, propValue);
+                metadata.add(DWG_CUSTOM_META_PREFIX + propName, propValue);
             }
         }
     }
@@ -182,7 +184,7 @@ public class DWGParser extends AbstractParser {
             String propName = read2007and2010String(stream);
             String propValue = read2007and2010String(stream);
             if (propName.length() > 0 && propValue.length() > 0) {
-                metadata.add(propName, propValue);
+                metadata.add(DWG_CUSTOM_META_PREFIX + propName, propValue);
             }
         }
     }
@@ -233,7 +235,7 @@ public class DWGParser extends AbstractParser {
                     if (splitAt > -1) {
                         String propName = val.substring(0, splitAt);
                         String propVal = val.substring(splitAt + 1);
-                        metadata.add(propName, propVal);
+                        metadata.add(DWGParser.DWG_CUSTOM_META_PREFIX + propName, propVal);
                     }
                 }
             } else {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index f28019f0b..88807b087 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -26,11 +26,12 @@ import java.util.Arrays;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.BodyContentHandler;
 
-public class DWGParserTest {
+public class DWGParserTest extends TikaTest {
 
     @Test
     public void testDWG2000Parser() throws Exception {
@@ -81,8 +82,10 @@ public class DWGParserTest {
             ContentHandler handler = new BodyContentHandler();
             new DWGParser().parse(input, handler, metadata, null);
 
-            assertEquals("valueforcustomprop1", metadata.get("customprop1"));
-            assertEquals("valueforcustomprop2", metadata.get("customprop2"));
+            assertEquals("valueforcustomprop1",
+                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop1"));
+            assertEquals("valueforcustomprop2",
+                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop2"));
         }
     }
 
@@ -165,7 +168,8 @@ public class DWGParserTest {
             assertEquals("This is a comment", metadata.get(TikaCoreProperties.COMMENTS));
             assertEquals("bejanpol", metadata.get(TikaCoreProperties.MODIFIER));
             assertEquals("http://mycompany/drawings", metadata.get(TikaCoreProperties.RELATION));
-            assertEquals("MyCustomPropertyValue", metadata.get("MyCustomProperty"));
+            assertEquals("MyCustomPropertyValue",
+                    metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "MyCustomProperty"));
 
             String content = handler.toString();
             assertContains("This is a comment", content);
@@ -174,4 +178,17 @@ public class DWGParserTest {
             input.close();
         }
     }
+
+    @Test
+    public void testAC1027() throws Exception {
+        Metadata metadata = getXML("testDWG-AC1027.dwg").metadata;
+        assertEquals("hlu", metadata.get(TikaCoreProperties.MODIFIER));
+    }
+
+    @Test
+    public void testAC1032() throws Exception {
+        Metadata metadata = getXML("testDWG-AC1032.dwg").metadata;
+        assertEquals("jlakshvi", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("CUSTOMER'S ADDRESS", metadata.get("dwg-custom:CUSTOMER'S ADDRESS"));
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg
new file mode 100644
index 000000000..9409e3fec
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg
new file mode 100644
index 000000000..5e644bf21
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg differ