You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/26 12:24:27 UTC
[tika] 04/04: TIKA-3731 -- expand metadata extraction for DWG AC1027 and AC1032; add prefix for custom metadata
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 079db8d8286d681dd05568b532e11fbf02f23fd0
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 08:24:05 2022 -0400
TIKA-3731 -- expand metadata extraction for DWG AC1027 and AC1032; add prefix for custom metadata
---
CHANGES.txt | 2 ++
.../java/org/apache/tika/parser/dwg/DWGParser.java | 10 +++++----
.../org/apache/tika/parser/dwg/DWGParserTest.java | 25 +++++++++++++++++----
.../resources/test-documents/testDWG-AC1027.dwg | Bin 0 -> 265260 bytes
.../resources/test-documents/testDWG-AC1032.dwg | Bin 0 -> 158593 bytes
5 files changed, 29 insertions(+), 8 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index d78cc9351..547383cca 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,8 @@ Release 2.4.0 - ???
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
for the dependencies that must be provided at run-time (TIKA-3676).
+ * NOTE: Added prefix "dwg-custom:" to DWG custom metadata properties (TIKA-3731).
+
* Add initial, BETA-grade TLS encryption option for tika-server;
configuration may change in future releases (TIKA-3719).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 385418c7e..4519623fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -43,7 +43,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
* lots of the low level string/int/short concepts are the same.
*/
public class DWGParser extends AbstractParser {
-
+ public static String DWG_CUSTOM_META_PREFIX = "dwg-custom:";
/**
* Serial version UID
*/
@@ -115,6 +115,8 @@ public class DWGParser extends AbstractParser {
get2004Props(stream, metadata, xhtml);
}
break;
+ case "AC1027":
+ case "AC1032":
case "AC1021":
case "AC1024":
metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
@@ -146,7 +148,7 @@ public class DWGParser extends AbstractParser {
String propName = read2004String(stream);
String propValue = read2004String(stream);
if (propName.length() > 0 && propValue.length() > 0) {
- metadata.add(propName, propValue);
+ metadata.add(DWG_CUSTOM_META_PREFIX + propName, propValue);
}
}
}
@@ -182,7 +184,7 @@ public class DWGParser extends AbstractParser {
String propName = read2007and2010String(stream);
String propValue = read2007and2010String(stream);
if (propName.length() > 0 && propValue.length() > 0) {
- metadata.add(propName, propValue);
+ metadata.add(DWG_CUSTOM_META_PREFIX + propName, propValue);
}
}
}
@@ -233,7 +235,7 @@ public class DWGParser extends AbstractParser {
if (splitAt > -1) {
String propName = val.substring(0, splitAt);
String propVal = val.substring(splitAt + 1);
- metadata.add(propName, propVal);
+ metadata.add(DWGParser.DWG_CUSTOM_META_PREFIX + propName, propVal);
}
}
} else {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index f28019f0b..88807b087 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -26,11 +26,12 @@ import java.util.Arrays;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
-public class DWGParserTest {
+public class DWGParserTest extends TikaTest {
@Test
public void testDWG2000Parser() throws Exception {
@@ -81,8 +82,10 @@ public class DWGParserTest {
ContentHandler handler = new BodyContentHandler();
new DWGParser().parse(input, handler, metadata, null);
- assertEquals("valueforcustomprop1", metadata.get("customprop1"));
- assertEquals("valueforcustomprop2", metadata.get("customprop2"));
+ assertEquals("valueforcustomprop1",
+ metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop1"));
+ assertEquals("valueforcustomprop2",
+ metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop2"));
}
}
@@ -165,7 +168,8 @@ public class DWGParserTest {
assertEquals("This is a comment", metadata.get(TikaCoreProperties.COMMENTS));
assertEquals("bejanpol", metadata.get(TikaCoreProperties.MODIFIER));
assertEquals("http://mycompany/drawings", metadata.get(TikaCoreProperties.RELATION));
- assertEquals("MyCustomPropertyValue", metadata.get("MyCustomProperty"));
+ assertEquals("MyCustomPropertyValue",
+ metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "MyCustomProperty"));
String content = handler.toString();
assertContains("This is a comment", content);
@@ -174,4 +178,17 @@ public class DWGParserTest {
input.close();
}
}
+
+ @Test
+ public void testAC1027() throws Exception {
+ Metadata metadata = getXML("testDWG-AC1027.dwg").metadata;
+ assertEquals("hlu", metadata.get(TikaCoreProperties.MODIFIER));
+ }
+
+ @Test
+ public void testAC1032() throws Exception {
+ Metadata metadata = getXML("testDWG-AC1032.dwg").metadata;
+ assertEquals("jlakshvi", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("CUSTOMER'S ADDRESS", metadata.get("dwg-custom:CUSTOMER'S ADDRESS"));
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg
new file mode 100644
index 000000000..9409e3fec
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1027.dwg differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg
new file mode 100644
index 000000000..5e644bf21
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDWG-AC1032.dwg differ