You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/18 15:21:27 UTC

[tika] 01/01: TIKA-3308 -- add detection for svg files that lack the xml header

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3308
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 04341240651f12bd63585ef3b4dc78df83630c7c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Nov 18 10:21:11 2022 -0500

    TIKA-3308 -- add detection for svg files that lack the xml header
---
 CHANGES.txt                                                         | 2 ++
 .../src/main/resources/org/apache/tika/mime/tika-mimetypes.xml      | 6 ++++++
 .../src/test/resources/test-documents/testSVG_no_xml_header.svg     | 4 ++++
 .../src/test/java/org/apache/tika/mime/TestMimeTypes.java           | 1 +
 4 files changed, 13 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 54d14214e..85c7796a6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.6.1 - ???
 
+   * Add SVG detection for svg files lacking the xml header (TIKA-3308).
+
    * Add a JDBCPipesReporter (TIKA-3931).
 
    * Add multivalued field strategy option in jdbc-emitter (TIKA-3930).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 2baa84d0e..4d1347c93 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5827,6 +5827,12 @@
     <acronym>SVG</acronym>
     <_comment>Scalable Vector Graphics</_comment>
     <root-XML localName="svg" namespaceURI="http://www.w3.org/2000/svg"/>
+    <magic priority="50">
+      <!-- SVG without an XML declaration: root svg tag at offset 0, then the SVG namespace URI within the first 256 bytes -->
+      <match value="&lt;svg" type="string" offset="0">
+        <match value="http://www.w3.org/2000/svg" type="string" offset="5:256"/>
+      </match>
+    </magic>
     <glob pattern="*.svg"/>
     <glob pattern="*.svgz"/>
   </mime-type>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg
new file mode 100644
index 000000000..0e53461be
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg
@@ -0,0 +1,4 @@
+<svg width="1cm" height="1cm" version="1.1" xmlns="http://www.w3.org/2000/svg">
+    <desc>Test SVG image</desc>
+    <rect x="0.1cm" y="0.1cm" width="0.8cm" height="0.8cm"/>
+</svg>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 1f475f2bd..d14c5eb9b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -607,6 +607,7 @@ public class TestMimeTypes {
         assertTypeByData("image/svg+xml", "testSVG.svg");
         assertTypeByName("image/svg+xml", "x.svg");
         assertTypeByName("image/svg+xml", "x.SVG");
+        assertTypeByData("image/svg+xml", "testSVG_no_xml_header.svg");
 
         // Should *.svgz be svg or gzip
         assertType("application/gzip", "testSVG.svgz");