You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/18 16:48:27 UTC

[tika] branch main updated: TIKA-3308 -- add detection for svg files that lack the xml header (#808)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 0145868ab TIKA-3308 -- add detection for svg files that lack the xml header (#808)
0145868ab is described below

commit 0145868ab5c1f2718dc3267e50737d22effb3ce6
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Nov 18 11:48:21 2022 -0500

    TIKA-3308 -- add detection for svg files that lack the xml header (#808)
---
 CHANGES.txt                                                         | 2 ++
 .../src/main/resources/org/apache/tika/mime/tika-mimetypes.xml      | 6 ++++++
 .../src/test/resources/test-documents/testSVG_no_xml_header.svg     | 4 ++++
 .../src/test/java/org/apache/tika/mime/TestMimeTypes.java           | 1 +
 4 files changed, 13 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index af854c029..5a355a3b8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.6.1 - ???
 
+   * Add SVG detection for svg files lacking the xml header (TIKA-3308).
+
    * Upgrade to Bouncy Castle 1.71 and jdk18on jars (TIKA-3933).
 
    * Add a JDBCPipesReporter (TIKA-3931).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 2baa84d0e..4d1347c93 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5827,6 +5827,12 @@
     <acronym>SVG</acronym>
     <_comment>Scalable Vector Graphics</_comment>
     <root-XML localName="svg" namespaceURI="http://www.w3.org/2000/svg"/>
+    <magic priority="50">
+      <!-- SVG lacking the XML declaration: svg root tag at offset 0, then the SVG namespace URI starting at an offset from 5 to 256 -->
+      <match value="&lt;svg" type="string" offset="0">
+        <match value="http://www.w3.org/2000/svg" type="string" offset="5:256"/>
+      </match>
+    </magic>
     <glob pattern="*.svg"/>
     <glob pattern="*.svgz"/>
   </mime-type>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg
new file mode 100644
index 000000000..0e53461be
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg
@@ -0,0 +1,4 @@
+<svg width="1cm" height="1cm" version="1.1" xmlns="http://www.w3.org/2000/svg">
+    <desc>Test SVG image</desc>
+    <rect x="0.1cm" y="0.1cm" width="0.8cm" height="0.8cm"/>
+</svg>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 1f475f2bd..d14c5eb9b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -607,6 +607,7 @@ public class TestMimeTypes {
         assertTypeByData("image/svg+xml", "testSVG.svg");
         assertTypeByName("image/svg+xml", "x.svg");
         assertTypeByName("image/svg+xml", "x.SVG");
+        assertTypeByData("image/svg+xml", "testSVG_no_xml_header.svg");
 
         // Should *.svgz be svg or gzip
         assertType("application/gzip", "testSVG.svgz");