You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/31 13:45:23 UTC

(tika) branch main updated: TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction (#1567)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new d9289fd46 TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction (#1567)
d9289fd46 is described below

commit d9289fd46e9619c7900086eb6572040984a7754a
Author: Tim Allison <ta...@apache.org>
AuthorDate: Wed Jan 31 08:45:18 2024 -0500

    TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction (#1567)
---
 CHANGES.txt                                        |   2 +
 .../org/apache/tika/mime/tika-mimetypes.xml        | 105 ++++++++++++++++++++-
 .../tika/parser/sqlite3/SQLite3DBParser.java       |  32 +++++++
 .../apache/tika/parser/sqlite3/SQLite3Parser.java  |  15 +++
 .../tika/parser/sqlite3/SQLite3ParserTest.java     |   4 +
 .../apache/tika/parser/jdbc/AbstractDBParser.java  |  14 +++
 6 files changed, 171 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index f9ac540e6..163753e9b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -21,6 +21,8 @@ Release 3.0.0-BETA - 12/01/2023
 
    Other Changes/Updates
 
+   * Improve detection of sqlite3-based file formats (TIKA-4187).
+
    * Upgrade PDFBox to 3.0.1 (TIKA-3347)
    
    * Deprecated AbstractParser for removal in 4.x (TIKA-4132).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 54f7cc6f6..2930fa720 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4858,11 +4858,114 @@
   </mime-type>
 
   <mime-type type="application/x-sqlite3">
+    <!--sqlite subtypes are from: https://www.sqlite.org/src/artifact?ci=trunk&filename=magic.txt -->
     <magic priority="50">
       <match value="SQLite format 3\x00" type="string" offset="0"/>
     </magic>
   </mime-type>
-
+  <!-- geo package >= 1.2.x -->
+  <mime-type type="application/x-geopackage">
+    <url>https://www.geopackage.org/</url>
+    <!-- should be higher than application/x-sqlite3 -->
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x47504b47" type="string" offset="68"/>
+      </match>
+    </magic>
+    <glob pattern="*.gpkg"/>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <!-- geo package 1.0.x or 1.1.x -->
+  <mime-type type="application/x-geopackage; version=1.1Or1.0">
+    <url>https://www.geopackage.org/</url>
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x47503130" type="string" offset="68"/>
+      </match>
+    </magic>
+    <glob pattern="*.gpkg"/>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-fossil-checkout">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x0f055112" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-fossil-checkout">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x0f055112" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-fossil-global-conf">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x0f055113" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-fossil-repository">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x0f055111" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-bentley-besqlite">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x42654462" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-bentley-localization">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x42654c6e" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-monotone-source-repo">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x5f4d544e" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-esri-spatially-enabled-db">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x45737269" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-mbtiles">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x4d504258" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-texnicard">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x6a035744" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
   <mime-type type="application/x-stata-do">
     <_comment>Stata DTA Script</_comment>
     <acronym>DO</acronym>
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index f4c9d745e..947272a0a 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -27,6 +27,7 @@ import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.sqlite.SQLiteConfig;
@@ -34,6 +35,7 @@ import org.sqlite.SQLiteConfig;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.jdbc.AbstractDBParser;
@@ -48,6 +50,12 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
 class SQLite3DBParser extends AbstractDBParser {
 
     protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
+
+    protected static final Map<Property, String> METADATA_KEYS = Map.of(
+            SQLite3Parser.SQLITE_APPLICATION_ID, "select application_id from pragma_application_id",
+            SQLite3Parser.SQLITE_USER_VERSION, "select user_version from pragma_user_version"
+    );
+
     //If the InputStream wasn't a TikaInputStream, copy to this tmp file
     Path tmpFile = null;
 
@@ -144,4 +152,28 @@ class SQLite3DBParser extends AbstractDBParser {
                                              EmbeddedDocumentUtil embeddedDocumentUtil) {
         return new SQLite3TableReader(connection, tableName, embeddedDocumentUtil);
     }
+
+    @Override
+    protected void extractMetadata(Connection connection, Metadata metadata) {
+        //TODO -- figure out how to get the version of sqlite3 that last modified this file and
+        // version-valid-for.
+        // version-valid-for is at offset 92, last modified by app version is at offset 96 --
+        // not clear how to get this info via sql
+        //'file' extracts this info; we should too :\
+        //See: https://www.sqlite.org/fileformat.html
+        for (Map.Entry<Property, String> e : METADATA_KEYS.entrySet()) {
+            try (Statement st = connection.createStatement()) {
+                try (ResultSet rs = st.executeQuery(e.getValue())) {
+                    if (rs.next()) {
+                        int val = rs.getInt(1);
+                        if (! rs.wasNull()) {
+                            metadata.set(e.getKey(), Integer.toString(val, 16));
+                        }
+                    }
+                }
+            } catch (SQLException ex) {
+                //swallow -- best effort: skip this key and move on to the next query
+            }
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
index 1bd18916e..34aab4d10 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
@@ -32,6 +32,7 @@ import org.apache.tika.config.Param;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -54,6 +55,20 @@ import org.apache.tika.parser.Parser;
  */
 public class SQLite3Parser implements Parser, Initializable {
 
+    public static final String SQLITE3_PREFIX = "sqlite3:";
+
+    /**
+     * Base16 encoded integer representing the "application id"
+     */
+    public static final Property SQLITE_APPLICATION_ID =
+            Property.internalText(SQLITE3_PREFIX + "application_id");
+
+    /**
+     * Base16 encoded integer representing the "user version"
+     */
+    public static final Property SQLITE_USER_VERSION =
+            Property.internalText(SQLITE3_PREFIX + "user_version");
+
     /**
      * Serial version UID
      */
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
index 6b8e5007a..49dfcd0ba 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
@@ -17,6 +17,8 @@
 package org.apache.tika.parser.sqlite3;
 
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 import java.io.InputStream;
 
 import org.junit.jupiter.api.Test;
@@ -46,6 +48,8 @@ public class SQLite3ParserTest extends TikaTest {
         String s = handler.toString();
         TikaTest.assertContains("0\t2.3\t2.4\tlorem", s);
         TikaTest.assertContains("tempor\n", s);
+        assertEquals("0", metadata.get(SQLite3Parser.SQLITE_APPLICATION_ID));
+        assertEquals("0", metadata.get(SQLite3Parser.SQLITE_USER_VERSION));
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
index c2af83e63..bd469f134 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
@@ -78,6 +78,9 @@ public abstract class AbstractDBParser implements Parser {
             //add table names to parent metadata
             metadata.add(Database.TABLE_NAME, tableName);
         }
+
+        extractMetadata(connection, metadata);
+
         xHandler = new XHTMLContentHandler(handler, metadata);
         xHandler.startDocument();
 
@@ -114,6 +117,17 @@ public abstract class AbstractDBParser implements Parser {
         }
     }
 
+    /**
+     * This is called before parsing the tables to extract metadata from the db, if any.
+     * Override this for db specific metadata. This implementation is a no-op.
+     *
+     * @param connection connection to the db being parsed
+     * @param metadata metadata object to which db-level values should be added
+     */
+    protected void extractMetadata(Connection connection, Metadata metadata) {
+
+    }
+
     /**
      * Override this for any special handling of closing the connection.
      *