You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/30 17:23:20 UTC

(tika) branch TIKA-4187 created (now 73694d21a)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4187
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 73694d21a TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction

This branch includes the following new commits:

     new 73694d21a TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



(tika) 01/01: TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4187
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 73694d21ab19e7e1134ee4f2bf8b76e8c35387bf
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jan 30 12:22:59 2024 -0500

    TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction
---
 CHANGES.txt                                        |   2 +
 .../org/apache/tika/mime/tika-mimetypes.xml        | 105 ++++++++++++++++++++-
 .../tika/parser/sqlite3/SQLite3DBParser.java       |  32 +++++++
 .../apache/tika/parser/sqlite3/SQLite3Parser.java  |  15 +++
 .../tika/parser/sqlite3/SQLite3ParserTest.java     |   4 +
 .../apache/tika/parser/jdbc/AbstractDBParser.java  |  14 +++
 6 files changed, 171 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index f9ac540e6..163753e9b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -21,6 +21,8 @@ Release 3.0.0-BETA - 12/01/2023
 
    Other Changes/Updates
 
+   * Improve detection of sqlite3-based file formats (TIKA-4187).
+
    * Upgrade PDFBox to 3.0.1 (TIKA-3347)
    
    * Deprecated AbstractParser for removal in 4.x (TIKA-4132).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 54f7cc6f6..2930fa720 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4858,11 +4858,114 @@
   </mime-type>
 
   <mime-type type="application/x-sqlite3">
+    <!--sqlite subtypes are from: https://www.sqlite.org/src/artifact?ci=trunk&filename=magic.txt -->
     <magic priority="50">
       <match value="SQLite format 3\x00" type="string" offset="0"/>
     </magic>
   </mime-type>
-
+  <!-- geo package >= 1.2.x -->
+  <mime-type type="application/x-geopackage">
+    <url>https://www.geopackage.org/</url>
+    <!-- should be higher than application/x-sqlite3 -->
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x47504b47" type="string" offset="68"/>
+      </match>
+    </magic>
+    <glob pattern="*.gpkg"/>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <!-- geo package 1.0.x or 1.1.x -->
+  <mime-type type="application/x-geopackage; version=1.1Or1.0">
+    <url>https://www.geopackage.org/</url>
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x47503130" type="string" offset="68"/>
+      </match>
+    </magic>
+    <glob pattern="*.gpkg"/>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-fossil-checkout">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x0f055112" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-fossil-checkout">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x0f055112" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-fossil-global-conf">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x0f055113" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-fossil-repository">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x0f055111" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-bentley-besqlite">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x42654462" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-bentley-localization">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x42654c6e" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-monotone-source-repo">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x5f4d544e" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-esri-spatially-enabled-db">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x45737269" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-mbtiles">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x4d504258" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
+  <mime-type type="application/x-texnicard">
+    <magic priority="60">
+      <match value="SQLite format 3\x00" type="string" offset="0">
+        <match value="0x6a035744" type="string" offset="68"/>
+      </match>
+    </magic>
+    <sub-class-of type="application/x-sqlite3"/>
+  </mime-type>
   <mime-type type="application/x-stata-do">
     <_comment>Stata DTA Script</_comment>
     <acronym>DO</acronym>
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index f4c9d745e..947272a0a 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -27,6 +27,7 @@ import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.sqlite.SQLiteConfig;
@@ -34,6 +35,7 @@ import org.sqlite.SQLiteConfig;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.jdbc.AbstractDBParser;
@@ -48,6 +50,12 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
 class SQLite3DBParser extends AbstractDBParser {
 
     protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
+
+    protected static final Map<Property, String> METADATA_KEYS = Map.of(
+            SQLite3Parser.SQLITE_APPLICATION_ID, "select application_id from pragma_application_id",
+            SQLite3Parser.SQLITE_USER_VERSION, "select user_version from pragma_user_version"
+    );
+
     //If the InputStream wasn't a TikaInputStream, copy to this tmp file
     Path tmpFile = null;
 
@@ -144,4 +152,28 @@ class SQLite3DBParser extends AbstractDBParser {
                                              EmbeddedDocumentUtil embeddedDocumentUtil) {
         return new SQLite3TableReader(connection, tableName, embeddedDocumentUtil);
     }
+
+    @Override
+    protected void extractMetadata(Connection connection, Metadata metadata) {
+        //TODO -- figure out how to get the version of sqlite3 that last modified this file and
+        // version-valid-for.
+        // version-valid-for is at offset 92, last modified by app version is at offset 96 --
+        // not clear how to get this info via sql
+        //'file' extracts this info; we should too :\
+        //See: https://www.sqlite.org/fileformat.html
+        for (Map.Entry<Property, String> e : METADATA_KEYS.entrySet()) {
+            try (Statement st = connection.createStatement()) {
+                try (ResultSet rs = st.executeQuery(e.getValue())) {
+                    if (rs.next()) {
+                        int val = rs.getInt(1);
+                        if (! rs.wasNull()) {
+                            metadata.set(e.getKey(), Integer.toString(val, 16));
+                        }
+                    }
+                }
+            } catch (SQLException ex) {
+                //swallow
+            }
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
index 1bd18916e..34aab4d10 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
@@ -32,6 +32,7 @@ import org.apache.tika.config.Param;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -54,6 +55,20 @@ import org.apache.tika.parser.Parser;
  */
 public class SQLite3Parser implements Parser, Initializable {
 
+    public static final String SQLITE3_PREFIX = "sqlite3:";
+
+    /**
+     * Base16 encoded integer representing the "application id"
+     */
+    public static final Property SQLITE_APPLICATION_ID =
+            Property.internalText(SQLITE3_PREFIX + "application_id");
+
+    /**
+     * Base16 encoded integer representing the "user version"
+     */
+    public static final Property SQLITE_USER_VERSION =
+            Property.internalText(SQLITE3_PREFIX + "user_version");
+
     /**
      * Serial version UID
      */
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
index 6b8e5007a..49dfcd0ba 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
@@ -17,6 +17,8 @@
 package org.apache.tika.parser.sqlite3;
 
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 import java.io.InputStream;
 
 import org.junit.jupiter.api.Test;
@@ -46,6 +48,8 @@ public class SQLite3ParserTest extends TikaTest {
         String s = handler.toString();
         TikaTest.assertContains("0\t2.3\t2.4\tlorem", s);
         TikaTest.assertContains("tempor\n", s);
+        assertEquals("0", metadata.get(SQLite3Parser.SQLITE_APPLICATION_ID));
+        assertEquals("0", metadata.get(SQLite3Parser.SQLITE_USER_VERSION));
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
index c2af83e63..bd469f134 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
@@ -78,6 +78,9 @@ public abstract class AbstractDBParser implements Parser {
             //add table names to parent metadata
             metadata.add(Database.TABLE_NAME, tableName);
         }
+
+        extractMetadata(connection, metadata);
+
         xHandler = new XHTMLContentHandler(handler, metadata);
         xHandler.startDocument();
 
@@ -114,6 +117,17 @@ public abstract class AbstractDBParser implements Parser {
         }
     }
 
+    /**
+     * This is called before parsing the tables to extract metadata from the db, if any.
+     * Override this for db-specific metadata. This implementation is a no-op.
+     *
+     * @param connection
+     * @param metadata
+     */
+    protected void extractMetadata(Connection connection, Metadata metadata) {
+
+    }
+
     /**
      * Override this for any special handling of closing the connection.
      *