You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/30 17:23:21 UTC
(tika) 01/01: TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4187
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 73694d21ab19e7e1134ee4f2bf8b76e8c35387bf
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jan 30 12:22:59 2024 -0500
TIKA-4187 -- improve detection of sqlite3 based files and add metadata extraction
---
CHANGES.txt | 2 +
.../org/apache/tika/mime/tika-mimetypes.xml | 105 ++++++++++++++++++++-
.../tika/parser/sqlite3/SQLite3DBParser.java | 32 +++++++
.../apache/tika/parser/sqlite3/SQLite3Parser.java | 15 +++
.../tika/parser/sqlite3/SQLite3ParserTest.java | 4 +
.../apache/tika/parser/jdbc/AbstractDBParser.java | 14 +++
6 files changed, 171 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index f9ac540e6..163753e9b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -21,6 +21,8 @@ Release 3.0.0-BETA - 12/01/2023
Other Changes/Updates
+ * Improve detection of sqlite3-based file formats (TIKA-4187).
+
* Upgrade PDFBox to 3.0.1 (TIKA-3347)
* Deprecated AbstractParser for removal in 4.x (TIKA-4132).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 54f7cc6f6..2930fa720 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4858,11 +4858,114 @@
</mime-type>
<mime-type type="application/x-sqlite3">
+ <!-- sqlite subtypes are distinguished by the 4-byte application id at header offset 68; the values are from: https://www.sqlite.org/src/artifact?ci=trunk&filename=magic.txt -->
<magic priority="50">
<match value="SQLite format 3\x00" type="string" offset="0"/>
</magic>
</mime-type>
-
+ <!-- geo package >= 1.2.x -->
+ <mime-type type="application/x-geopackage">
+ <url>https://www.geopackage.org/</url>
+ <!-- should be higher than application/x-sqlite3 -->
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x47504b47" type="string" offset="68"/>
+ </match>
+ </magic>
+ <glob pattern="*.gpkg"/>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <!-- geo package 1.0.x or 1.1.x -->
+ <mime-type type="application/x-geopackage; version=1.1Or1.0">
+ <url>https://www.geopackage.org/</url>
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x47503130" type="string" offset="68"/>
+ </match>
+ </magic>
+ <glob pattern="*.gpkg"/>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <!-- Fossil checkout database (application id 0x0f055112); declared once, the
+      original patch repeated this identical definition twice -->
+ <mime-type type="application/x-fossil-checkout">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x0f055112" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <mime-type type="application/x-fossil-global-conf">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x0f055113" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <mime-type type="application/x-fossil-repository">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x0f055111" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <mime-type type="application/x-bentley-besqlite">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x42654462" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <mime-type type="application/x-bentley-localization">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x42654c6e" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <mime-type type="application/x-monotone-source-repo">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x5f4d544e" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <mime-type type="application/x-esri-spatially-enabled-db">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x45737269" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <mime-type type="application/x-mbtiles">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x4d504258" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
+ <mime-type type="application/x-texnicard">
+ <magic priority="60">
+ <match value="SQLite format 3\x00" type="string" offset="0">
+ <match value="0x6a035744" type="string" offset="68"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-sqlite3"/>
+ </mime-type>
<mime-type type="application/x-stata-do">
<_comment>Stata DTA Script</_comment>
<acronym>DO</acronym>
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index f4c9d745e..947272a0a 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -27,6 +27,7 @@ import java.sql.SQLException;
import java.sql.Statement;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import org.sqlite.SQLiteConfig;
@@ -34,6 +35,7 @@ import org.sqlite.SQLiteConfig;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.jdbc.AbstractDBParser;
@@ -48,6 +50,12 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
class SQLite3DBParser extends AbstractDBParser {
protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
+
+ /**
+ * Maps each sqlite3 metadata property to the single-value pragma query that supplies it.
+ * {@code extractMetadata} runs every query and stores the first column of the first row
+ * under the corresponding property.
+ */
+ protected static final Map<Property, String> METADATA_KEYS = Map.of(
+ SQLite3Parser.SQLITE_APPLICATION_ID, "select application_id from pragma_application_id",
+ SQLite3Parser.SQLITE_USER_VERSION, "select user_version from pragma_user_version"
+ );
+
//If the InputStream wasn't a TikaInputStream, copy to this tmp file
Path tmpFile = null;
@@ -144,4 +152,28 @@ class SQLite3DBParser extends AbstractDBParser {
EmbeddedDocumentUtil embeddedDocumentUtil) {
return new SQLite3TableReader(connection, tableName, embeddedDocumentUtil);
}
+
+ /**
+ * Runs every pragma query in {@link #METADATA_KEYS} and records the result, base16
+ * encoded, under the matching property. Extraction is best effort: a query that fails
+ * with a {@link SQLException} is skipped and the remaining queries still run.
+ *
+ * @param connection connection to the sqlite3 db being parsed
+ * @param metadata metadata to add the extracted values to
+ */
+ @Override
+ protected void extractMetadata(Connection connection, Metadata metadata) {
+     //TODO -- figure out how to get the version of sqlite3 that last modified this file and
+     // version-valid-for.
+     // version-valid-for is at offset 92, last modified by app version is at offset 96 --
+     // not clear how to get this info via sql
+     //'file' extracts this info; we should too :\
+     //See: https://www.sqlite.org/fileformat.html
+     for (Map.Entry<Property, String> pragma : METADATA_KEYS.entrySet()) {
+         try (Statement statement = connection.createStatement();
+                 ResultSet results = statement.executeQuery(pragma.getValue())) {
+             if (!results.next()) {
+                 //the pragma returned no row; nothing to record
+                 continue;
+             }
+             int value = results.getInt(1);
+             if (results.wasNull()) {
+                 //SQL NULL -- leave the property unset
+                 continue;
+             }
+             //NOTE(review): Integer.toString renders negative values with a leading '-'
+             metadata.set(pragma.getKey(), Integer.toString(value, 16));
+         } catch (SQLException ignored) {
+             //best effort -- skip any value that cannot be read
+         }
+     }
+ }
}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
index 1bd18916e..34aab4d10 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
@@ -32,6 +32,7 @@ import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -54,6 +55,20 @@ import org.apache.tika.parser.Parser;
*/
public class SQLite3Parser implements Parser, Initializable {
+ /**
+ * Prefix shared by the sqlite3-specific metadata keys declared below.
+ */
+ public static final String SQLITE3_PREFIX = "sqlite3:";
+
+ /**
+ * Base16 (hexadecimal) encoded integer representing the "application id",
+ * i.e. the value returned by the {@code application_id} pragma.
+ */
+ public static final Property SQLITE_APPLICATION_ID =
+ Property.internalText(SQLITE3_PREFIX + "application_id");
+
+ /**
+ * Base16 (hexadecimal) encoded integer representing the "user version",
+ * i.e. the value returned by the {@code user_version} pragma.
+ */
+ public static final Property SQLITE_USER_VERSION =
+ Property.internalText(SQLITE3_PREFIX + "user_version");
+
/**
* Serial version UID
*/
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
index 6b8e5007a..49dfcd0ba 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
@@ -17,6 +17,8 @@
package org.apache.tika.parser.sqlite3;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
import java.io.InputStream;
import org.junit.jupiter.api.Test;
@@ -46,6 +48,8 @@ public class SQLite3ParserTest extends TikaTest {
String s = handler.toString();
TikaTest.assertContains("0\t2.3\t2.4\tlorem", s);
TikaTest.assertContains("tempor\n", s);
+ assertEquals("0", metadata.get(SQLite3Parser.SQLITE_APPLICATION_ID));
+ assertEquals("0", metadata.get(SQLite3Parser.SQLITE_USER_VERSION));
}
@Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
index c2af83e63..bd469f134 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
@@ -78,6 +78,9 @@ public abstract class AbstractDBParser implements Parser {
//add table names to parent metadata
metadata.add(Database.TABLE_NAME, tableName);
}
+
+ extractMetadata(connection, metadata);
+
xHandler = new XHTMLContentHandler(handler, metadata);
xHandler.startDocument();
@@ -114,6 +117,17 @@ public abstract class AbstractDBParser implements Parser {
}
}
+ /**
+ * This is called before parsing the tables to extract metadata from the db, if any.
+ * It runs after the table names have been added to the metadata and before the
+ * XHTML document is started.
+ * Override this for db specific metadata. This implementation is a no-op.
+ *
+ * @param connection the connection to the db that is being parsed
+ * @param metadata the metadata object to which db-level values should be added
+ */
+ protected void extractMetadata(Connection connection, Metadata metadata) {
+
+ }
+
/**
* Override this for any special handling of closing the connection.
*