You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/20 11:49:44 UTC

(tika) branch TIKA-4198 updated: TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4198
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-4198 by this push:
     new 7fad80367 TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns
7fad80367 is described below

commit 7fad803673b1ae82ba4ff74aad1a9d12e356224d
Author: tallison <ta...@apache.org>
AuthorDate: Tue Feb 20 06:49:29 2024 -0500

    TIKA-4198 -- add parser for geopkg allow for configuration of ignoreblobcolumns
---
 .../apache/tika/parser/geopkg/GeoPkgDBParser.java  | 30 ++++++++--------------
 .../apache/tika/parser/geopkg/GeoPkgParser.java    | 16 +++++++++---
 .../tika/parser/geopkg/GeoPkgTableReader.java      | 14 +++++-----
 3 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
index 5dc0f9ff2..d4b56127d 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
@@ -16,29 +16,11 @@
  */
 package org.apache.tika.parser.geopkg;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
 import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
 import java.util.Set;
 
-import org.sqlite.SQLiteConfig;
-
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.jdbc.AbstractDBParser;
 import org.apache.tika.parser.jdbc.JDBCTableReader;
 import org.apache.tika.parser.sqlite3.SQLite3DBParser;
 
@@ -50,15 +32,23 @@ import org.apache.tika.parser.sqlite3.SQLite3DBParser;
  */
 class GeoPkgDBParser extends SQLite3DBParser {
 
+    private final Set<String> ignoreBlobColumns;
+
+    GeoPkgDBParser(Set<String> ignoreBlobColumns) {
+        this.ignoreBlobColumns = ignoreBlobColumns;
+    }
+
     @Override
     public JDBCTableReader getTableReader(Connection connection, String tableName,
                                           ParseContext context) {
-        return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context));
+        return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context),
+                ignoreBlobColumns);
     }
 
     @Override
     protected JDBCTableReader getTableReader(Connection connection, String tableName,
                                              EmbeddedDocumentUtil embeddedDocumentUtil) {
-        return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil);
+        return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil,
+                ignoreBlobColumns);
     }
 }
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
index 6aae7cb04..907e6de39 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
@@ -20,22 +20,22 @@ package org.apache.tika.parser.geopkg;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import org.apache.tika.config.Initializable;
+import org.apache.tika.config.Field;
 import org.apache.tika.config.InitializableProblemHandler;
 import org.apache.tika.config.Param;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.sqlite3.SQLite3Parser;
 
 /**
@@ -52,10 +52,13 @@ public class GeoPkgParser extends SQLite3Parser {
 
     private static final Set<MediaType> SUPPORTED_TYPES;
 
+
     static {
         SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
     }
 
+    private static final Set<String> DEFAULT_IGNORE_BLOB_COLUMNS = Set.of("geom", "data");
+    private Set<String> ignoreBlobColumns = new HashSet<>(DEFAULT_IGNORE_BLOB_COLUMNS);
     /**
      * Checks to see if class is available for org.sqlite.JDBC.
      * <p/>
@@ -73,10 +76,15 @@ public class GeoPkgParser extends SQLite3Parser {
     @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                       ParseContext context) throws IOException, SAXException, TikaException {
-        GeoPkgDBParser p = new GeoPkgDBParser();
+        GeoPkgDBParser p = new GeoPkgDBParser(ignoreBlobColumns);
         p.parse(stream, handler, metadata, context);
     }
 
+    @Field
+    public void setIgnoreBlobColumns(List<String> ignoreBlobColumns) {
+        this.ignoreBlobColumns.clear();
+        this.ignoreBlobColumns.addAll(ignoreBlobColumns);
+    }
     /**
      * No-op
      *
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
index e9b093565..48256c2a5 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
@@ -18,11 +18,10 @@ package org.apache.tika.parser.geopkg;
 
 
 import java.io.IOException;
-import java.sql.Blob;
 import java.sql.Connection;
 import java.sql.ResultSet;
 import java.sql.SQLException;
-import javax.sql.rowset.serial.SerialBlob;
+import java.util.Set;
 
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
@@ -30,9 +29,7 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.jdbc.JDBCTableReader;
 import org.apache.tika.parser.sqlite3.SQLite3TableReader;
 
 
@@ -45,11 +42,12 @@ import org.apache.tika.parser.sqlite3.SQLite3TableReader;
  */
 class GeoPkgTableReader extends SQLite3TableReader {
 
-    private static final String GEOM = "geom";
-    private static final String DATA = "data";
+    private final Set<String> ignoreBlobColumns;
+
     public GeoPkgTableReader(Connection connection, String tableName,
-                             EmbeddedDocumentUtil embeddedDocumentUtil) {
+                             EmbeddedDocumentUtil embeddedDocumentUtil, Set<String> ignoreBlobColumns) {
         super(connection, tableName, embeddedDocumentUtil);
+        this.ignoreBlobColumns = ignoreBlobColumns;
     }
 
 
@@ -58,7 +56,7 @@ class GeoPkgTableReader extends SQLite3TableReader {
     protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet,
                               int columnIndex, ContentHandler handler, ParseContext context)
             throws SQLException, IOException, SAXException {
-        if (GEOM.equals(columnName) || DATA.equals(columnName)) {
+        if (ignoreBlobColumns.contains(columnName)) {
             Attributes attrs = new AttributesImpl();
             ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob");
             ((AttributesImpl) attrs)