You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/20 11:49:44 UTC
(tika) branch TIKA-4198 updated: TIKA-4198 -- add parser for geopkg; allow for configuration of ignoreBlobColumns
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4198
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4198 by this push:
new 7fad80367 TIKA-4198 -- add parser for geopkg; allow for configuration of ignoreBlobColumns
7fad80367 is described below
commit 7fad803673b1ae82ba4ff74aad1a9d12e356224d
Author: tallison <ta...@apache.org>
AuthorDate: Tue Feb 20 06:49:29 2024 -0500
TIKA-4198 -- add parser for geopkg; allow for configuration of ignoreBlobColumns
---
.../apache/tika/parser/geopkg/GeoPkgDBParser.java | 30 ++++++++--------------
.../apache/tika/parser/geopkg/GeoPkgParser.java | 16 +++++++++---
.../tika/parser/geopkg/GeoPkgTableReader.java | 14 +++++-----
3 files changed, 28 insertions(+), 32 deletions(-)
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
index 5dc0f9ff2..d4b56127d 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
@@ -16,29 +16,11 @@
*/
package org.apache.tika.parser.geopkg;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
import java.util.Set;
-import org.sqlite.SQLiteConfig;
-
import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.jdbc.AbstractDBParser;
import org.apache.tika.parser.jdbc.JDBCTableReader;
import org.apache.tika.parser.sqlite3.SQLite3DBParser;
@@ -50,15 +32,23 @@ import org.apache.tika.parser.sqlite3.SQLite3DBParser;
*/
class GeoPkgDBParser extends SQLite3DBParser {
+ private final Set<String> ignoreBlobColumns;
+
+ GeoPkgDBParser(Set<String> ignoreBlobColumns) {
+ this.ignoreBlobColumns = ignoreBlobColumns;
+ }
+
@Override
public JDBCTableReader getTableReader(Connection connection, String tableName,
ParseContext context) {
- return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context));
+ return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context),
+ ignoreBlobColumns);
}
@Override
protected JDBCTableReader getTableReader(Connection connection, String tableName,
EmbeddedDocumentUtil embeddedDocumentUtil) {
- return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil);
+ return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil,
+ ignoreBlobColumns);
}
}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
index 6aae7cb04..907e6de39 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
@@ -20,22 +20,22 @@ package org.apache.tika.parser.geopkg;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.Set;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.apache.tika.config.Initializable;
+import org.apache.tika.config.Field;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.parser.sqlite3.SQLite3Parser;
/**
@@ -52,10 +52,13 @@ public class GeoPkgParser extends SQLite3Parser {
private static final Set<MediaType> SUPPORTED_TYPES;
+
static {
SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
}
+ private static final Set<String> DEFAULT_IGNORE_BLOB_COLUMNS = Set.of("geom", "data");
+ private Set<String> ignoreBlobColumns = new HashSet<>(DEFAULT_IGNORE_BLOB_COLUMNS);
/**
* Checks to see if class is available for org.sqlite.JDBC.
* <p/>
@@ -73,10 +76,15 @@ public class GeoPkgParser extends SQLite3Parser {
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
- GeoPkgDBParser p = new GeoPkgDBParser();
+ GeoPkgDBParser p = new GeoPkgDBParser(ignoreBlobColumns);
p.parse(stream, handler, metadata, context);
}
+ @Field
+ public void setIgnoreBlobColumns(List<String> ignoreBlobColumns) {
+ this.ignoreBlobColumns.clear();
+ this.ignoreBlobColumns.addAll(ignoreBlobColumns);
+ }
/**
* No-op
*
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
index e9b093565..48256c2a5 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
@@ -18,11 +18,10 @@ package org.apache.tika.parser.geopkg;
import java.io.IOException;
-import java.sql.Blob;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
-import javax.sql.rowset.serial.SerialBlob;
+import java.util.Set;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
@@ -30,9 +29,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.jdbc.JDBCTableReader;
import org.apache.tika.parser.sqlite3.SQLite3TableReader;
@@ -45,11 +42,12 @@ import org.apache.tika.parser.sqlite3.SQLite3TableReader;
*/
class GeoPkgTableReader extends SQLite3TableReader {
- private static final String GEOM = "geom";
- private static final String DATA = "data";
+ private final Set<String> ignoreBlobColumns;
+
public GeoPkgTableReader(Connection connection, String tableName,
- EmbeddedDocumentUtil embeddedDocumentUtil) {
+ EmbeddedDocumentUtil embeddedDocumentUtil, Set<String> ignoreBlobColumns) {
super(connection, tableName, embeddedDocumentUtil);
+ this.ignoreBlobColumns = ignoreBlobColumns;
}
@@ -58,7 +56,7 @@ class GeoPkgTableReader extends SQLite3TableReader {
protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet,
int columnIndex, ContentHandler handler, ParseContext context)
throws SQLException, IOException, SAXException {
- if (GEOM.equals(columnName) || DATA.equals(columnName)) {
+ if (ignoreBlobColumns.contains(columnName)) {
Attributes attrs = new AttributesImpl();
((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob");
((AttributesImpl) attrs)