You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/20 16:01:27 UTC
(tika) branch main updated: TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4c3625fb4 TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607)
4c3625fb4 is described below
commit 4c3625fb4599980885063781aeefe441379b5c2c
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Feb 20 11:01:21 2024 -0500
TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607)
* TIKA-4198 -- add parser for geopkg
---
tika-parent/pom.xml | 6 +-
.../apache/tika/parser/geopkg/GeoPkgDBParser.java | 54 +++++++++
.../apache/tika/parser/geopkg/GeoPkgParser.java | 127 +++++++++++++++++++++
.../GeoPkgTableReader.java} | 59 ++++------
.../tika/parser/sqlite3/SQLite3DBParser.java | 2 +-
.../tika/parser/sqlite3/SQLite3TableReader.java | 2 +-
.../services/org.apache.tika.parser.Parser | 1 +
7 files changed, 212 insertions(+), 39 deletions(-)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index bf116f50a..47116650a 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -1101,9 +1101,9 @@
natural language process module. Serialization is only on data that is configured in
tika-config.xml. We don't think we'd be vulnerable to crafted user input. -->
<coordinate>
- <groupId>org.apache.uima</groupId>
- <artifactId>uimaj-core</artifactId>
- <version>3.4.1</version>
+ <groupId>org.apache.uima</groupId>
+ <artifactId>uimaj-core</artifactId>
+ <version>3.4.1</version>
</coordinate>
</excludeCoordinates>
<fail>true</fail>
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
new file mode 100644
index 000000000..d4b56127d
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.geopkg;
+
+import java.sql.Connection;
+import java.util.Set;
+
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.jdbc.JDBCTableReader;
+import org.apache.tika.parser.sqlite3.SQLite3DBParser;
+
+/**
+ * This is the implementation of the db parser for GeoPackage (SQLite-based) files.
+ * <p/>
+ * This parser is internal only; it should not be registered in the services
+ * file or configured in the TikaConfig xml file.
+ */
+class GeoPkgDBParser extends SQLite3DBParser {
+
+ private final Set<String> ignoreBlobColumns;
+
+ GeoPkgDBParser(Set<String> ignoreBlobColumns) {
+ this.ignoreBlobColumns = ignoreBlobColumns;
+ }
+
+ @Override
+ public JDBCTableReader getTableReader(Connection connection, String tableName,
+ ParseContext context) {
+ return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context),
+ ignoreBlobColumns);
+ }
+
+ @Override
+ protected JDBCTableReader getTableReader(Connection connection, String tableName,
+ EmbeddedDocumentUtil embeddedDocumentUtil) {
+ return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil,
+ ignoreBlobColumns);
+ }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
new file mode 100644
index 000000000..e157a09c9
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.geopkg;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.sqlite3.SQLite3Parser;
+
+/**
+ * Customization of sqlite parser to skip certain common blob columns.
+ * <p>
+ * The motivation is that "geom" and "data" columns are intrinsic to geopkg
+ * and are not regular embedded files. Tika treats all blob columns as, potentially,
+ * embedded files -- this can add dramatically to the time to parse geopkg
+ * files, which might have hundreds of thousands of uninteresting blobs.
+ * <p>
+ * Users may modify which columns are ignored or turn off "ignoring"
+ * of all columns.
+ * <p>
+ * To add a column to the default "ignore blob columns" via tika-config.xml:
+ * <pre>{@code
+ * <parsers>
+ * <parser class="org.apache.tika.parser.DefaultParser"/>
+ * <parser class="org.apache.tika.parser.geopkg.GeoPkgParser">
+ * <param name="ignoreBlobColumns" type="list">
+ * <string>geom</string>
+ * <string>data</string>
+ * <string>something</string>
+ * </param>
+ * </parser>
+ * </parsers>
+ * }</pre>
+ * <p>
+ * Or use an empty list to parse all columns.
+ */
+public class GeoPkgParser extends SQLite3Parser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -752276948656079347L;
+
+ private static final MediaType MEDIA_TYPE = MediaType.application("x-geopackage");
+
+ private static final Set<MediaType> SUPPORTED_TYPES;
+
+
+ static {
+ SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+ }
+
+ private static final Set<String> DEFAULT_IGNORE_BLOB_COLUMNS = Set.of("geom", "data");
+ private Set<String> ignoreBlobColumns = new HashSet<>(DEFAULT_IGNORE_BLOB_COLUMNS);
+ /**
+ * Creates a parser that starts with the default ignored blob columns ("geom" and "data").
+ * <p/>
+ * This constructor body performs no check; getSupportedTypes() always returns the geopackage type.
+ */
+ public GeoPkgParser() {
+
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ GeoPkgDBParser p = new GeoPkgDBParser(ignoreBlobColumns);
+ p.parse(stream, handler, metadata, context);
+ }
+
+ @Field
+ public void setIgnoreBlobColumns(List<String> ignoreBlobColumns) {
+ this.ignoreBlobColumns.clear();
+ this.ignoreBlobColumns.addAll(ignoreBlobColumns);
+ }
+ /**
+ * No-op
+ *
+ * @param params params to use for initialization
+ * @throws TikaConfigException
+ */
+ @Override
+ public void initialize(Map<String, Param> params) throws TikaConfigException {
+
+ }
+
+ @Override
+ public void checkInitialization(InitializableProblemHandler problemHandler)
+ throws TikaConfigException {
+ }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
similarity index 51%
copy from tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
copy to tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
index 8ddf079d3..48256c2a5 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
@@ -14,68 +14,59 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.sqlite3;
+package org.apache.tika.parser.geopkg;
import java.io.IOException;
-import java.sql.Blob;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
-import javax.sql.rowset.serial.SerialBlob;
+import java.util.Set;
+import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.jdbc.JDBCTableReader;
+import org.apache.tika.parser.sqlite3.SQLite3TableReader;
/**
- * Concrete class for SQLLite table parsing. This overrides
- * column type handling from JDBCRowHandler.
+ * Concrete class for GeoPkg parsing. This overrides blob handling so that the configured
+ * ignored columns ("geom" and "data" by default) yield only an empty placeholder span
* <p/>
* For now, this silently skips cells of type CLOB, because xerial's jdbc connector
* does not currently support them.
*/
-class SQLite3TableReader extends JDBCTableReader {
+class GeoPkgTableReader extends SQLite3TableReader {
+ private final Set<String> ignoreBlobColumns;
- public SQLite3TableReader(Connection connection, String tableName,
- EmbeddedDocumentUtil embeddedDocumentUtil) {
+ public GeoPkgTableReader(Connection connection, String tableName,
+ EmbeddedDocumentUtil embeddedDocumentUtil, Set<String> ignoreBlobColumns) {
super(connection, tableName, embeddedDocumentUtil);
+ this.ignoreBlobColumns = ignoreBlobColumns;
}
- /**
- * No-op for now in {@link SQLite3TableReader}.
- *
- * @param tableName
- * @param fieldName
- * @param rowNum
- * @param resultSet
- * @param columnIndex
- * @param handler
- * @param context
- * @throws java.sql.SQLException
- * @throws java.io.IOException
- * @throws org.xml.sax.SAXException
- */
+
@Override
- protected void handleClob(String tableName, String fieldName, int rowNum, ResultSet resultSet,
+ protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet,
int columnIndex, ContentHandler handler, ParseContext context)
throws SQLException, IOException, SAXException {
- //no-op for now.
- }
-
- @Override
- protected Blob getBlob(ResultSet resultSet, int columnIndex, Metadata m) throws SQLException {
- byte[] bytes = resultSet.getBytes(columnIndex);
- if (!resultSet.wasNull()) {
- return new SerialBlob(bytes);
+ if (ignoreBlobColumns.contains(columnName)) {
+ Attributes attrs = new AttributesImpl();
+ ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob");
+ ((AttributesImpl) attrs)
+ .addAttribute("", "column_name", "column_name", "CDATA", columnName);
+ ((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA",
+ Integer.toString(rowNum));
+ handler.startElement("", "span", "span", attrs);
+ handler.endElement("", "span", "span");
+ return;
}
- return null;
+ super.handleBlob(tableName, columnName, rowNum, resultSet, columnIndex, handler, context);
}
}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index 947272a0a..fd8c2e8a4 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -47,7 +47,7 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
* This parser is internal only; it should not be registered in the services
* file or configured in the TikaConfig xml file.
*/
-class SQLite3DBParser extends AbstractDBParser {
+public class SQLite3DBParser extends AbstractDBParser {
protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
index 8ddf079d3..e0b5f0b27 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
@@ -40,7 +40,7 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
* For now, this silently skips cells of type CLOB, because xerial's jdbc connector
* does not currently support them.
*/
-class SQLite3TableReader extends JDBCTableReader {
+public class SQLite3TableReader extends JDBCTableReader {
public SQLite3TableReader(Connection connection, String tableName,
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 790f868cc..14509c812 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -13,4 +13,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.tika.parser.geopkg.GeoPkgParser
org.apache.tika.parser.sqlite3.SQLite3Parser