You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/20 16:01:27 UTC

(tika) branch main updated: TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4c3625fb4 TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607)
4c3625fb4 is described below

commit 4c3625fb4599980885063781aeefe441379b5c2c
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Feb 20 11:01:21 2024 -0500

    TIKA-4198 -- create separate geopkg parser to skip some blob columns (#1607)
    
    * TIKA-4198 -- add parser for geopkg
---
 tika-parent/pom.xml                                |   6 +-
 .../apache/tika/parser/geopkg/GeoPkgDBParser.java  |  54 +++++++++
 .../apache/tika/parser/geopkg/GeoPkgParser.java    | 127 +++++++++++++++++++++
 .../GeoPkgTableReader.java}                        |  59 ++++------
 .../tika/parser/sqlite3/SQLite3DBParser.java       |   2 +-
 .../tika/parser/sqlite3/SQLite3TableReader.java    |   2 +-
 .../services/org.apache.tika.parser.Parser         |   1 +
 7 files changed, 212 insertions(+), 39 deletions(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index bf116f50a..47116650a 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -1101,9 +1101,9 @@
             natural language process module. Serialization is only on data that is configured in
             tika-config.xml. We don't think we'd be vulnerable to crafted user input. -->
             <coordinate>
-                <groupId>org.apache.uima</groupId>
-                <artifactId>uimaj-core</artifactId>
-                <version>3.4.1</version>
+              <groupId>org.apache.uima</groupId>
+              <artifactId>uimaj-core</artifactId>
+              <version>3.4.1</version>
             </coordinate>
           </excludeCoordinates>
           <fail>true</fail>
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
new file mode 100644
index 000000000..d4b56127d
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.geopkg;
+
+import java.sql.Connection;
+import java.util.Set;
+
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.jdbc.JDBCTableReader;
+import org.apache.tika.parser.sqlite3.SQLite3DBParser;
+
+/**
+ * This is the implementation of the db parser for GeoPackage (SQLite-based) files.
+ * <p/>
+ * This parser is internal only; it should not be registered in the services
+ * file or configured in the TikaConfig xml file.
+ */
+class GeoPkgDBParser extends SQLite3DBParser {
+
+    private final Set<String> ignoreBlobColumns;
+
+    GeoPkgDBParser(Set<String> ignoreBlobColumns) {
+        this.ignoreBlobColumns = ignoreBlobColumns;
+    }
+
+    @Override
+    public JDBCTableReader getTableReader(Connection connection, String tableName,
+                                          ParseContext context) {
+        return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context),
+                ignoreBlobColumns);
+    }
+
+    @Override
+    protected JDBCTableReader getTableReader(Connection connection, String tableName,
+                                             EmbeddedDocumentUtil embeddedDocumentUtil) {
+        return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil,
+                ignoreBlobColumns);
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
new file mode 100644
index 000000000..e157a09c9
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.geopkg;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.sqlite3.SQLite3Parser;
+
+/**
+ * Customization of sqlite parser to skip certain common blob columns.
+ * <p>
+ * The motivation is that "geom" and "data" columns are intrinsic to geopkg
+ * and are not regular embedded files. Tika treats all blob columns as, potentially,
+ * embedded files -- this can add dramatically to the time to parse geopkg
+ * files, which might have hundreds of thousands of uninteresting blobs.
+ * <p>
+ * Users may modify which columns are ignored or turn off "ignoring"
+ * of all columns.
+ * <p>
+ * To add a column to the default "ignore blob columns" via tika-config.xml:
+ *  <pre>{@code
+ *   <parsers>
+ *     <parser class="org.apache.tika.parser.DefaultParser"/>
+ *     <parser class="org.apache.tika.parser.geopkg.GeoPkgParser">
+ *       <param name="ignoreBlobColumns" type="list">
+ *         <string>geom</string>
+ *         <string>data</string>
+ *         <string>something</string>
+ *       </param>
+ *     </parser>
+ *   </parsers>
+ *   }</pre>
+ * <p>
+ *   Or use an empty list to parse all columns.
+ */
+public class GeoPkgParser extends SQLite3Parser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-geopackage");
+
+    private static final Set<MediaType> SUPPORTED_TYPES;
+
+
+    static {
+        SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+    }
+
+    private static final Set<String> DEFAULT_IGNORE_BLOB_COLUMNS = Set.of("geom", "data");
+    private Set<String> ignoreBlobColumns = new HashSet<>(DEFAULT_IGNORE_BLOB_COLUMNS);
+    /**
+     * Checks to see if class is available for org.sqlite.JDBC.
+     * <p/>
+     * If not, this class will return an EMPTY_SET for  getSupportedTypes()
+     */
+    public GeoPkgParser() {
+
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        GeoPkgDBParser p = new GeoPkgDBParser(ignoreBlobColumns);
+        p.parse(stream, handler, metadata, context);
+    }
+
+    @Field
+    public void setIgnoreBlobColumns(List<String> ignoreBlobColumns) {
+        this.ignoreBlobColumns.clear();
+        this.ignoreBlobColumns.addAll(ignoreBlobColumns);
+    }
+    /**
+     * No-op
+     *
+     * @param params params to use for initialization
+     * @throws TikaConfigException
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
similarity index 51%
copy from tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
copy to tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
index 8ddf079d3..48256c2a5 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
@@ -14,68 +14,59 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.sqlite3;
+package org.apache.tika.parser.geopkg;
 
 
 import java.io.IOException;
-import java.sql.Blob;
 import java.sql.Connection;
 import java.sql.ResultSet;
 import java.sql.SQLException;
-import javax.sql.rowset.serial.SerialBlob;
+import java.util.Set;
 
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.jdbc.JDBCTableReader;
+import org.apache.tika.parser.sqlite3.SQLite3TableReader;
 
 
 /**
- * Concrete class for SQLLite table parsing.  This overrides
- * column type handling from JDBCRowHandler.
+ * Concrete class for GeoPkg parsing.  This overrides blob handling to skip the configured
+ * ignored blob columns ("geom" and "data" by default); only an empty span is emitted for them.
  * <p/>
  * For now, this silently skips cells of type CLOB, because xerial's jdbc connector
  * does not currently support them.
  */
-class SQLite3TableReader extends JDBCTableReader {
+class GeoPkgTableReader extends SQLite3TableReader {
 
+    private final Set<String> ignoreBlobColumns;
 
-    public SQLite3TableReader(Connection connection, String tableName,
-                              EmbeddedDocumentUtil embeddedDocumentUtil) {
+    public GeoPkgTableReader(Connection connection, String tableName,
+                             EmbeddedDocumentUtil embeddedDocumentUtil, Set<String> ignoreBlobColumns) {
         super(connection, tableName, embeddedDocumentUtil);
+        this.ignoreBlobColumns = ignoreBlobColumns;
     }
 
 
-    /**
-     * No-op for now in {@link SQLite3TableReader}.
-     *
-     * @param tableName
-     * @param fieldName
-     * @param rowNum
-     * @param resultSet
-     * @param columnIndex
-     * @param handler
-     * @param context
-     * @throws java.sql.SQLException
-     * @throws java.io.IOException
-     * @throws org.xml.sax.SAXException
-     */
+
     @Override
-    protected void handleClob(String tableName, String fieldName, int rowNum, ResultSet resultSet,
+    protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet,
                               int columnIndex, ContentHandler handler, ParseContext context)
             throws SQLException, IOException, SAXException {
-        //no-op for now.
-    }
-
-    @Override
-    protected Blob getBlob(ResultSet resultSet, int columnIndex, Metadata m) throws SQLException {
-        byte[] bytes = resultSet.getBytes(columnIndex);
-        if (!resultSet.wasNull()) {
-            return new SerialBlob(bytes);
+        if (ignoreBlobColumns.contains(columnName)) {
+            Attributes attrs = new AttributesImpl();
+            ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob");
+            ((AttributesImpl) attrs)
+                    .addAttribute("", "column_name", "column_name", "CDATA", columnName);
+            ((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA",
+                    Integer.toString(rowNum));
+            handler.startElement("", "span", "span", attrs);
+            handler.endElement("", "span", "span");
+            return;
         }
-        return null;
+        super.handleBlob(tableName, columnName, rowNum, resultSet, columnIndex, handler, context);
     }
 }
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index 947272a0a..fd8c2e8a4 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -47,7 +47,7 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
  * This parser is internal only; it should not be registered in the services
  * file or configured in the TikaConfig xml file.
  */
-class SQLite3DBParser extends AbstractDBParser {
+public class SQLite3DBParser extends AbstractDBParser {
 
     protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
 
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
index 8ddf079d3..e0b5f0b27 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
@@ -40,7 +40,7 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
  * For now, this silently skips cells of type CLOB, because xerial's jdbc connector
  * does not currently support them.
  */
-class SQLite3TableReader extends JDBCTableReader {
+public class SQLite3TableReader extends JDBCTableReader {
 
 
     public SQLite3TableReader(Connection connection, String tableName,
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 790f868cc..14509c812 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -13,4 +13,5 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+org.apache.tika.parser.geopkg.GeoPkgParser
 org.apache.tika.parser.sqlite3.SQLite3Parser