You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/16 14:51:13 UTC

(tika) branch TIKA-4198 created (now 32eb5aa95)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4198
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 32eb5aa95 TIKA-4198 -- add parser for geopkg

This branch includes the following new commits:

     new 32eb5aa95 TIKA-4198 -- add parser for geopkg

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



(tika) 01/01: TIKA-4198 -- add parser for geopkg

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4198
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 32eb5aa9589f8365ba858aa6286e80bb8e54473c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 16 09:50:59 2024 -0500

    TIKA-4198 -- add parser for geopkg
---
 .../apache/tika/parser/geopkg/GeoPkgDBParser.java  | 64 +++++++++++++++
 .../apache/tika/parser/geopkg/GeoPkgParser.java    | 95 ++++++++++++++++++++++
 .../GeoPkgTableReader.java}                        | 55 ++++++-------
 .../tika/parser/sqlite3/SQLite3DBParser.java       |  2 +-
 .../tika/parser/sqlite3/SQLite3TableReader.java    |  2 +-
 .../services/org.apache.tika.parser.Parser         |  1 +
 6 files changed, 186 insertions(+), 33 deletions(-)

diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
new file mode 100644
index 000000000..5dc0f9ff2
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.geopkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.sqlite.SQLiteConfig;
+
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.jdbc.AbstractDBParser;
+import org.apache.tika.parser.jdbc.JDBCTableReader;
+import org.apache.tika.parser.sqlite3.SQLite3DBParser;
+
+/**
+ * GeoPackage variant of the SQLite db parser; its table readers are {@link GeoPkgTableReader}s.
+ * <p/>
+ * This parser is internal only; it should not be registered in the services
+ * file or configured in the TikaConfig xml file.
+ */
+class GeoPkgDBParser extends SQLite3DBParser {
+
+    @Override
+    public JDBCTableReader getTableReader(Connection connection, String tableName,
+                                          ParseContext context) {
+        return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context));
+    }
+
+    @Override
+    protected JDBCTableReader getTableReader(Connection connection, String tableName,
+                                             EmbeddedDocumentUtil embeddedDocumentUtil) {
+        return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil);
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
new file mode 100644
index 000000000..6aae7cb04
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.geopkg;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.sqlite3.SQLite3Parser;
+
+/**
+ * Customization of the sqlite parser for geopackage files; skips certain common blob columns.
+ */
+public class GeoPkgParser extends SQLite3Parser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("x-geopackage");
+
+    private static final Set<MediaType> SUPPORTED_TYPES;
+
+    static {
+        SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+    }
+
+    /**
+     * Empty constructor: this class adds no check of its own for org.sqlite.JDBC.
+     * <p/>
+     * {@link #getSupportedTypes(ParseContext)} always returns the geopackage media type.
+     */
+    public GeoPkgParser() {
+
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        GeoPkgDBParser p = new GeoPkgDBParser();
+        p.parse(stream, handler, metadata, context);
+    }
+
+    /**
+     * No-op: the supplied params are ignored.
+     *
+     * @param params params to use for initialization (unused here)
+     * @throws TikaConfigException never thrown by this implementation
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+    }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
similarity index 54%
copy from tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
copy to tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
index 8ddf079d3..e9b093565 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.sqlite3;
+package org.apache.tika.parser.geopkg;
 
 
 import java.io.IOException;
@@ -24,58 +24,51 @@ import java.sql.ResultSet;
 import java.sql.SQLException;
 import javax.sql.rowset.serial.SerialBlob;
 
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.jdbc.JDBCTableReader;
+import org.apache.tika.parser.sqlite3.SQLite3TableReader;
 
 
 /**
- * Concrete class for SQLLite table parsing.  This overrides
- * column type handling from JDBCRowHandler.
+ * Concrete class for GeoPkg parsing.  This overrides blob handling to skip "geom" and "data"
+ * columns
  * <p/>
  * For now, this silently skips cells of type CLOB, because xerial's jdbc connector
  * does not currently support them.
  */
-class SQLite3TableReader extends JDBCTableReader {
+class GeoPkgTableReader extends SQLite3TableReader {
 
-
-    public SQLite3TableReader(Connection connection, String tableName,
-                              EmbeddedDocumentUtil embeddedDocumentUtil) {
+    private static final String GEOM = "geom";
+    private static final String DATA = "data";
+    public GeoPkgTableReader(Connection connection, String tableName,
+                             EmbeddedDocumentUtil embeddedDocumentUtil) {
         super(connection, tableName, embeddedDocumentUtil);
     }
 
 
-    /**
-     * No-op for now in {@link SQLite3TableReader}.
-     *
-     * @param tableName
-     * @param fieldName
-     * @param rowNum
-     * @param resultSet
-     * @param columnIndex
-     * @param handler
-     * @param context
-     * @throws java.sql.SQLException
-     * @throws java.io.IOException
-     * @throws org.xml.sax.SAXException
-     */
+
     @Override
-    protected void handleClob(String tableName, String fieldName, int rowNum, ResultSet resultSet,
+    protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet,
                               int columnIndex, ContentHandler handler, ParseContext context)
             throws SQLException, IOException, SAXException {
-        //no-op for now.
-    }
-
-    @Override
-    protected Blob getBlob(ResultSet resultSet, int columnIndex, Metadata m) throws SQLException {
-        byte[] bytes = resultSet.getBytes(columnIndex);
-        if (!resultSet.wasNull()) {
-            return new SerialBlob(bytes);
+        if (GEOM.equals(columnName) || DATA.equals(columnName)) {
+            Attributes attrs = new AttributesImpl();
+            ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob");
+            ((AttributesImpl) attrs)
+                    .addAttribute("", "column_name", "column_name", "CDATA", columnName);
+            ((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA",
+                    Integer.toString(rowNum));
+            handler.startElement("", "span", "span", attrs);
+            handler.endElement("", "span", "span");
+            return;
         }
-        return null;
+        super.handleBlob(tableName, columnName, rowNum, resultSet, columnIndex, handler, context);
     }
 }
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index 947272a0a..fd8c2e8a4 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -47,7 +47,7 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
  * This parser is internal only; it should not be registered in the services
  * file or configured in the TikaConfig xml file.
  */
-class SQLite3DBParser extends AbstractDBParser {
+public class SQLite3DBParser extends AbstractDBParser {
 
     protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
 
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
index 8ddf079d3..e0b5f0b27 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
@@ -40,7 +40,7 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
  * For now, this silently skips cells of type CLOB, because xerial's jdbc connector
  * does not currently support them.
  */
-class SQLite3TableReader extends JDBCTableReader {
+public class SQLite3TableReader extends JDBCTableReader {
 
 
     public SQLite3TableReader(Connection connection, String tableName,
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 790f868cc..14509c812 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -13,4 +13,5 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+org.apache.tika.parser.geopkg.GeoPkgParser
 org.apache.tika.parser.sqlite3.SQLite3Parser