You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/16 14:51:14 UTC
(tika) 01/01: TIKA-4198 -- add parser for geopkg
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4198
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 32eb5aa9589f8365ba858aa6286e80bb8e54473c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 16 09:50:59 2024 -0500
TIKA-4198 -- add parser for geopkg
---
.../apache/tika/parser/geopkg/GeoPkgDBParser.java | 64 +++++++++++++++
.../apache/tika/parser/geopkg/GeoPkgParser.java | 95 ++++++++++++++++++++++
.../GeoPkgTableReader.java} | 55 ++++++-------
.../tika/parser/sqlite3/SQLite3DBParser.java | 2 +-
.../tika/parser/sqlite3/SQLite3TableReader.java | 2 +-
.../services/org.apache.tika.parser.Parser | 1 +
6 files changed, 186 insertions(+), 33 deletions(-)
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
new file mode 100644
index 000000000..5dc0f9ff2
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.geopkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.sqlite.SQLiteConfig;
+
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.jdbc.AbstractDBParser;
+import org.apache.tika.parser.jdbc.JDBCTableReader;
+import org.apache.tika.parser.sqlite3.SQLite3DBParser;
+
+/**
+ * This is the implementation of the db parser for GeoPackage (geopkg) files.
+ * It extends {@link SQLite3DBParser} and overrides both getTableReader
+ * variants so that every table is read with a GeoPkgTableReader.
+ * <p/>
+ * This parser is internal only; it should not be registered in the services
+ * file or configured in the TikaConfig xml file.
+ */
+class GeoPkgDBParser extends SQLite3DBParser {
+
+ @Override
+ public JDBCTableReader getTableReader(Connection connection, String tableName,
+ ParseContext context) {
+ return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context));
+ }
+
+ @Override
+ protected JDBCTableReader getTableReader(Connection connection, String tableName,
+ EmbeddedDocumentUtil embeddedDocumentUtil) {
+ return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil);
+ }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
new file mode 100644
index 000000000..6aae7cb04
--- /dev/null
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.geopkg;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.sqlite3.SQLite3Parser;
+
+/**
+ * Customization of the sqlite parser for GeoPackage files, registered for the
+ * media type built by <code>MediaType.application("x-geopackage")</code>.
+ * Parsing is delegated to a new GeoPkgDBParser on every call; the purpose of
+ * the customization is to skip certain common blob columns.
+ */
+public class GeoPkgParser extends SQLite3Parser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -752276948656079347L;
+
+ private static final MediaType MEDIA_TYPE = MediaType.application("x-geopackage");
+
+ private static final Set<MediaType> SUPPORTED_TYPES;
+
+ static {
+ SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+ }
+
+ /**
+ * Creates the parser. The constructor body is empty.
+ * <p/>
+ * NOTE(review): unlike what an earlier version of this comment said, nothing
+ * here checks whether org.sqlite.JDBC is available, and getSupportedTypes()
+ * in this class always returns the single GeoPackage media type -- confirm
+ * whether an availability check is still wanted.
+ */
+ public GeoPkgParser() {
+
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ GeoPkgDBParser p = new GeoPkgDBParser();
+ p.parse(stream, handler, metadata, context);
+ }
+
+ /**
+ * No-op; the params are ignored.
+ *
+ * @param params params to use for initialization (unused)
+ * @throws TikaConfigException declared by the overridden method; never thrown here
+ */
+ @Override
+ public void initialize(Map<String, Param> params) throws TikaConfigException {
+
+ }
+
+ @Override
+ public void checkInitialization(InitializableProblemHandler problemHandler)
+ throws TikaConfigException {
+ }
+}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
similarity index 54%
copy from tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
copy to tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
index 8ddf079d3..e9b093565 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.sqlite3;
+package org.apache.tika.parser.geopkg;
import java.io.IOException;
@@ -24,58 +24,51 @@ import java.sql.ResultSet;
import java.sql.SQLException;
import javax.sql.rowset.serial.SerialBlob;
+import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.jdbc.JDBCTableReader;
+import org.apache.tika.parser.sqlite3.SQLite3TableReader;
/**
- * Concrete class for SQLLite table parsing. This overrides
- * column type handling from JDBCRowHandler.
+ * Concrete class for GeoPkg table parsing. This overrides blob handling so that blobs in columns
+ * named exactly "geom" or "data" are skipped: only an empty placeholder span is written for them.
* <p/>
* For now, this silently skips cells of type CLOB, because xerial's jdbc connector
* does not currently support them.
*/
-class SQLite3TableReader extends JDBCTableReader {
+class GeoPkgTableReader extends SQLite3TableReader {
-
- public SQLite3TableReader(Connection connection, String tableName,
- EmbeddedDocumentUtil embeddedDocumentUtil) {
+ private static final String GEOM = "geom";
+ private static final String DATA = "data";
+ public GeoPkgTableReader(Connection connection, String tableName,
+ EmbeddedDocumentUtil embeddedDocumentUtil) {
super(connection, tableName, embeddedDocumentUtil);
}
- /**
- * No-op for now in {@link SQLite3TableReader}.
- *
- * @param tableName
- * @param fieldName
- * @param rowNum
- * @param resultSet
- * @param columnIndex
- * @param handler
- * @param context
- * @throws java.sql.SQLException
- * @throws java.io.IOException
- * @throws org.xml.sax.SAXException
- */
+
@Override
- protected void handleClob(String tableName, String fieldName, int rowNum, ResultSet resultSet,
+ protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet,
int columnIndex, ContentHandler handler, ParseContext context)
throws SQLException, IOException, SAXException {
- //no-op for now.
- }
-
- @Override
- protected Blob getBlob(ResultSet resultSet, int columnIndex, Metadata m) throws SQLException {
- byte[] bytes = resultSet.getBytes(columnIndex);
- if (!resultSet.wasNull()) {
- return new SerialBlob(bytes);
+ if (GEOM.equals(columnName) || DATA.equals(columnName)) {
+ Attributes attrs = new AttributesImpl();
+ ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob");
+ ((AttributesImpl) attrs)
+ .addAttribute("", "column_name", "column_name", "CDATA", columnName);
+ ((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA",
+ Integer.toString(rowNum));
+ handler.startElement("", "span", "span", attrs);
+ handler.endElement("", "span", "span");
+ return;
}
- return null;
+ super.handleBlob(tableName, columnName, rowNum, resultSet, columnIndex, handler, context);
}
}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
index 947272a0a..fd8c2e8a4 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java
@@ -47,7 +47,7 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
* This parser is internal only; it should not be registered in the services
* file or configured in the TikaConfig xml file.
*/
-class SQLite3DBParser extends AbstractDBParser {
+public class SQLite3DBParser extends AbstractDBParser {
protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
index 8ddf079d3..e0b5f0b27 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java
@@ -40,7 +40,7 @@ import org.apache.tika.parser.jdbc.JDBCTableReader;
* For now, this silently skips cells of type CLOB, because xerial's jdbc connector
* does not currently support them.
*/
-class SQLite3TableReader extends JDBCTableReader {
+public class SQLite3TableReader extends JDBCTableReader {
public SQLite3TableReader(Connection connection, String tableName,
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 790f868cc..14509c812 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -13,4 +13,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.tika.parser.geopkg.GeoPkgParser
org.apache.tika.parser.sqlite3.SQLite3Parser