You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/17 15:04:40 UTC
[tika] branch main updated: TIKA-3930 -- Add multivalued field strategy option in jdbc-emitter
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7eb1a4225 TIKA-3930 -- Add multivalued field strategy option in jdbc-emitter
7eb1a4225 is described below
commit 7eb1a422597ebe7089dcbd6c55719390e8beca8c
Author: tallison <ta...@apache.org>
AuthorDate: Thu Nov 17 10:04:26 2022 -0500
TIKA-3930 -- Add multivalued field strategy option in jdbc-emitter
---
CHANGES.txt | 5 +-
tika-pipes/pom.xml | 15 +++-
.../tika/pipes/emitter/jdbc/JDBCEmitter.java | 94 +++++++++++++++++++---
.../tika/pipes/emitter/jdbc/JDBCEmitterTest.java | 38 +++++++++
.../tika-config-jdbc-emitter-multivalued.xml | 47 +++++++++++
tika-pipes/tika-fetchers/pom.xml | 3 +
6 files changed, 188 insertions(+), 14 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index c7995493b..e9b58ee10 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,9 @@
Release 2.6.1 - ???
- * Downgraded logging in PipesClient for each parse from info to debug.
+ * Add multivalued field strategy option in jdbc-emitter (TIKA-3930).
+ Default is now 'concatenate' with ', ' as the delimiter.
+
+ * Downgrade logging in PipesClient for each parse from info to debug.
Release 2.6.0 - 11/3/2022
diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml
index 1b4eb9f66..2f0f833cf 100644
--- a/tika-pipes/pom.xml
+++ b/tika-pipes/pom.xml
@@ -37,7 +37,20 @@
<module>tika-pipes-reporters</module>
<module>tika-async-cli</module>
</modules>
-
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-core</artifactId>
+ <version>${log4j2.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-slf4j2-impl</artifactId>
+ <version>${log4j2.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
<build>
<plugins>
<plugin>
diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java
index ccd66445a..ce6fc79c8 100644
--- a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java
@@ -66,6 +66,15 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
private static final Logger LOGGER = LoggerFactory.getLogger(JDBCEmitter.class);
+ /**
+  * Names the available attachment-handling strategies.
+  * NOTE(review): what each constant does is decided by the code that reads
+  * attachmentStrategy, which is not shown next to this declaration -- confirm there.
+  */
+ public enum AttachmentStrategy {
+     FIRST_ONLY,
+     ALL
+     //anything else?
+ }
+
+ /**
+  * How a metadata key that holds several values is written to a
+  * 'string' or 'varchar' column.
+  */
+ public enum MultivaluedFieldStrategy {
+     //keep only the first value
+     FIRST_ONLY,
+     //join the values with the configured delimiter
+     CONCATENATE
+     //anything else?
+ }
//some file formats do not have time zones...
//try both
private static final String[] TIKA_DATE_PATTERNS = new String[] {
@@ -87,6 +96,11 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
private PreparedStatement insertStatement;
private AttachmentStrategy attachmentStrategy = AttachmentStrategy.FIRST_ONLY;
+ private MultivaluedFieldStrategy multivaluedFieldStrategy =
+ MultivaluedFieldStrategy.CONCATENATE;
+
+ private String multivaluedFieldDelimiter = ", ";
+
//emitters are run in a single thread. If we ever start running them
//multithreaded, this will be a big problem.
private final DateFormat[] dateFormats;
@@ -124,6 +138,44 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
this.connectionString = connectionString;
}
+ /**
+  * Chooses how a metadata key with more than one value is written to a
+  * column of type 'string' or 'varchar': keep the first value only, or
+  * concatenate all values using the delimiter set via
+  * {@link JDBCEmitter#setMultivaluedFieldDelimiter(String)}.
+  *
+  * As of 2.6.1 the defaults are {@link MultivaluedFieldStrategy#CONCATENATE}
+  * and ", " as the delimiter.
+  *
+  * @param strategy 'first_only' or 'concatenate'; case and surrounding whitespace are ignored
+  * @throws TikaConfigException if strategy is null or is not one of the two names
+  */
+ @Field
+ public void setMultivaluedFieldStrategy(String strategy) throws TikaConfigException {
+     String lc = strategy == null ? "" : strategy.trim().toLowerCase(Locale.US);
+     if (lc.equals("first_only")) {
+         setMultivaluedFieldStrategy(MultivaluedFieldStrategy.FIRST_ONLY);
+     } else if (lc.equals("concatenate")) {
+         setMultivaluedFieldStrategy(MultivaluedFieldStrategy.CONCATENATE);
+     } else {
+         throw new TikaConfigException("I'm sorry, I only recognize 'first_only' and " +
+                 "'concatenate'. I don't understand '" + strategy + "'");
+     }
+ }
+
+ /** Typed counterpart of the String overload: stores the given strategy as-is (no validation). */
+ public void setMultivaluedFieldStrategy(MultivaluedFieldStrategy multivaluedFieldStrategy) {
+ this.multivaluedFieldStrategy = multivaluedFieldStrategy;
+ }
+
+ /**
+ * Delimiter for 'concatenate'; see {@link JDBCEmitter#setMultivaluedFieldStrategy(String)}
+ * @param delimiter string placed between the joined values (initial value is ", ")
+ */
+ @Field
+ public void setMultivaluedFieldDelimiter(String delimiter) {
+ this.multivaluedFieldDelimiter = delimiter;
+ }
+
/**
* The implementation of keys should be a LinkedHashMap because
* order matters!
@@ -261,10 +313,8 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
private void updateValue(PreparedStatement insertStatement, int i, String key, String type,
int metadataListIndex, List<Metadata> metadataList)
throws SQLException {
- //for now we're only taking the info from the container document.
Metadata metadata = metadataList.get(metadataListIndex);
- String val = metadata.get(key);
-
+ String val = getVal(metadata, key, type);
String lcType = type.toLowerCase(Locale.US);
if (lcType.startsWith("varchar")) {
updateVarchar(lcType, insertStatement, i, val);
@@ -301,6 +351,35 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
}
}
+ //Value to insert for key. For 'string'/'varchar' columns under CONCATENATE the
+ //non-blank values are joined with multivaluedFieldDelimiter; in every other
+ //case the result of metadata.get(key) is returned unchanged.
+ private String getVal(Metadata metadata, String key, String type) {
+     //match the type case-insensitively, as updateValue does
+     String lcType = type.toLowerCase(Locale.US);
+     boolean isText = lcType.equals("string") || lcType.startsWith("varchar");
+     if (!isText || multivaluedFieldStrategy == MultivaluedFieldStrategy.FIRST_ONLY) {
+         return metadata.get(key);
+     }
+     String[] vals = metadata.getValues(key);
+     if (vals.length == 0) {
+         return null;
+     } else if (vals.length == 1) {
+         return vals[0];
+     }
+     StringBuilder sb = new StringBuilder();
+     for (String val : vals) {
+         if (StringUtils.isBlank(val)) {
+             continue;
+         }
+         if (sb.length() > 0) {
+             sb.append(multivaluedFieldDelimiter);
+         }
+         sb.append(val);
+     }
+     return sb.toString();
+ }
+
private void updateDouble(PreparedStatement insertStatement, int i, String val) throws SQLException {
if (StringUtils.isBlank(val)) {
insertStatement.setNull(i, Types.DOUBLE);
@@ -451,13 +530,4 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
}
}
- /*
- TODO: This is currently not ever called. We need rework the PipesParser
- to ensure that emitters are closed cleanly.
- */
-
- public enum AttachmentStrategy {
- FIRST_ONLY, ALL
- //anything else?
- }
}
diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java
index 2fd5148f7..873c885fd 100644
--- a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java
+++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java
@@ -178,6 +178,44 @@ public class JDBCEmitterTest {
}
}
+ @Test
+ public void testMultiValuedFields(@TempDir Path tmpDir) throws Exception {
+     //lay out an H2 file database and the emitter config under the temp dir
+     Files.createDirectories(tmpDir.resolve("db"));
+     Path h2Path = tmpDir.resolve("db/h2");
+     Path configPath = tmpDir.resolve("tika-config.xml");
+     String jdbcUrl = "jdbc:h2:file:" + h2Path.toAbsolutePath();
+     writeConfig("/configs/tika-config-jdbc-emitter-multivalued.xml", jdbcUrl, configPath);
+
+     Emitter emitter = EmitterManager.load(configPath).getEmitter();
+
+     //one metadata object carrying four values under the same key
+     Metadata multi = new Metadata();
+     for (String v : new String[]{"first", "second", "third", "fourth"}) {
+         multi.add("k1", v);
+     }
+     List<Metadata> metadataList = new ArrayList<>();
+     metadataList.add(multi);
+     emitter.emit("id0", metadataList);
+
+     //exactly one row is expected, holding the four values joined with ", "
+     int rowCount = 0;
+     try (Connection conn = DriverManager.getConnection(jdbcUrl);
+             Statement st = conn.createStatement();
+             ResultSet rs = st.executeQuery("select * from test")) {
+         assertEquals("path", rs.getMetaData().getColumnName(1).toLowerCase(Locale.US));
+         while (rs.next()) {
+             assertEquals("id0", rs.getString(1));
+             assertEquals("first, second, third, fourth", rs.getString(2));
+             rowCount++;
+         }
+     }
+     assertEquals(1, rowCount);
+ }
+
private void writeConfig(String srcConfig, String dbDir, Path config) throws IOException {
String xml = IOUtils.resourceToString(srcConfig, StandardCharsets.UTF_8);
xml = xml.replace("CONNECTION_STRING", dbDir);
diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml
new file mode 100644
index 000000000..a46e145f0
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <emitters>
+ <emitter class="org.apache.tika.pipes.emitter.jdbc.JDBCEmitter">
+ <params>
+ <name>jdbc</name>
+ <connection>CONNECTION_STRING</connection>
+ <createTable>create table test
+ (path varchar(512) primary key,
+ k1 varchar(512));
+ </createTable>
+ <!-- the jdbc emitter always puts the emitKey value as the first
+ item -->
+ <insert>insert into test (path, k1) values (?,?);
+ </insert>
+ <!-- these are the keys in the metadata object.
+ The emitKey is added as the first element in the insert statement.
+ Then these values are added in order.
+ They must be in the order of the insert statement.
+ -->
+ <keys>
+ <key k="k1" v="varchar(512)"/>
+ </keys>
+ <multivaluedFieldStrategy>concatenate</multivaluedFieldStrategy>
+ <multivaluedFieldDelimiter>, </multivaluedFieldDelimiter>
+ </params>
+ </emitter>
+ </emitters>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml
index 958f84b79..bccb2f35a 100644
--- a/tika-pipes/tika-fetchers/pom.xml
+++ b/tika-pipes/tika-fetchers/pom.xml
@@ -38,6 +38,9 @@
<module>tika-fetcher-az-blob</module>
</modules>
+ <dependencies>
+
+ </dependencies>
<scm>
<tag>2.2.1-rc2</tag>
</scm>