You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/17 15:04:40 UTC

[tika] branch main updated: TIKA-3930 -- Add multivalued field strategy option in jdbc-emitter

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 7eb1a4225 TIKA-3930 -- Add multivalued field strategy option in jdbc-emitter
7eb1a4225 is described below

commit 7eb1a422597ebe7089dcbd6c55719390e8beca8c
Author: tallison <ta...@apache.org>
AuthorDate: Thu Nov 17 10:04:26 2022 -0500

    TIKA-3930 -- Add multivalued field strategy option in jdbc-emitter
---
 CHANGES.txt                                        |  5 +-
 tika-pipes/pom.xml                                 | 15 +++-
 .../tika/pipes/emitter/jdbc/JDBCEmitter.java       | 94 +++++++++++++++++++---
 .../tika/pipes/emitter/jdbc/JDBCEmitterTest.java   | 38 +++++++++
 .../tika-config-jdbc-emitter-multivalued.xml       | 47 +++++++++++
 tika-pipes/tika-fetchers/pom.xml                   |  3 +
 6 files changed, 188 insertions(+), 14 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index c7995493b..e9b58ee10 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,9 @@
 Release 2.6.1 - ???
 
-   * Downgraded logging in PipesClient for each parse from info to debug.
+   * Add multivalued field strategy option in jdbc-emitter (TIKA-3930).
+     Default is now 'concatenate' with ', ' as the delimiter.
+
+   * Downgrade logging in PipesClient for each parse from info to debug.
 
 Release 2.6.0 - 11/3/2022
 
diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml
index 1b4eb9f66..2f0f833cf 100644
--- a/tika-pipes/pom.xml
+++ b/tika-pipes/pom.xml
@@ -37,7 +37,20 @@
     <module>tika-pipes-reporters</module>
     <module>tika-async-cli</module>
   </modules>
-
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-core</artifactId>
+      <version>${log4j2.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-slf4j2-impl</artifactId>
+      <version>${log4j2.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
   <build>
     <plugins>
       <plugin>
diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java
index ccd66445a..ce6fc79c8 100644
--- a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/main/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitter.java
@@ -66,6 +66,15 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
 
     private static final Logger LOGGER = LoggerFactory.getLogger(JDBCEmitter.class);
 
+    public enum AttachmentStrategy {
+        FIRST_ONLY, ALL
+        //anything else?
+    }
+
+    public enum MultivaluedFieldStrategy {
+        FIRST_ONLY, CONCATENATE
+        //anything else?
+    }
     //some file formats do not have time zones...
     //try both
     private static final String[] TIKA_DATE_PATTERNS = new String[] {
@@ -87,6 +96,11 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
     private PreparedStatement insertStatement;
     private AttachmentStrategy attachmentStrategy = AttachmentStrategy.FIRST_ONLY;
 
+    private MultivaluedFieldStrategy multivaluedFieldStrategy =
+            MultivaluedFieldStrategy.CONCATENATE;
+
+    private String multivaluedFieldDelimiter = ", ";
+
     //emitters are run in a single thread.  If we ever start running them
     //multithreaded, this will be a big problem.
     private final DateFormat[] dateFormats;
@@ -124,6 +138,44 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
         this.connectionString = connectionString;
     }
 
+    /**
+     * This applies to fields of type 'string' or 'varchar'.  If there's
+     * a multivalued field in a metadata object, do you want the first value only
+     * or should we concatenate the values, using the delimiter set via
+     * {@link JDBCEmitter#setMultivaluedFieldDelimiter(String)}?
+     *
+     * As of 2.6.1, the default strategy is {@link MultivaluedFieldStrategy#CONCATENATE}
+     * and the default delimiter is &quot;, &quot;.
+     *
+     * @param strategy
+     * @throws TikaConfigException
+     */
+    @Field
+    public void setMultivaluedFieldStrategy(String strategy) throws TikaConfigException {
+        String lc = strategy.toLowerCase(Locale.US);
+        if (lc.equals("first_only")) {
+            setMultivaluedFieldStrategy(MultivaluedFieldStrategy.FIRST_ONLY);
+        } else if (lc.equals("concatenate")) {
+            setMultivaluedFieldStrategy(MultivaluedFieldStrategy.CONCATENATE);
+        } else {
+            throw new TikaConfigException("I'm sorry, I only recogize 'first_only' and " +
+                    "'concatenate'. I don't mind '" + strategy + "'");
+        }
+    }
+
+    public void setMultivaluedFieldStrategy(MultivaluedFieldStrategy multivaluedFieldStrategy) {
+        this.multivaluedFieldStrategy = multivaluedFieldStrategy;
+    }
+
+    /**
+     * See {@link JDBCEmitter#setMultivaluedFieldStrategy(String)}
+     * @param delimiter the string placed between concatenated values
+     */
+    @Field
+    public void setMultivaluedFieldDelimiter(String delimiter) {
+        this.multivaluedFieldDelimiter = delimiter;
+    }
+
     /**
      * The implementation of keys should be a LinkedHashMap because
      * order matters!
@@ -261,10 +313,8 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
     private void updateValue(PreparedStatement insertStatement, int i, String key, String type,
                              int metadataListIndex, List<Metadata> metadataList)
             throws SQLException {
-        //for now we're only taking the info from the container document.
         Metadata metadata = metadataList.get(metadataListIndex);
-        String val = metadata.get(key);
-
+        String val = getVal(metadata, key, type);
         String lcType = type.toLowerCase(Locale.US);
         if (lcType.startsWith("varchar")) {
             updateVarchar(lcType, insertStatement, i, val);
@@ -301,6 +351,35 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
         }
     }
 
+    private String getVal(Metadata metadata, String key, String type) {
+        if (! type.equals("string") && ! type.startsWith("varchar")) {
+            return metadata.get(key);
+        }
+        if (multivaluedFieldStrategy == MultivaluedFieldStrategy.FIRST_ONLY) {
+            return metadata.get(key);
+        }
+        String[] vals = metadata.getValues(key);
+        if (vals.length == 0) {
+            return null;
+        } else if (vals.length == 1) {
+            return vals[0];
+        }
+
+        int i = 0;
+        StringBuilder sb = new StringBuilder();
+        for (String val : metadata.getValues(key)) {
+            if (StringUtils.isBlank(val)) {
+                continue;
+            }
+            if (i > 0) {
+                sb.append(multivaluedFieldDelimiter);
+            }
+            sb.append(val);
+            i++;
+        }
+        return sb.toString();
+    }
+
     private void updateDouble(PreparedStatement insertStatement, int i, String val) throws SQLException {
         if (StringUtils.isBlank(val)) {
             insertStatement.setNull(i, Types.DOUBLE);
@@ -451,13 +530,4 @@ public class JDBCEmitter extends AbstractEmitter implements Initializable, Close
         }
     }
 
-    /*
-        TODO: This is currently not ever called.  We need rework the PipesParser
-        to ensure that emitters are closed cleanly.
-     */
-
-    public enum AttachmentStrategy {
-        FIRST_ONLY, ALL
-        //anything else?
-    }
 }
diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java
index 2fd5148f7..873c885fd 100644
--- a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java
+++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/java/org/apache/tika/pipes/emitter/jdbc/JDBCEmitterTest.java
@@ -178,6 +178,44 @@ public class JDBCEmitterTest {
         }
     }
 
+    @Test
+    public void testMultiValuedFields(@TempDir Path tmpDir) throws Exception {
+        Files.createDirectories(tmpDir.resolve("db"));
+        Path dbDir = tmpDir.resolve("db/h2");
+        Path config = tmpDir.resolve("tika-config.xml");
+        String connectionString = "jdbc:h2:file:" + dbDir.toAbsolutePath();
+
+        writeConfig("/configs/tika-config-jdbc-emitter-multivalued.xml",
+                connectionString, config);
+
+        EmitterManager emitterManager = EmitterManager.load(config);
+        Emitter emitter = emitterManager.getEmitter();
+        List<Metadata> data = new ArrayList<>();
+        Metadata m = new Metadata();
+        m.add("k1", "first");
+        m.add("k1", "second");
+        m.add("k1", "third");
+        m.add("k1", "fourth");
+        data.add(m);
+        emitter.emit("id0", data);
+
+        String expected = "first, second, third, fourth";
+        int rows = 0;
+        try (Connection connection = DriverManager.getConnection(connectionString)) {
+            try (Statement st = connection.createStatement()) {
+                try (ResultSet rs = st.executeQuery("select * from test")) {
+                    assertEquals("path", rs.getMetaData().getColumnName(1).toLowerCase(Locale.US));
+                    while (rs.next()) {
+                        assertEquals("id0", rs.getString(1));
+                        assertEquals(expected, rs.getString(2));
+                        rows++;
+                    }
+                }
+            }
+        }
+        assertEquals(1, rows);
+    }
+
     private void writeConfig(String srcConfig, String dbDir, Path config) throws IOException {
         String xml = IOUtils.resourceToString(srcConfig, StandardCharsets.UTF_8);
         xml = xml.replace("CONNECTION_STRING", dbDir);
diff --git a/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml
new file mode 100644
index 000000000..a46e145f0
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-jdbc/src/test/resources/configs/tika-config-jdbc-emitter-multivalued.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+  <emitters>
+    <emitter class="org.apache.tika.pipes.emitter.jdbc.JDBCEmitter">
+      <params>
+        <name>jdbc</name>
+        <connection>CONNECTION_STRING</connection>
+        <createTable>create table test
+          (path varchar(512) primary key,
+          k1 varchar(512));
+        </createTable>
+        <!-- the jdbc emitter always puts the emitKey value as the first
+             item -->
+        <insert>insert into test (path, k1) values (?,?);
+        </insert>
+        <!-- these are the keys in the metadata object.
+            The emitKey is added as the first element in the insert statement.
+            Then these values are added in order.
+            They must be in the order of the insert statement.
+            -->
+        <keys>
+          <key k="k1" v="varchar(512)"/>
+        </keys>
+        <multivaluedFieldStrategy>concatenate</multivaluedFieldStrategy>
+        <multivaluedFieldDelimiter>, </multivaluedFieldDelimiter>
+      </params>
+    </emitter>
+  </emitters>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml
index 958f84b79..bccb2f35a 100644
--- a/tika-pipes/tika-fetchers/pom.xml
+++ b/tika-pipes/tika-fetchers/pom.xml
@@ -38,6 +38,9 @@
     <module>tika-fetcher-az-blob</module>
   </modules>
 
+  <dependencies>
+
+  </dependencies>
   <scm>
     <tag>2.2.1-rc2</tag>
   </scm>