You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/03 13:19:43 UTC

[tika] 02/02: TIKA-3785 -- align pipes-iterator-csv with pipes-jdbc behavior

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 184cf76023fd6a5236cb6b51ad7b1c077b8fd593
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jun 3 09:19:29 2022 -0400

    TIKA-3785 -- align pipes-iterator-csv with pipes-jdbc behavior
---
 .../pipes/pipesiterator/csv/CSVPipesIterator.java  | 81 +++++++++++-----------
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-csv/src/main/java/org/apache/tika/pipes/pipesiterator/csv/CSVPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-csv/src/main/java/org/apache/tika/pipes/pipesiterator/csv/CSVPipesIterator.java
index 17ea95420..8fb441d8b 100644
--- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-csv/src/main/java/org/apache/tika/pipes/pipesiterator/csv/CSVPipesIterator.java
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-csv/src/main/java/org/apache/tika/pipes/pipesiterator/csv/CSVPipesIterator.java
@@ -123,12 +123,16 @@ public class CSVPipesIterator extends PipesIterator implements Initializable {
                 break;
             }
 
-            checkFetchEmitValidity(fetcherName, emitterName, fetchEmitKeyIndices, headers);
+            try {
+                checkFetchEmitValidity(fetcherName, emitterName, fetchEmitKeyIndices, headers);
+            } catch (TikaConfigException e) {
+                throw new IOException(e);
+            }
             HandlerConfig handlerConfig = getHandlerConfig();
             for (CSVRecord record : records) {
-                String id = getId(fetchEmitKeyIndices, record);
-                String fetchKey = getFetchKey(fetchEmitKeyIndices, record);
-                String emitKey = getEmitKey(fetchEmitKeyIndices, record);
+                String id = record.get(fetchEmitKeyIndices.idIndex);
+                String fetchKey = record.get(fetchEmitKeyIndices.fetchKeyIndex);
+                String emitKey = record.get(fetchEmitKeyIndices.emitKeyIndex);
                 if (StringUtils.isBlank(fetchKey) && !StringUtils.isBlank(fetcherName)) {
                     LOGGER.debug("Fetcher specified ({}), but no fetchkey was found in ({})",
                             fetcherName, record);
@@ -136,9 +140,7 @@ public class CSVPipesIterator extends PipesIterator implements Initializable {
                 if (StringUtils.isBlank(emitKey)) {
                     throw new IOException("emitKey must not be blank in :" + record);
                 }
-                if (StringUtils.isBlank(id) && ! StringUtils.isBlank(fetchKey)) {
-                    id = fetchKey;
-                }
+
                 Metadata metadata = loadMetadata(fetchEmitKeyIndices, headers, record);
                 tryToAdd(new FetchEmitTuple(id, new FetchKey(fetcherName, fetchKey),
                         new EmitKey(emitterName, emitKey), metadata, handlerConfig,
@@ -149,62 +151,51 @@ public class CSVPipesIterator extends PipesIterator implements Initializable {
 
     private void checkFetchEmitValidity(String fetcherName, String emitterName,
                                         FetchEmitKeyIndices fetchEmitKeyIndices,
-                                        List<String> headers) throws IOException {
+                                        List<String> headers) throws TikaConfigException {
 
         if (StringUtils.isBlank(emitterName)) {
-            throw new IOException(new TikaConfigException("must specify at least an emitterName"));
+            throw new TikaConfigException("must specify at least an emitterName");
         }
 
         if (StringUtils.isBlank(fetcherName) && !StringUtils.isBlank(fetchKeyColumn)) {
-            throw new IOException(new TikaConfigException("If specifying a 'fetchKeyColumn', " +
-                    "you must also specify a 'fetcherName'"));
+            new TikaConfigException("If specifying a 'fetchKeyColumn', " +
+                    "you must also specify a 'fetcherName'");
         }
 
         if (StringUtils.isBlank(fetcherName)) {
-            LOGGER.debug("No fetcher specified. This will be metadata only");
+            LOGGER.info("No fetcher specified. This will be metadata only");
         }
 
+        if (StringUtils.isBlank(fetchKeyColumn)) {
+            throw new TikaConfigException("must specify fetchKeyColumn");
+        }
         //if a fetchkeycolumn is specified, make sure that it was found
         if (!StringUtils.isBlank(fetchKeyColumn) && fetchEmitKeyIndices.fetchKeyIndex < 0) {
-            throw new IOException(new TikaConfigException(
+            throw new TikaConfigException(
                     "Couldn't find fetchKeyColumn (" + fetchKeyColumn + " in header.\n" +
-                            "These are the headers I see: " + headers));
+                            "These are the headers I see: " + headers);
         }
 
         //if an emitkeycolumn is specified, make sure that it was found
         if (!StringUtils.isBlank(emitKeyColumn) && fetchEmitKeyIndices.emitKeyIndex < 0) {
-            throw new IOException(new TikaConfigException(
+            throw new TikaConfigException(
                     "Couldn't find emitKeyColumn (" + emitKeyColumn + " in header.\n" +
-                            "These are the headers I see: " + headers));
+                            "These are the headers I see: " + headers);
+        }
+
+        //if an idcolumn is specified, make sure that it was found
+        if (!StringUtils.isBlank(idColumn) && fetchEmitKeyIndices.idIndex < 0) {
+            throw new TikaConfigException(
+                    "Couldn't find idColumn (" + idColumn + " in header.\n" +
+                            "These are the headers I see: " + headers);
         }
 
         if (StringUtils.isBlank(emitKeyColumn)) {
-            LOGGER.debug("No emitKeyColumn specified. " +
+            LOGGER.warn("No emitKeyColumn specified. " +
                             "Will use fetchKeyColumn ({}) for both the fetch key and emit key",
                     fetchKeyColumn);
         }
-    }
-
-    private String getId(FetchEmitKeyIndices fetchEmitKeyIndices, CSVRecord record) {
-        if (fetchEmitKeyIndices.idIndex > -1) {
-            return record.get(fetchEmitKeyIndices.idIndex);
-        }
-        return StringUtils.EMPTY;
-    }
-
-
-    private String getFetchKey(FetchEmitKeyIndices fetchEmitKeyIndices, CSVRecord record) {
-        if (fetchEmitKeyIndices.fetchKeyIndex > -1) {
-            return record.get(fetchEmitKeyIndices.fetchKeyIndex);
-        }
-        return StringUtils.EMPTY;
-    }
 
-    private String getEmitKey(FetchEmitKeyIndices fetchEmitKeyIndices, CSVRecord record) {
-        if (fetchEmitKeyIndices.emitKeyIndex > -1) {
-            return record.get(fetchEmitKeyIndices.emitKeyIndex);
-        }
-        return getFetchKey(fetchEmitKeyIndices, record);
     }
 
     private Metadata loadMetadata(FetchEmitKeyIndices fetchEmitKeyIndices, List<String> headers,
@@ -240,6 +231,16 @@ public class CSVPipesIterator extends PipesIterator implements Initializable {
                 idIndex = col;
             }
         }
+
+        if (StringUtils.isBlank(idColumn)) {
+            LOGGER.info("no idColumn specified, will use fetchKeyColumn");
+            idIndex = fetchKeyColumnIndex;
+        }
+
+        if (StringUtils.isBlank(emitKeyColumn)) {
+            LOGGER.info("no emitKeyColumn specified, will use fetchKeyColumn");
+            emitKeyColumnIndex = fetchKeyColumnIndex;
+        }
         return new FetchEmitKeyIndices(idIndex, fetchKeyColumnIndex, emitKeyColumnIndex);
     }
 
@@ -251,9 +252,9 @@ public class CSVPipesIterator extends PipesIterator implements Initializable {
     }
 
     private static class FetchEmitKeyIndices {
-        private final int idIndex;
+        private int idIndex;
         private final int fetchKeyIndex;
-        private final int emitKeyIndex;
+        private int emitKeyIndex;
 
         public FetchEmitKeyIndices(int idIndex, int fetchKeyIndex, int emitKeyIndex) {
             this.idIndex = idIndex;