You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/07 18:21:09 UTC

[tika] branch main updated: TIKA-3463 -- add a filelist pipes iterator into tika-core

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new b919aed  TIKA-3463 -- add a filelist pipes iterator into tika-core
b919aed is described below

commit b919aedf1f44315e16b3e23e876dee7a99808586
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 7 14:20:50 2021 -0400

    TIKA-3463 -- add a filelist pipes iterator into tika-core
---
 .../filelist/FileListPipesIterator.java            | 102 +++++++++++++++++++++
 .../filelist/FileListPipesIteratorTest.java        |  63 +++++++++++++
 .../test/resources/test-documents/file-list.txt    |  10 ++
 3 files changed, 175 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java
new file mode 100644
index 0000000..7d1c0db
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.pipesiterator.filelist;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.fetcher.FetchKey;
+import org.apache.tika.pipes.pipesiterator.PipesIterator;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Reads a list of file names/relative paths from a UTF-8 file, one file
+ * name/relative path per line.  Each path is used for the fetch key, the id
+ * and the emit key.  If you need more customized control of the keys/ids,
+ * consider using the jdbc pipes iterator or the csv pipes iterator.
+ * <p>
+ * Skips empty lines and lines starting with '#', plus the first line if hasHeader is set.
+ */
+public class FileListPipesIterator extends PipesIterator implements Initializable {
+
+    /** Location of the file list, as configured. */
+    @Field
+    private String fileList;
+
+    /** If true, the first line of the file list is skipped. */
+    @Field
+    private boolean hasHeader = false;
+
+    /** Path built from fileList by checkInitialization; read by enqueue. */
+    private Path fileListPath;
+
+    /** Reads the file list and passes one FetchEmitTuple per retained line to tryToAdd. */
+    @Override
+    protected void enqueue() throws IOException, TimeoutException, InterruptedException {
+        try (BufferedReader reader = Files.newBufferedReader(fileListPath, StandardCharsets.UTF_8)) {
+            if (hasHeader) {
+                reader.readLine(); //discard the header line
+            }
+            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+                if (!line.startsWith("#") && !StringUtils.isBlank(line)) {
+                    FetchKey fetchKey = new FetchKey(getFetcherName(), line);
+                    EmitKey emitKey = new EmitKey(getEmitterName(), line);
+                    tryToAdd(new FetchEmitTuple(line, fetchKey, emitKey));
+                }
+            }
+        }
+    }
+
+    @Field
+    public void setFileList(String path) {
+        this.fileList = path;
+    }
+
+    @Field
+    public void setHasHeader(boolean hasHeader) {
+        this.hasHeader = hasHeader;
+    }
+
+    /** Applies mustNotBeEmpty to fileList, fetcherName and emitterName, then resolves the
+     *  path and rejects anything that is not a regular file. */
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        //these should all be fatal
+        TikaConfig.mustNotBeEmpty("fileList", fileList);
+        TikaConfig.mustNotBeEmpty("fetcherName", getFetcherName());
+        TikaConfig.mustNotBeEmpty("emitterName", getEmitterName());
+
+        fileListPath = Paths.get(fileList);
+        if (!Files.isRegularFile(fileListPath)) {
+            throw new TikaConfigException("file list " + fileList + " does not exist. " +
+                    "Must specify an existing file");
+        }
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java
new file mode 100644
index 0000000..e86a988
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java
@@ -0,0 +1,63 @@
+package org.apache.tika.pipes.pipesiterator.filelist;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.pipes.FetchEmitTuple;
+
+public class FileListPipesIteratorTest {
+
+    //file-list.txt has 10 lines: 8 names, one '#' comment ("#quick") and one empty line
+
+    /** Without a header, all 8 names come back, starting with "the". */
+    @Test
+    public void testBasic() throws Exception {
+        List<String> ids = collectIds(false);
+        assertEquals("the", ids.get(0));
+        assertEquals(8, ids.size());
+        assertFalse(ids.contains("quick"));
+    }
+
+    /** With a header, the first line ("the") is dropped, leaving 7 names. */
+    @Test
+    public void testHasHeader() throws Exception {
+        List<String> ids = collectIds(true);
+        assertEquals("brown", ids.get(0));
+        assertFalse(ids.contains("quick"));
+        assertEquals(7, ids.size());
+    }
+
+    /**
+     * Runs the iterator over the bundled file-list.txt with fetcher "f" and emitter "e",
+     * checks the keys and names of every tuple, and returns the tuple ids in order.
+     * @param hasHeader value passed to setHasHeader
+     */
+    private List<String> collectIds(boolean hasHeader) throws Exception {
+        Path listFile =
+                Paths.get(this.getClass().getResource("/test-documents/file-list.txt").toURI());
+        FileListPipesIterator iterator = new FileListPipesIterator();
+        iterator.setFetcherName("f");
+        iterator.setEmitterName("e");
+        iterator.setFileList(listFile.toAbsolutePath().toString());
+        iterator.setHasHeader(hasHeader);
+        iterator.checkInitialization(InitializableProblemHandler.DEFAULT);
+
+        List<String> ids = new ArrayList<>();
+        for (FetchEmitTuple tuple : iterator) {
+            assertEquals(tuple.getFetchKey().getFetchKey(), tuple.getEmitKey().getEmitKey());
+            assertEquals(tuple.getId(), tuple.getEmitKey().getEmitKey());
+            assertEquals("f", tuple.getFetchKey().getFetcherName());
+            assertEquals("e", tuple.getEmitKey().getEmitterName());
+            ids.add(tuple.getId());
+        }
+        return ids;
+    }
+}
diff --git a/tika-core/src/test/resources/test-documents/file-list.txt b/tika-core/src/test/resources/test-documents/file-list.txt
new file mode 100644
index 0000000..f2a73ae
--- /dev/null
+++ b/tika-core/src/test/resources/test-documents/file-list.txt
@@ -0,0 +1,10 @@
+the
+#quick
+brown
+fox
+jumps
+
+over
+the
+lazy
+dog