You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/07 18:21:09 UTC
[tika] branch main updated: TIKA-3463 -- add a filelist pipes
iterator into tika-core
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new b919aed TIKA-3463 -- add a filelist pipes iterator into tika-core
b919aed is described below
commit b919aedf1f44315e16b3e23e876dee7a99808586
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jul 7 14:20:50 2021 -0400
TIKA-3463 -- add a filelist pipes iterator into tika-core
---
.../filelist/FileListPipesIterator.java | 102 +++++++++++++++++++++
.../filelist/FileListPipesIteratorTest.java | 63 +++++++++++++
.../test/resources/test-documents/file-list.txt | 10 ++
3 files changed, 175 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java
new file mode 100644
index 0000000..7d1c0db
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.pipesiterator.filelist;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.fetcher.FetchKey;
+import org.apache.tika.pipes.pipesiterator.PipesIterator;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Pipes iterator that reads file names/relative paths from a UTF-8 text file,
+ * one file name/relative path per line. Each retained line is used verbatim
+ * as the fetch key, the id and the emit key. If you need more customized
+ * control of the keys/ids, consider using the jdbc pipes iterator or the
+ * csv pipes iterator.
+ * <p>
+ * Skips lines that {@link StringUtils#isBlank(String)} reports as blank and
+ * lines starting with '#'. If {@code hasHeader} is set, the first line of
+ * the file is discarded before any other line is considered.
+ */
+public class FileListPipesIterator extends PipesIterator implements Initializable {
+
+    /** Configured location of the file list; resolved in {@link #checkInitialization}. */
+    @Field
+    private String fileList;
+
+    /** Whether the first line of the file list is a header that must be discarded. */
+    @Field
+    private boolean hasHeader = false;
+
+    /** {@link #fileList} as a {@link Path}; assigned by {@link #checkInitialization}. */
+    private Path fileListPath;
+
+    /**
+     * Reads the file list line by line and hands one {@link FetchEmitTuple}
+     * per retained line to {@code tryToAdd}.
+     *
+     * @throws IOException if the file list cannot be opened or read
+     * @throws TimeoutException declared by the parent contract; presumably raised by
+     *                          {@code tryToAdd} -- confirm against PipesIterator
+     * @throws InterruptedException declared by the parent contract; presumably raised by
+     *                              {@code tryToAdd} -- confirm against PipesIterator
+     */
+    @Override
+    protected void enqueue() throws IOException, TimeoutException, InterruptedException {
+        try (BufferedReader reader = Files.newBufferedReader(fileListPath, StandardCharsets.UTF_8)) {
+            if (hasHeader) {
+                //discard the header line
+                reader.readLine();
+            }
+            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+                //the '#' test is applied to the raw line, so a '#' preceded by
+                //whitespace is not treated as a comment marker
+                if (line.startsWith("#") || StringUtils.isBlank(line)) {
+                    continue;
+                }
+                //the same untrimmed line serves as fetch key, emit key and id
+                FetchKey fetchKey = new FetchKey(getFetcherName(), line);
+                EmitKey emitKey = new EmitKey(getEmitterName(), line);
+                tryToAdd(new FetchEmitTuple(line, fetchKey, emitKey));
+            }
+        }
+    }
+
+    /**
+     * Sets the location of the file list.
+     *
+     * @param path path to the UTF-8 file that holds one file name/relative path per line
+     */
+    @Field
+    public void setFileList(String path) {
+        this.fileList = path;
+    }
+
+    /**
+     * Sets whether the first line of the file list should be discarded.
+     *
+     * @param hasHeader {@code true} if the first line is a header
+     */
+    @Field
+    public void setHasHeader(boolean hasHeader) {
+        this.hasHeader = hasHeader;
+    }
+
+    /**
+     * Verifies that the file list, the fetcher name and the emitter name have
+     * been configured and that the file list is an existing regular file.
+     * Also resolves {@link #fileList} into {@link #fileListPath}, which
+     * {@link #enqueue()} reads from.
+     *
+     * @param problemHandler not consulted by this implementation; every problem is fatal
+     * @throws TikaConfigException if the file list is not a regular file; the
+     *                             {@code mustNotBeEmpty} checks presumably throw it as well
+     */
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        //these should all be fatal
+        TikaConfig.mustNotBeEmpty("fileList", fileList);
+        TikaConfig.mustNotBeEmpty("fetcherName", getFetcherName());
+        //validate the emitter name itself, not the fetcher name a second time
+        TikaConfig.mustNotBeEmpty("emitterName", getEmitterName());
+
+        fileListPath = Paths.get(fileList);
+        if (!Files.isRegularFile(fileListPath)) {
+            throw new TikaConfigException("file list " + fileList + " does not exist. " +
+                    "Must specify an existing file");
+        }
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java
new file mode 100644
index 0000000..e86a988
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java
@@ -0,0 +1,63 @@
+package org.apache.tika.pipes.pipesiterator.filelist;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.pipes.FetchEmitTuple;
+
+/**
+ * Exercises {@link FileListPipesIterator} against the
+ * {@code /test-documents/file-list.txt} resource, which holds ten lines:
+ * eight file names, one comment line ({@code #quick}) and one empty line.
+ */
+public class FileListPipesIteratorTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        List<String> lines = iterateFileList(false);
+        assertEquals("the", lines.get(0));
+        assertEquals(8, lines.size());
+        assertFalse(lines.contains("quick"));
+        //an unskipped comment line would surface with its '#' prefix intact
+        assertFalse(lines.contains("#quick"));
+    }
+
+    @Test
+    public void testHasHeader() throws Exception {
+        //the first line ("the") is consumed as the header
+        List<String> lines = iterateFileList(true);
+        assertEquals("brown", lines.get(0));
+        assertFalse(lines.contains("quick"));
+        //an unskipped comment line would surface with its '#' prefix intact
+        assertFalse(lines.contains("#quick"));
+        assertEquals(7, lines.size());
+    }
+
+    /**
+     * Configures an iterator over the test file list, checks that every tuple
+     * uses the same value for fetch key, emit key and id and carries the
+     * configured fetcher/emitter names, and collects the tuple ids.
+     *
+     * @param hasHeader value passed to {@link FileListPipesIterator#setHasHeader(boolean)}
+     * @return the ids of the emitted tuples, in iteration order
+     */
+    private List<String> iterateFileList(boolean hasHeader) throws Exception {
+        Path p = Paths.get(this.getClass().getResource("/test-documents/file-list.txt").toURI());
+        FileListPipesIterator it = new FileListPipesIterator();
+        it.setFetcherName("f");
+        it.setEmitterName("e");
+        it.setFileList(p.toAbsolutePath().toString());
+        it.setHasHeader(hasHeader);
+        it.checkInitialization(InitializableProblemHandler.DEFAULT);
+        List<String> lines = new ArrayList<>();
+
+        for (FetchEmitTuple t : it) {
+            assertEquals(t.getFetchKey().getFetchKey(), t.getEmitKey().getEmitKey());
+            assertEquals(t.getId(), t.getEmitKey().getEmitKey());
+            assertEquals("f", t.getFetchKey().getFetcherName());
+            assertEquals("e", t.getEmitKey().getEmitterName());
+            lines.add(t.getId());
+        }
+        return lines;
+    }
+}
diff --git a/tika-core/src/test/resources/test-documents/file-list.txt b/tika-core/src/test/resources/test-documents/file-list.txt
new file mode 100644
index 0000000..f2a73ae
--- /dev/null
+++ b/tika-core/src/test/resources/test-documents/file-list.txt
@@ -0,0 +1,10 @@
+the
+#quick
+brown
+fox
+jumps
+
+over
+the
+lazy
+dog