You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/08/16 17:56:35 UTC

[tika] branch main updated (a2b21f8 -> 48d9389)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from a2b21f8  TIKA-3510 -- further fixes
     new 5985b62  TIKA-3510 -- further clean up
     new f916619  TIKA-3523 -- improve documentation
     new 48d9389  TIKA-3524 -- add tika-pipes support for google cloud storage

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../tika/pipes/fetcher/fs/FileSystemFetcher.java   |  10 ++
 tika-parent/pom.xml                                |   1 +
 .../tika-parser-scientific-module/pom.xml          |  16 +-
 .../tika-parser-scientific-package/pom.xml         | 171 +------------------
 .../tika-parser-sqlite3-module/pom.xml             |   2 -
 .../tika-parser-sqlite3-package/pom.xml            |  10 --
 tika-pipes/tika-emitters/pom.xml                   |   1 +
 .../{tika-emitter-fs => tika-emitter-gcs}/pom.xml  |  18 +-
 .../apache/tika/pipes/emitter/gcs/GCSEmitter.java  | 184 +++++++++++++++++++++
 .../tika/pipes/emitter/gcs/TestGCSEmitter.java     |  34 ++--
 .../src/test/resources/config/tika-config-gcs.xml  |  13 +-
 tika-pipes/tika-fetchers/pom.xml                   |   1 +
 .../{tika-fetcher-s3 => tika-fetcher-gcs}/pom.xml  |  54 +-----
 .../apache/tika/pipes/fetcher/gcs/GCSFetcher.java  | 135 +++++++++++++++
 .../tika/pipes/fetcher/s3/TestGCSFetcher.java}     |  36 ++--
 .../src/test/resources/tika-config-gcs.xml}        |   6 +-
 tika-pipes/tika-pipes-iterators/pom.xml            |   1 +
 .../pom.xml                                        |  13 +-
 .../pipes/pipesiterator/gcs/GCSPipesIterator.java} |  90 ++++------
 .../pipesiterator/gcs/TestGCSPipesIterator.java}   |  22 +--
 .../src/test/resources/log4j.properties            |   0
 21 files changed, 457 insertions(+), 361 deletions(-)
 copy tika-pipes/tika-emitters/{tika-emitter-fs => tika-emitter-gcs}/pom.xml (92%)
 create mode 100644 tika-pipes/tika-emitters/tika-emitter-gcs/src/main/java/org/apache/tika/pipes/emitter/gcs/GCSEmitter.java
 copy tika-server/tika-server-client/src/test/java/org/apache/tika/server/client/TestBasic.java => tika-pipes/tika-emitters/tika-emitter-gcs/src/test/java/org/apache/tika/pipes/emitter/gcs/TestGCSEmitter.java (51%)
 copy tika-core/src/test/resources/org/apache/tika/config/emitters-duplicate-config.xml => tika-pipes/tika-emitters/tika-emitter-gcs/src/test/resources/config/tika-config-gcs.xml (76%)
 copy tika-pipes/tika-fetchers/{tika-fetcher-s3 => tika-fetcher-gcs}/pom.xml (71%)
 create mode 100644 tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java
 copy tika-pipes/tika-fetchers/{tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java => tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java} (68%)
 copy tika-pipes/tika-fetchers/{tika-fetcher-http/src/test/resources/tika-config-http.xml => tika-fetcher-gcs/src/test/resources/tika-config-gcs.xml} (80%)
 copy tika-pipes/tika-pipes-iterators/{tika-pipes-iterator-jdbc => tika-pipes-iterator-gcs}/pom.xml (92%)
 copy tika-pipes/tika-pipes-iterators/{tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java => tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java} (53%)
 copy tika-pipes/tika-pipes-iterators/{tika-pipes-iterator-s3/src/test/java/org/apache/tika/pipes/pipesiterator/s3/TestS3PipesIterator.java => tika-pipes-iterator-gcs/src/test/java/org/apache/tika/pipes/pipesiterator/gcs/TestGCSPipesIterator.java} (86%)
 copy tika-pipes/tika-pipes-iterators/{tika-pipes-iterator-s3 => tika-pipes-iterator-gcs}/src/test/resources/log4j.properties (100%)

[tika] 02/03: TIKA-3523 -- improve documentation

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f916619f32000cbc872bcd6a064243284e9897a5
Author: tallison <ta...@apache.org>
AuthorDate: Mon Aug 16 10:25:34 2021 -0400

    TIKA-3523 -- improve documentation
---
 .../org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java    | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java
index 5c22e7f..b104b53 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java
@@ -100,7 +100,17 @@ public class FileSystemFetcher extends AbstractFetcher implements Initializable
             throws TikaConfigException {
         if (basePath == null || basePath.toString().trim().length() == 0) {
             throw new TikaConfigException("'basePath' must be specified");
+        } else if (basePath.toString().startsWith("http://")) {
+            throw new TikaConfigException("FileSystemFetcher only works with local file systems. " +
+                    " Please use the tika-fetcher-http module for http calls");
+        } else if (basePath.toString().startsWith("ftp://")) {
+            throw new TikaConfigException("FileSystemFetcher only works with local file systems. " +
+                    " Please consider contributing an ftp fetcher module");
+        } else if (basePath.toString().startsWith("s3://")) {
+            //compare the string form, exactly like the http/ftp branches; Path#startsWith(String)
+            //parses its argument as a path and matches whole name elements instead of a prefix
+            throw new TikaConfigException("FileSystemFetcher only works with local file systems. " +
+                    " Please use the tika-fetcher-s3 module");
         }
+
         if (basePath.toAbsolutePath().toString().contains("\u0000")) {
             throw new TikaConfigException(
                     "base path must not contain \u0000. " + "Seriously, what were you thinking?");

[tika] 03/03: TIKA-3524 -- add tika-pipes support for google cloud storage

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 48d9389125e217a9c6301840ff39a3c4ee13d742
Author: tallison <ta...@apache.org>
AuthorDate: Mon Aug 16 13:56:14 2021 -0400

    TIKA-3524 -- add tika-pipes support for google cloud storage
---
 tika-parent/pom.xml                                |   1 +
 tika-pipes/tika-emitters/pom.xml                   |   1 +
 tika-pipes/tika-emitters/tika-emitter-gcs/pom.xml  | 118 +++++++++++++
 .../apache/tika/pipes/emitter/gcs/GCSEmitter.java  | 184 +++++++++++++++++++++
 .../tika/pipes/emitter/gcs/TestGCSEmitter.java     |  51 ++++++
 .../src/test/resources/config/tika-config-gcs.xml  |  28 ++++
 tika-pipes/tika-fetchers/pom.xml                   |   1 +
 tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml  | 110 ++++++++++++
 .../apache/tika/pipes/fetcher/gcs/GCSFetcher.java  | 135 +++++++++++++++
 .../tika/pipes/fetcher/s3/TestGCSFetcher.java      |  63 +++++++
 .../src/test/resources/tika-config-gcs.xml         |  28 ++++
 tika-pipes/tika-pipes-iterators/pom.xml            |   1 +
 .../tika-pipes-iterator-gcs/pom.xml                | 112 +++++++++++++
 .../pipes/pipesiterator/gcs/GCSPipesIterator.java  | 123 ++++++++++++++
 .../pipesiterator/gcs/TestGCSPipesIterator.java    | 102 ++++++++++++
 .../src/test/resources/log4j.properties            |  22 +++
 16 files changed, 1080 insertions(+)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 0fe2354..753b0de 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -303,6 +303,7 @@
     <!-- fakeload versions &gt; 0.4.0 require java > 8 -->
     <fakeload.version>0.4.0</fakeload.version>
     <geoapi.version>3.0.1</geoapi.version>
+    <google.cloud.version>2.0.1</google.cloud.version>
     <gson.version>2.8.7</gson.version>
     <guava.version>30.1.1-jre</guava.version>
     <h2.version>1.4.200</h2.version>
diff --git a/tika-pipes/tika-emitters/pom.xml b/tika-pipes/tika-emitters/pom.xml
index e6a338e..db1a4b6 100644
--- a/tika-pipes/tika-emitters/pom.xml
+++ b/tika-pipes/tika-emitters/pom.xml
@@ -36,5 +36,6 @@
     <module>tika-emitter-s3</module>
     <module>tika-emitter-solr</module>
     <module>tika-emitter-opensearch</module>
+    <module>tika-emitter-gcs</module>
   </modules>
 </project>
\ No newline at end of file
diff --git a/tika-pipes/tika-emitters/tika-emitter-gcs/pom.xml b/tika-pipes/tika-emitters/tika-emitter-gcs/pom.xml
new file mode 100644
index 0000000..9b17e51
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-gcs/pom.xml
@@ -0,0 +1,118 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <parent>
+    <artifactId>tika-emitters</artifactId>
+    <groupId>org.apache.tika</groupId>
+    <version>2.1.0-SNAPSHOT</version>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+
+  <artifactId>tika-emitter-gcs</artifactId>
+  <name>Apache Tika GCS emitter</name>
+
+
+  <dependencies>
+    <!-- should serialization be provided or bundled? -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-serialization</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.google.cloud</groupId>
+      <artifactId>google-cloud-storage</artifactId>
+      <version>${google.cloud.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifestEntries>
+              <Automatic-Module-Name>org.apache.tika.pipes.emitter.gcs</Automatic-Module-Name>
+            </manifestEntries>
+          </archive>
+        </configuration>
+        <executions>
+          <execution>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>${maven.shade.version}</version>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <createDependencyReducedPom>
+                false
+              </createDependencyReducedPom>
+              <!-- <filters> -->
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/*</exclude>
+                    <exclude>LICENSE.txt</exclude>
+                    <exclude>NOTICE.txt</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                  <resource>META-INF/LICENSE</resource>
+                  <file>target/classes/META-INF/LICENSE</file>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                  <resource>META-INF/NOTICE</resource>
+                  <file>target/classes/META-INF/NOTICE</file>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                  <resource>META-INF/DEPENDENCIES</resource>
+                  <file>target/classes/META-INF/DEPENDENCIES</file>
+                </transformer>
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+
+    </plugins>
+  </build>
+</project>
\ No newline at end of file
diff --git a/tika-pipes/tika-emitters/tika-emitter-gcs/src/main/java/org/apache/tika/pipes/emitter/gcs/GCSEmitter.java b/tika-pipes/tika-emitters/tika-emitter-gcs/src/main/java/org/apache/tika/pipes/emitter/gcs/GCSEmitter.java
new file mode 100644
index 0000000..e4c03c4
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-gcs/src/main/java/org/apache/tika/pipes/emitter/gcs/GCSEmitter.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.emitter.gcs;
+
+import static org.apache.tika.config.TikaConfig.mustNotBeEmpty;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.google.cloud.storage.BlobId;
+import com.google.cloud.storage.BlobInfo;
+import com.google.cloud.storage.Storage;
+import com.google.cloud.storage.StorageOptions;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.pipes.emitter.AbstractEmitter;
+import org.apache.tika.pipes.emitter.StreamEmitter;
+import org.apache.tika.pipes.emitter.TikaEmitterException;
+import org.apache.tika.utils.StringUtils;
+
+
+/**
+ * Emits to a Google Cloud Storage bucket: metadata lists are serialized to JSON, streams are
+ * copied byte-for-byte. Both <code>projectId</code> and <code>bucket</code> must be configured.
+ */
+public class GCSEmitter extends AbstractEmitter implements Initializable, StreamEmitter {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(GCSEmitter.class);
+    private String projectId;
+    private String bucket;
+    private String fileExtension = "json";
+    private String prefix = null;
+    private Storage storage;
+
+    /**
+     * Serializes the metadata list to UTF-8 JSON and writes it to the bucket. The object name
+     * is built from <code>emitKey</code>; {@link TikaCoreProperties#SOURCE_PATH} is not read.
+     *
+     * @param emitKey      object path, not including the bucket, prefix or file extension
+     * @param metadataList metadata to serialize; must not be null or empty
+     * @throws IOException          not raised directly by this implementation
+     * @throws TikaEmitterException (presumably a {@link TikaException} subtype -- confirm) if
+     *                              the list is null/empty or cannot be serialized to JSON
+     */
+    @Override
+    public void emit(String emitKey, List<Metadata> metadataList)
+            throws IOException, TikaEmitterException {
+        if (metadataList == null || metadataList.isEmpty()) {
+            throw new TikaEmitterException("metadata list must not be null or of size 0");
+        }
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (Writer writer = new OutputStreamWriter(bos, StandardCharsets.UTF_8)) {
+            JsonMetadataList.toJson(metadataList, writer);
+        } catch (IOException e) {
+            throw new TikaEmitterException("can't jsonify", e);
+        }
+
+        //the json object itself carries no user metadata
+        write(emitKey, new Metadata(), bos.toByteArray());
+    }
+
+    /**
+     * Reads the whole stream into memory and writes it to the bucket.
+     *
+     * @param path         object path, not including the bucket
+     * @param is           inputStream to copy; if it is a file-backed {@link TikaInputStream}
+     *                     the bytes are read from that file instead of the stream
+     * @param userMetadata written to the GCS object's metadata (first value per key only)
+     * @throws TikaEmitterException not raised directly by this implementation
+     * @throws IOException          if the stream or its backing file cannot be read
+     */
+    @Override
+    public void emit(String path, InputStream is, Metadata userMetadata)
+            throws IOException, TikaEmitterException {
+
+        if (is instanceof TikaInputStream && ((TikaInputStream) is).hasFile()) {
+            write(path, userMetadata, Files.readAllBytes(((TikaInputStream) is).getPath()));
+        } else {
+            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            IOUtils.copy(is, bos);
+            write(path, userMetadata, bos.toByteArray());
+        }
+    }
+
+    /**
+     * Uploads the bytes to <code>bucket</code> under [prefix/]path[.fileExtension].
+     *
+     * @param path         object path before the prefix and file extension are applied
+     * @param userMetadata key/value pairs to attach to the object; only the first value of a
+     *                     multi-valued key is kept
+     * @param bytes        object content
+     */
+    private void write(String path, Metadata userMetadata, byte[] bytes) {
+        if (!StringUtils.isBlank(prefix)) {
+            path = prefix + "/" + path;
+        }
+
+        if (!StringUtils.isBlank(fileExtension)) {
+            path += "." + fileExtension;
+        }
+
+        LOGGER.debug("about to emit to target bucket: ({}) path:({})", bucket, path);
+
+        //A built BlobInfo cannot be modified and its getMetadata() is null when nothing was
+        //set, so the metadata has to be collected first and handed to the builder.
+        Map<String, String> blobMetadata = new HashMap<>();
+        for (String n : userMetadata.names()) {
+            String[] vals = userMetadata.getValues(n);
+            if (vals.length == 0) {
+                continue;
+            }
+            if (vals.length > 1) {
+                LOGGER.warn("Can only write the first value for key {}. I see {} values.", n,
+                        vals.length);
+            }
+            blobMetadata.put(n, vals[0]);
+        }
+
+        BlobInfo.Builder builder = BlobInfo.newBuilder(BlobId.of(bucket, path));
+        if (!blobMetadata.isEmpty()) {
+            builder.setMetadata(blobMetadata);
+        }
+        storage.create(builder.build(), bytes);
+    }
+
+
+    @Field
+    public void setProjectId(String projectId) {
+        this.projectId = projectId;
+    }
+
+    @Field
+    public void setBucket(String bucket) {
+        this.bucket = bucket;
+    }
+
+    /**
+     * @param prefix optional prefix prepended (with a "/") to every object path; a single
+     *               trailing "/" is stripped. <code>null</code> means no prefix.
+     */
+    @Field
+    public void setPrefix(String prefix) {
+        //strip final "/" if it exists; tolerate null instead of failing on endsWith
+        if (prefix != null && prefix.endsWith("/")) {
+            this.prefix = prefix.substring(0, prefix.length() - 1);
+        } else {
+            this.prefix = prefix;
+        }
+    }
+
+    /**
+     * If you want to customize the output file's file extension.
+     * Do not include the "."; a blank value means no extension is appended.
+     *
+     * @param fileExtension extension appended to every object path, "json" by default
+     */
+    @Field
+    public void setFileExtension(String fileExtension) {
+        this.fileExtension = fileExtension;
+    }
+
+
+    /**
+     * This initializes the gcs client for the configured <code>projectId</code>.
+     *
+     * @param params params to use for initialization; ignored, the setters were already called
+     * @throws TikaConfigException not raised directly by this implementation
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        //params have already been set...ignore them
+        //TODO -- add other params to the builder as needed
+        storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService();
+    }
+
+    /**
+     * Verifies that the mandatory <code>bucket</code> and <code>projectId</code> were set.
+     *
+     * @param problemHandler not used by this check
+     * @throws TikaConfigException if either required field is empty
+     */
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        mustNotBeEmpty("bucket", this.bucket);
+        mustNotBeEmpty("projectId", this.projectId);
+    }
+
+}
diff --git a/tika-pipes/tika-emitters/tika-emitter-gcs/src/test/java/org/apache/tika/pipes/emitter/gcs/TestGCSEmitter.java b/tika-pipes/tika-emitters/tika-emitter-gcs/src/test/java/org/apache/tika/pipes/emitter/gcs/TestGCSEmitter.java
new file mode 100644
index 0000000..aaee49e
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-gcs/src/test/java/org/apache/tika/pipes/emitter/gcs/TestGCSEmitter.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.emitter.gcs;
+
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.emitter.Emitter;
+import org.apache.tika.pipes.emitter.EmitterManager;
+
+/**
+ * Manual smoke test: emits one small metadata list through the "gcs" emitter declared in
+ * the test config. Disabled because it writes to a live bucket.
+ */
+@Disabled("turn into an actual test")
+public class TestGCSEmitter {
+
+    @Test
+    public void testBasic() throws Exception {
+        //k1 is deliberately multi-valued
+        Metadata metadata = new Metadata();
+        metadata.set("k1", "v1");
+        metadata.add("k1", "v2");
+        metadata.set("k2", "v3");
+        List<Metadata> toEmit = new ArrayList<>();
+        toEmit.add(metadata);
+
+        Path config = getConfig("tika-config-gcs.xml");
+        Emitter gcsEmitter = EmitterManager.load(config).getEmitter("gcs");
+        gcsEmitter.emit("something-or-other/test-out", toEmit);
+    }
+
+    /** Resolves a file under the classpath's /config/ directory to a local path. */
+    private Path getConfig(String configFile) throws URISyntaxException {
+        String resource = "/config/" + configFile;
+        return Paths.get(this.getClass().getResource(resource).toURI());
+    }
+}
diff --git a/tika-pipes/tika-emitters/tika-emitter-gcs/src/test/resources/config/tika-config-gcs.xml b/tika-pipes/tika-emitters/tika-emitter-gcs/src/test/resources/config/tika-config-gcs.xml
new file mode 100644
index 0000000..b45ec31
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-gcs/src/test/resources/config/tika-config-gcs.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <emitters>
+        <emitter class="org.apache.tika.pipes.emitter.gcs.GCSEmitter">
+            <params>
+                <name>gcs</name>
+                <projectId>My First Project</projectId>
+                <bucket>tika-tallison-test-bucket</bucket>
+            </params>
+        </emitter>
+    </emitters>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml
index 8fa2681..3086175 100644
--- a/tika-pipes/tika-fetchers/pom.xml
+++ b/tika-pipes/tika-fetchers/pom.xml
@@ -34,5 +34,6 @@
   <modules>
     <module>tika-fetcher-http</module>
     <module>tika-fetcher-s3</module>
+    <module>tika-fetcher-gcs</module>
   </modules>
 </project>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml
new file mode 100644
index 0000000..0e47e64
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml
@@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>tika-fetchers</artifactId>
+        <groupId>org.apache.tika</groupId>
+        <version>2.1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>tika-fetcher-gcs</artifactId>
+    <name>Apache Tika Google Cloud Storage fetcher</name>
+
+    <dependencies>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.google.cloud</groupId>
+            <artifactId>google-cloud-storage</artifactId>
+            <version>${google.cloud.version}</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            <Automatic-Module-Name>org.apache.tika.pipes.fetcher.gcs</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    <file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    <file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
+</project>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java
new file mode 100644
index 0000000..6881c5a
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetcher.gcs;
+
+import static org.apache.tika.config.TikaConfig.mustNotBeEmpty;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Path;
+import java.util.Map;
+
+import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.BlobId;
+import com.google.cloud.storage.Storage;
+import com.google.cloud.storage.StorageOptions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.fetcher.AbstractFetcher;
+
+/**
+ * Fetches files from google cloud storage. Must set projectId and bucket via the config.
+ */
+public class GCSFetcher extends AbstractFetcher implements Initializable {
+
+    private static final String PREFIX = "gcs";
+    private static final Logger LOGGER = LoggerFactory.getLogger(GCSFetcher.class);
+    private String projectId;
+    private String bucket;
+    private boolean extractUserMetadata = true;
+    private Storage storage;
+    private boolean spoolToTemp = true;
+
+    /**
+     * Fetches the blob named <code>fetchKey</code> from the configured bucket.
+     *
+     * @param fetchKey name of the blob within the bucket
+     * @param metadata if extractUserMetadata is true, the blob's user metadata is added
+     *                 to this object under keys prefixed with "gcs:"
+     * @return a TikaInputStream over a local temp copy of the blob if spoolToTemp is true,
+     *         otherwise over the bytes returned by blob.getContent()
+     * @throws IOException if the blob does not exist or anything else goes wrong; failures
+     *                     are wrapped with the message "gcs storage exception"
+     */
+    @Override
+    public InputStream fetch(String fetchKey, Metadata metadata) throws TikaException, IOException {
+
+        LOGGER.debug("about to fetch fetchkey={} from bucket ({})", fetchKey, bucket);
+
+        try {
+            Blob blob = storage.get(BlobId.of(bucket, fetchKey));
+            if (blob == null) {
+                //storage.get() returns null rather than throwing when there is no such blob
+                throw new IOException("blob not found: bucket=" + bucket + " key=" + fetchKey);
+            }
+
+            if (extractUserMetadata && blob.getMetadata() != null) {
+                for (Map.Entry<String, String> e : blob.getMetadata().entrySet()) {
+                    metadata.add(PREFIX + ":" + e.getKey(), e.getValue());
+                }
+            }
+            if (!spoolToTemp) {
+                return TikaInputStream.get(blob.getContent());
+            }
+            return spoolToTempFile(blob, metadata);
+        } catch (Exception e) {
+            throw new IOException("gcs storage exception", e);
+        }
+    }
+
+    /**
+     * Downloads the blob to a local temp file and wraps that file in a TikaInputStream.
+     * If anything fails before the stream is handed back, the TemporaryResources are
+     * closed here so that the temp file is not left behind.
+     */
+    private TikaInputStream spoolToTempFile(Blob blob, Metadata metadata) throws IOException {
+        long start = System.currentTimeMillis();
+        TemporaryResources tmpResources = new TemporaryResources();
+        try {
+            Path tmp = tmpResources.createTempFile();
+            blob.downloadTo(tmp);
+            TikaInputStream tis = TikaInputStream.get(tmp, metadata, tmpResources);
+            long elapsed = System.currentTimeMillis() - start;
+            LOGGER.debug("took {} ms to copy to local tmp file", elapsed);
+            return tis;
+        } catch (IOException | RuntimeException e) {
+            try {
+                tmpResources.close();
+            } catch (IOException closeEx) {
+                e.addSuppressed(closeEx);
+            }
+            throw e;
+        }
+    }
+
+    /**
+     * Whether to copy the blob to a local temp file before returning the stream
+     * (default: true). If false, the stream wraps the bytes from blob.getContent().
+     *
+     * @param spoolToTemp true to spool to a temp file
+     */
+    @Field
+    public void setSpoolToTemp(boolean spoolToTemp) {
+        this.spoolToTemp = spoolToTemp;
+    }
+
+    /**
+     * @param projectId the google cloud project id; required
+     */
+    @Field
+    public void setProjectId(String projectId) {
+        this.projectId = projectId;
+    }
+
+    /**
+     * @param bucket name of the bucket to fetch from; required
+     */
+    @Field
+    public void setBucket(String bucket) {
+        this.bucket = bucket;
+    }
+
+    /**
+     * Whether or not to extract user metadata from the blob (default: true)
+     *
+     * @param extractUserMetadata true to add the blob's user metadata to the Metadata
+     */
+    @Field
+    public void setExtractUserMetadata(boolean extractUserMetadata) {
+        this.extractUserMetadata = extractUserMetadata;
+    }
+
+    //TODO: parameterize extracting other blob metadata, eg. md5, crc, etc.
+
+    /**
+     * This initializes the gcs storage client.
+     *
+     * @param params params to use for initialization
+     * @throws TikaConfigException
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        //params have already been set...ignore them
+        //TODO -- add other params to the builder as needed
+        storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService();
+    }
+
+    /**
+     * Verifies that the required bucket and projectId have been set.
+     */
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        mustNotBeEmpty("bucket", this.bucket);
+        mustNotBeEmpty("projectId", this.projectId);
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java
new file mode 100644
index 0000000..35aabbe
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetcher.s3;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.fetcher.Fetcher;
+import org.apache.tika.pipes.fetcher.FetcherManager;
+
+@Disabled("write actual unit tests")
+public class TestGCSFetcher {
+
+    private static final String FETCH_STRING = "testExtraSpaces.pdf";
+    private static Path outputFile;
+
+    /** Creates the scratch file that the fetched content is copied into. */
+    @BeforeAll
+    public static void setUp() throws Exception {
+        outputFile = Files.createTempFile("tika-test", ".pdf");
+    }
+
+    /** Deletes the scratch file. */
+    @AfterAll
+    public static void tearDown() throws Exception {
+        Files.delete(outputFile);
+    }
+
+    /**
+     * Loads the fetcher named "gcs" from tika-config-gcs.xml, fetches FETCH_STRING and
+     * checks the size of the downloaded copy.
+     */
+    @Test
+    public void testConfig() throws Exception {
+        Path config = Paths.get(this.getClass().getResource("/tika-config-gcs.xml").toURI());
+        Fetcher gcsFetcher = FetcherManager.load(config).getFetcher("gcs");
+        try (InputStream fetched = gcsFetcher.fetch(FETCH_STRING, new Metadata())) {
+            Files.copy(fetched, outputFile, StandardCopyOption.REPLACE_EXISTING);
+        }
+        assertEquals(20743, Files.size(outputFile));
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/resources/tika-config-gcs.xml b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/resources/tika-config-gcs.xml
new file mode 100644
index 0000000..eee110d
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/resources/tika-config-gcs.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <fetchers>
+        <fetcher class="org.apache.tika.pipes.fetcher.gcs.GCSFetcher">
+            <params>
+                <name>gcs</name>
+                <projectId>My First Project</projectId>
+                <bucket>tika-tallison-test-bucket</bucket>
+            </params>
+        </fetcher>
+    </fetchers>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-pipes-iterators/pom.xml b/tika-pipes/tika-pipes-iterators/pom.xml
index 72a5912..337147d 100644
--- a/tika-pipes/tika-pipes-iterators/pom.xml
+++ b/tika-pipes/tika-pipes-iterators/pom.xml
@@ -38,5 +38,6 @@
     <module>tika-pipes-iterator-jdbc</module>
     <module>tika-pipes-iterator-s3</module>
     <module>tika-pipes-iterator-solr</module>
+    <module>tika-pipes-iterator-gcs</module>
   </modules>
 </project>
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/pom.xml b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/pom.xml
new file mode 100644
index 0000000..5b37973
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/pom.xml
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-pipes-iterators</artifactId>
+    <version>2.1.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+
+  <artifactId>tika-pipes-iterator-gcs</artifactId>
+
+  <name>Apache Tika Fetch Iterator - Google Cloud Storage</name>
+  <url>https://tika.apache.org/</url>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.google.cloud</groupId>
+      <artifactId>google-cloud-storage</artifactId>
+      <version>${google.cloud.version}</version>
+    </dependency>
+  </dependencies>
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifestEntries>
+              <Automatic-Module-Name>org.apache.tika.pipes.pipesiterator.gcs</Automatic-Module-Name>
+            </manifestEntries>
+          </archive>
+        </configuration>
+        <executions>
+          <execution>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>${maven.shade.version}</version>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <createDependencyReducedPom>
+                false
+              </createDependencyReducedPom>
+              <!-- <filters> -->
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/*</exclude>
+                    <exclude>LICENSE.txt</exclude>
+                    <exclude>NOTICE.txt</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                  <resource>META-INF/LICENSE</resource>
+                  <file>target/classes/META-INF/LICENSE</file>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                  <resource>META-INF/NOTICE</resource>
+                  <file>target/classes/META-INF/NOTICE</file>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                  <resource>META-INF/DEPENDENCIES</resource>
+                  <file>target/classes/META-INF/DEPENDENCIES</file>
+                </transformer>
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+
+    </plugins>
+  </build>
+</project>
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java
new file mode 100644
index 0000000..a9d052b
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.pipesiterator.gcs;
+
+import static org.apache.tika.config.TikaConfig.mustNotBeEmpty;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.concurrent.TimeoutException;
+
+import com.google.api.gax.paging.Page;
+import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.Storage;
+import com.google.cloud.storage.StorageOptions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.HandlerConfig;
+import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.fetcher.FetchKey;
+import org.apache.tika.pipes.pipesiterator.PipesIterator;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Pipes iterator that lists the blobs in a google cloud storage bucket, optionally
+ * restricted to a name prefix, and enqueues one FetchEmitTuple per non-empty blob.
+ */
+public class GCSPipesIterator extends PipesIterator implements Initializable {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(GCSPipesIterator.class);
+    private String prefix = "";
+    private String projectId = "";
+    private String bucket;
+
+    private Storage storage;
+
+    /**
+     * @param bucket name of the bucket to list; must not be empty
+     */
+    @Field
+    public void setBucket(String bucket) {
+        this.bucket = bucket;
+    }
+
+    /**
+     * @param prefix if not blank, only blobs whose names start with this prefix are listed
+     */
+    @Field
+    public void setPrefix(String prefix) {
+        this.prefix = prefix;
+    }
+
+    /**
+     * @param projectId project id passed to the storage client; must not be empty
+     */
+    @Field
+    public void setProjectId(String projectId) {
+        this.projectId = projectId;
+    }
+
+    /**
+     * Builds the gcs client for the configured project.
+     *
+     * @param params not used here
+     * @throws TikaConfigException
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        //TODO -- add other params to the builder as needed
+        StorageOptions options = StorageOptions.newBuilder().setProjectId(projectId).build();
+        storage = options.getService();
+    }
+
+    /**
+     * Runs the parent's checks and then requires that bucket and projectId are not empty.
+     */
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        super.checkInitialization(problemHandler);
+        mustNotBeEmpty("bucket", this.bucket);
+        mustNotBeEmpty("projectId", this.projectId);
+    }
+
+    /**
+     * Lists the bucket and tries to add one tuple for every blob whose size is not zero.
+     * The blob name serves as the tuple id, the fetch key and the emit key.
+     */
+    @Override
+    protected void enqueue() throws InterruptedException, IOException, TimeoutException {
+        final String fetcher = getFetcherName();
+        final String emitter = getEmitterName();
+        final long started = System.currentTimeMillis();
+        final HandlerConfig handlerConfig = getHandlerConfig();
+
+        Page<Blob> page = StringUtils.isBlank(prefix) ? storage.list(bucket) :
+                storage.list(bucket, Storage.BlobListOption.prefix(prefix));
+
+        int added = 0;
+        for (Blob blob : page.iterateAll()) {
+            //zero-length blobs are skipped as a way of skipping directories;
+            //calling blob.isDirectory() did not appear to work for this
+            if (blob.getSize() == 0) {
+                continue;
+            }
+            String name = blob.getName();
+            LOGGER.debug("adding ({}) {} in {} ms", added, name,
+                    System.currentTimeMillis() - started);
+            //TODO -- allow user specified metadata as the "id"?
+            FetchKey fetchKey = new FetchKey(fetcher, name);
+            EmitKey emitKey = new EmitKey(emitter, name);
+            tryToAdd(new FetchEmitTuple(name, fetchKey, emitKey, new Metadata(),
+                    handlerConfig, getOnParseException()));
+            added++;
+        }
+        LOGGER.info("finished enqueuing {} files in {} ms", added,
+                System.currentTimeMillis() - started);
+    }
+}
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/test/java/org/apache/tika/pipes/pipesiterator/gcs/TestGCSPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/test/java/org/apache/tika/pipes/pipesiterator/gcs/TestGCSPipesIterator.java
new file mode 100644
index 0000000..5fa51ab
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/test/java/org/apache/tika/pipes/pipesiterator/gcs/TestGCSPipesIterator.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.pipesiterator.gcs;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.pipesiterator.PipesIterator;
+import org.apache.tika.pipes.pipesiterator.gcs.GCSPipesIterator;
+
+@Disabled("turn into an actual unit test")
+public class TestGCSPipesIterator {
+
+    /**
+     * Iterates over the "pdfs" prefix of the test bucket, hands every tuple to a pool of
+     * mock consumers through a bounded queue, and checks the total number consumed.
+     */
+    @Test
+    public void testSimple() throws Exception {
+        GCSPipesIterator it = new GCSPipesIterator();
+        it.setFetcherName("gcs");
+        it.setBucket("tika-tallison-test-bucket");
+        it.setProjectId("My First Project");
+        it.setPrefix("pdfs");
+        it.initialize(Collections.emptyMap());
+        int numConsumers = 6;
+        ArrayBlockingQueue<FetchEmitTuple> queue = new ArrayBlockingQueue<>(10);
+
+        ExecutorService es = Executors.newFixedThreadPool(numConsumers + 1);
+        ExecutorCompletionService<Integer> c = new ExecutorCompletionService<>(es);
+        List<MockFetcher> fetchers = new ArrayList<>();
+        for (int i = 0; i < numConsumers; i++) {
+            MockFetcher fetcher = new MockFetcher(queue);
+            fetchers.add(fetcher);
+            c.submit(fetcher);
+        }
+        //put() blocks while the bounded queue is full; offer() would silently drop the item
+        for (FetchEmitTuple t : it) {
+            System.out.println(t);
+            queue.put(t);
+        }
+        for (int i = 0; i < numConsumers; i++) {
+            queue.put(PipesIterator.COMPLETED_SEMAPHORE);
+        }
+        int finished = 0;
+        int completed = 0;
+        try {
+            while (finished < numConsumers) {
+                Future<Integer> f = c.take();
+                completed += f.get();
+                finished++;
+            }
+        } finally {
+            es.shutdownNow();
+        }
+        assertEquals(2, completed);
+
+    }
+
+    /**
+     * Consumer that drains the queue until it sees the completed semaphore and then
+     * reports how many tuples it received.
+     */
+    private static class MockFetcher implements Callable<Integer> {
+        private final ArrayBlockingQueue<FetchEmitTuple> queue;
+        private final List<FetchEmitTuple> pairs = new ArrayList<>();
+
+        private MockFetcher(ArrayBlockingQueue<FetchEmitTuple> queue) {
+            this.queue = queue;
+        }
+
+        @Override
+        public Integer call() throws Exception {
+            while (true) {
+                FetchEmitTuple t = queue.poll(1, TimeUnit.HOURS);
+                if (t == null) {
+                    //poll() returns null on timeout; don't count that as a tuple
+                    throw new TimeoutException("timed out waiting for a tuple");
+                }
+                if (t == PipesIterator.COMPLETED_SEMAPHORE) {
+                    return pairs.size();
+                }
+                pairs.add(t);
+            }
+        }
+    }
+}
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/test/resources/log4j.properties b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/test/resources/log4j.properties
new file mode 100644
index 0000000..2b2da1a
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/test/resources/log4j.properties
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#info,debug, error,fatal ...
+log4j.rootLogger=info,stderr
+#console
+log4j.appender.stderr=org.apache.log4j.ConsoleAppender
+log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
+log4j.appender.stderr.Target=System.err
+log4j.appender.stderr.layout.ConversionPattern=%-5p [%t]: %m%n

[tika] 01/03: TIKA-3510 -- further clean up

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5985b629df562b15ab39ca8a02332daf9936a383
Author: tallison <ta...@apache.org>
AuthorDate: Mon Aug 16 10:17:57 2021 -0400

    TIKA-3510 -- further clean up
---
 .../tika-parser-scientific-module/pom.xml          |  16 +-
 .../tika-parser-scientific-package/pom.xml         | 171 +--------------------
 .../tika-parser-sqlite3-module/pom.xml             |   2 -
 .../tika-parser-sqlite3-package/pom.xml            |  10 --
 4 files changed, 2 insertions(+), 197 deletions(-)

diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/pom.xml b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/pom.xml
index 4add4b4..92f9833 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/pom.xml
+++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/pom.xml
@@ -35,31 +35,26 @@
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-text-module</artifactId>
       <version>${project.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.sis.core</groupId>
       <artifactId>sis-utility</artifactId>
       <version>${sis.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.sis.storage</groupId>
       <artifactId>sis-netcdf</artifactId>
       <version>${sis.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.sis.core</groupId>
       <artifactId>sis-metadata</artifactId>
       <version>${sis.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.opengis</groupId>
       <artifactId>geoapi</artifactId>
       <version>${geoapi.version}</version>
-      <scope>provided</scope>
     </dependency>
     <!-- edu.ucar dependencies -->
     <dependency>
@@ -107,7 +102,6 @@
           <artifactId>slf4j-api</artifactId>
         </exclusion>
       </exclusions>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.quartz-scheduler</groupId>
@@ -119,7 +113,6 @@
           <artifactId>slf4j-api</artifactId>
         </exclusion>
       </exclusions>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.httpcomponents</groupId>
@@ -139,31 +132,26 @@
           <artifactId>httpcore</artifactId>
         </exclusion>
       </exclusions>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.httpcomponents</groupId>
       <artifactId>httpcore</artifactId>
       <version>${httpcore.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
       <version>${commons.codec.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.jdom</groupId>
       <artifactId>jdom2</artifactId>
       <version>2.0.6</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
       <version>${guava.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>edu.ucar</groupId>
@@ -191,7 +179,6 @@
           <artifactId>slf4j-api</artifactId>
         </exclusion>
       </exclusions>
-      <scope>provided</scope>
     </dependency>
     <!-- TIKA-3095: Required for grib -->
     <dependency>
@@ -203,7 +190,6 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-csv</artifactId>
       <version>${commons.csv.version}</version>
-      <scope>provided</scope>
     </dependency>
     <!-- for java 10
 See TIKA-2778 for why we need to do this now.
@@ -213,7 +199,6 @@ May the gods of dependency management fix this in the future.
       <groupId>org.glassfish.jaxb</groupId>
       <artifactId>jaxb-runtime</artifactId>
       <version>${jaxb.version}</version>
-      <scope>provided</scope>
     </dependency>
   </dependencies>
 
@@ -233,6 +218,7 @@ May the gods of dependency management fix this in the future.
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-jar-plugin</artifactId>
+        <version>${maven.jar.version}</version>
         <configuration>
           <archive>
             <manifestEntries>
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/pom.xml b/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/pom.xml
index df827ab..222ac12 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/pom.xml
+++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/pom.xml
@@ -38,176 +38,6 @@
       <artifactId>tika-parser-scientific-module</artifactId>
       <version>${project.version}</version>
     </dependency>
-      <!-- needed by AutoDetectReader in EnviHeaderParser -->
-      <dependency>
-        <groupId>${project.groupId}</groupId>
-        <artifactId>tika-parser-text-module</artifactId>
-        <version>${project.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.sis.core</groupId>
-        <artifactId>sis-utility</artifactId>
-        <version>${sis.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.sis.storage</groupId>
-        <artifactId>sis-netcdf</artifactId>
-        <version>${sis.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.sis.core</groupId>
-        <artifactId>sis-metadata</artifactId>
-        <version>${sis.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>org.opengis</groupId>
-        <artifactId>geoapi</artifactId>
-        <version>${geoapi.version}</version>
-      </dependency>
-      <!-- edu.ucar dependencies -->
-      <dependency>
-        <groupId>edu.ucar</groupId>
-        <artifactId>netcdf4</artifactId>
-        <version>${netcdf-java.version}</version>
-        <exclusions>
-          <exclusion>
-            <groupId>commons-logging</groupId>
-            <artifactId>commons-logging</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.jdom</groupId>
-            <artifactId>jdom2</artifactId>
-          </exclusion>
-          <!--TIKA 2672 exclude jna to resolve the dependency convergence with tika-dl's
-          deeplearning4j-nn:1.0.0-SNAPSHOT-->
-          <exclusion>
-            <groupId>net.java.dev.jna</groupId>
-            <artifactId>jna</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>com.google.guava</groupId>
-            <artifactId>guava</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpcore</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpclient</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.quartz-scheduler</groupId>
-            <artifactId>quartz</artifactId>
-          </exclusion>
-          <exclusion>
-            <!-- dependency convergence problems -->
-            <groupId>org.slf4j</groupId>
-            <artifactId>jcl-over-slf4j</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.slf4j</groupId>
-            <artifactId>slf4j-api</artifactId>
-          </exclusion>
-        </exclusions>
-      </dependency>
-      <dependency>
-        <groupId>org.quartz-scheduler</groupId>
-        <artifactId>quartz</artifactId>
-        <version>${quartz.version}</version>
-        <exclusions>
-          <exclusion>
-            <groupId>org.slf4j</groupId>
-            <artifactId>slf4j-api</artifactId>
-          </exclusion>
-        </exclusions>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.httpcomponents</groupId>
-        <artifactId>httpclient</artifactId>
-        <version>${httpcomponents.version}</version>
-        <exclusions>
-          <exclusion>
-            <groupId>commons-logging</groupId>
-            <artifactId>commons-logging</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>commons-codec</groupId>
-            <artifactId>commons-codec</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpcore</artifactId>
-          </exclusion>
-        </exclusions>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.httpcomponents</groupId>
-        <artifactId>httpcore</artifactId>
-        <version>${httpcore.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>commons-codec</groupId>
-        <artifactId>commons-codec</artifactId>
-        <version>${commons.codec.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>org.jdom</groupId>
-        <artifactId>jdom2</artifactId>
-        <version>2.0.6</version>
-      </dependency>
-      <dependency>
-        <groupId>com.google.guava</groupId>
-        <artifactId>guava</artifactId>
-        <version>${guava.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>edu.ucar</groupId>
-        <artifactId>grib</artifactId>
-        <version>${netcdf-java.version}</version>
-        <exclusions>
-          <exclusion>
-            <groupId>edu.ucar</groupId>
-            <artifactId>jj2000</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.jsoup</groupId>
-            <artifactId>jsoup</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.jdom</groupId>
-            <artifactId>jdom2</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>com.google.protobuf</groupId>
-            <artifactId>protobuf-java</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.slf4j</groupId>
-            <artifactId>slf4j-api</artifactId>
-          </exclusion>
-        </exclusions>
-      </dependency>
-      <!-- TIKA-3095: Required for grib -->
-      <dependency>
-        <groupId>net.jcip</groupId>
-        <artifactId>jcip-annotations</artifactId>
-        <version>1.0</version>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.commons</groupId>
-        <artifactId>commons-csv</artifactId>
-        <version>${commons.csv.version}</version>
-      </dependency>
-      <!-- for java 10
-  See TIKA-2778 for why we need to do this now.
-  May the gods of dependency management fix this in the future.
-  -->
-      <dependency>
-        <groupId>org.glassfish.jaxb</groupId>
-        <artifactId>jaxb-runtime</artifactId>
-        <version>${jaxb.version}</version>
-      </dependency>
   </dependencies>
 
   <build>
@@ -231,6 +61,7 @@
         </executions>
       </plugin>
       <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-shade-plugin</artifactId>
         <version>${maven.shade.version}</version>
         <executions>
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/pom.xml b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/pom.xml
index b94bbf0..71fbe20 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/pom.xml
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/pom.xml
@@ -33,13 +33,11 @@
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-jdbc-commons</artifactId>
       <version>${project.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.xerial</groupId>
       <artifactId>sqlite-jdbc</artifactId>
       <version>${sqlite.version}</version>
-      <scope>provided</scope>
     </dependency>
   </dependencies>
   <build>
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package/pom.xml b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package/pom.xml
index 1a4a26c..1d0f8b2 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package/pom.xml
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package/pom.xml
@@ -35,16 +35,6 @@
       <artifactId>tika-parser-sqlite3-module</artifactId>
       <version>${project.version}</version>
     </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-jdbc-commons</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.xerial</groupId>
-      <artifactId>sqlite-jdbc</artifactId>
-      <version>${sqlite.version}</version>
-    </dependency>
   </dependencies>
   <build>
     <plugins>