You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/25 21:04:15 UTC

[tika] branch main updated: TIKA-3707 -- allow sending in the full sasurl per blob

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new f968b9d  TIKA-3707 -- allow sending in the full sasurl per blob
f968b9d is described below

commit f968b9dfbec52f3ff0973ba08e1f996b0a2c5182
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 25 17:03:56 2022 -0400

    TIKA-3707 -- allow sending in the full sasurl per blob
---
 .../tika/pipes/fetcher/azblob/AZBlobFetcher.java   | 68 ++++++++++++++++++----
 .../src/test/resources/tika-config-az-blob.xml     |  4 +-
 2 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java
index 5ecb6d9..dee9030 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java
+++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java
@@ -26,6 +26,7 @@ import java.nio.file.Path;
 import java.util.Map;
 
 import com.azure.storage.blob.BlobClient;
+import com.azure.storage.blob.BlobClientBuilder;
 import com.azure.storage.blob.BlobContainerClient;
 import com.azure.storage.blob.BlobServiceClient;
 import com.azure.storage.blob.BlobServiceClientBuilder;
@@ -43,9 +44,17 @@ import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.pipes.fetcher.AbstractFetcher;
+import org.apache.tika.utils.StringUtils;
 
 /**
- * Fetches files from Azure blob storage. Must set endpoint, sasToken and container via config.
+ * Fetches files from Azure blob storage.
+ *
+ * There are two modes:
+ * 1) If you are only using one endpoint and one sas token and one container,
+ *    configure those in the config file.  In this case, your fetchKey will
+ *    be the path in the container to the blob.
+ * 2) If you have different endpoints or sas tokens or containers across
+ *    your requests, your fetchKey will be the complete SAS url pointing to the blob.
  */
 public class AZBlobFetcher extends AbstractFetcher implements Initializable {
 
@@ -54,6 +63,7 @@ public class AZBlobFetcher extends AbstractFetcher implements Initializable {
     private String sasToken;
     private String container;
     private String endpoint;
+    private BlobClientFactory blobClientFactory;
     private boolean extractUserMetadata = true;
     private BlobServiceClient blobServiceClient;
     private BlobContainerClient blobContainerClient;
@@ -65,8 +75,7 @@ public class AZBlobFetcher extends AbstractFetcher implements Initializable {
         LOGGER.debug("about to fetch fetchkey={} from endpoint ({})", fetchKey, endpoint);
 
         try {
-            BlobClient blobClient = blobContainerClient.getBlobClient(fetchKey);
-            //TODO: extract other metadata, eg. md5, crc, etc.
+            BlobClient blobClient = blobClientFactory.getClient(fetchKey);
 
             if (extractUserMetadata) {
                 BlobProperties properties = blobClient.getProperties();
@@ -133,19 +142,54 @@ public class AZBlobFetcher extends AbstractFetcher implements Initializable {
      */
     @Override
     public void initialize(Map<String, Param> params) throws TikaConfigException {
-        //TODO -- allow authentication via other methods
-        blobServiceClient = new BlobServiceClientBuilder()
-                .endpoint(endpoint)
-                .sasToken(sasToken)
-                .buildClient();
-        blobContainerClient = blobServiceClient.getBlobContainerClient(container);
+        if (!StringUtils.isBlank(sasToken)) {
+            LOGGER.debug("Setting up immutable endpoint, token and container");
+            blobClientFactory = new SingleBlobContainerFactory(endpoint, sasToken, container);
+        } else {
+            LOGGER.debug("Setting up blobclientfactory to recieve the full sas url for the blob");
+            blobClientFactory = new SASUrlFactory();
+        }
     }
 
     @Override
     public void checkInitialization(InitializableProblemHandler problemHandler)
             throws TikaConfigException {
-        mustNotBeEmpty("sasToken", this.sasToken);
-        mustNotBeEmpty("endpoint", this.endpoint);
-        mustNotBeEmpty("container", this.container);
+        //if the user has set one of these, they need to have set all of them
+        if (!StringUtils.isBlank(this.sasToken) ||
+                !StringUtils.isBlank(this.endpoint) || !StringUtils.isBlank(this.container)) {
+            mustNotBeEmpty("sasToken", this.sasToken);
+            mustNotBeEmpty("endpoint", this.endpoint);
+            mustNotBeEmpty("container", this.container);
+        }
+    }
+
+    private interface BlobClientFactory {
+        BlobClient getClient(String fetchKey);
+    }
+
+    private static class SingleBlobContainerFactory implements BlobClientFactory {
+        private final BlobContainerClient blobContainerClient;
+
+        private SingleBlobContainerFactory(String endpoint, String sasToken, String container) {
+            //TODO -- allow authentication via other methods
+            BlobServiceClient blobServiceClient = new BlobServiceClientBuilder()
+                    .endpoint(endpoint)
+                    .sasToken(sasToken)
+                    .buildClient();
+            blobContainerClient = blobServiceClient.getBlobContainerClient(container);
+        }
+
+        @Override
+        public BlobClient getClient(String fetchKey) {
+            return blobContainerClient.getBlobClient(fetchKey);
+        }
+    }
+
+    private static class SASUrlFactory implements BlobClientFactory {
+
+        @Override
+        public BlobClient getClient(String fetchKey) {
+            return new BlobClientBuilder().connectionString(fetchKey).buildClient();
+        }
     }
 }
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/resources/tika-config-az-blob.xml b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/resources/tika-config-az-blob.xml
index 94dd697..e2d74b2 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/resources/tika-config-az-blob.xml
+++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/resources/tika-config-az-blob.xml
@@ -20,7 +20,9 @@
         <fetcher class="org.apache.tika.pipes.fetcher.azblob.AZBlobFetcher">
             <params>
                 <name>az-blob</name>
-                <!-- these have to be non-null -->
+                <!-- Either configure these three and send in the path for the blob OR
+                     do not configure any of these and send in the full SAS url for the blob
+                     as the fetchkey-->
                 <endpoint></endpoint>
                 <container></container>
                 <sasToken></sasToken>