You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/25 21:04:15 UTC
[tika] branch main updated: TIKA-3707 -- allow sending in the full sasurl per blob
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f968b9d TIKA-3707 -- allow sending in the full sasurl per blob
f968b9d is described below
commit f968b9dfbec52f3ff0973ba08e1f996b0a2c5182
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 25 17:03:56 2022 -0400
TIKA-3707 -- allow sending in the full sasurl per blob
---
.../tika/pipes/fetcher/azblob/AZBlobFetcher.java | 68 ++++++++++++++++++----
.../src/test/resources/tika-config-az-blob.xml | 4 +-
2 files changed, 59 insertions(+), 13 deletions(-)
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java
index 5ecb6d9..dee9030 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java
+++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java
@@ -26,6 +26,7 @@ import java.nio.file.Path;
import java.util.Map;
import com.azure.storage.blob.BlobClient;
+import com.azure.storage.blob.BlobClientBuilder;
import com.azure.storage.blob.BlobContainerClient;
import com.azure.storage.blob.BlobServiceClient;
import com.azure.storage.blob.BlobServiceClientBuilder;
@@ -43,9 +44,17 @@ import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.fetcher.AbstractFetcher;
+import org.apache.tika.utils.StringUtils;
/**
- * Fetches files from Azure blob storage. Must set endpoint, sasToken and container via config.
+ * Fetches files from Azure Blob Storage.
+ *
+ * There are two modes:
+ * 1) If you use a single endpoint, SAS token and container for every request,
+ * configure all three in the config file. In this case, your fetchKey is
+ * the path to the blob within the container.
+ * 2) If the endpoint, SAS token or container differs across your requests,
+ * leave all three unconfigured; your fetchKey is then the complete SAS URL of the blob.
*/
public class AZBlobFetcher extends AbstractFetcher implements Initializable {
@@ -54,6 +63,7 @@ public class AZBlobFetcher extends AbstractFetcher implements Initializable {
private String sasToken;
private String container;
private String endpoint;
+ private BlobClientFactory blobClientFactory;
private boolean extractUserMetadata = true;
private BlobServiceClient blobServiceClient;
private BlobContainerClient blobContainerClient;
@@ -65,8 +75,7 @@ public class AZBlobFetcher extends AbstractFetcher implements Initializable {
LOGGER.debug("about to fetch fetchkey={} from endpoint ({})", fetchKey, endpoint);
try {
- BlobClient blobClient = blobContainerClient.getBlobClient(fetchKey);
- //TODO: extract other metadata, eg. md5, crc, etc.
+ BlobClient blobClient = blobClientFactory.getClient(fetchKey);
if (extractUserMetadata) {
BlobProperties properties = blobClient.getProperties();
@@ -133,19 +142,54 @@ public class AZBlobFetcher extends AbstractFetcher implements Initializable {
*/
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
- //TODO -- allow authentication via other methods
- blobServiceClient = new BlobServiceClientBuilder()
- .endpoint(endpoint)
- .sasToken(sasToken)
- .buildClient();
- blobContainerClient = blobServiceClient.getBlobContainerClient(container);
+ // The presence of a configured sasToken selects the mode: a non-blank token
+ // binds the fetcher to the configured endpoint/container, otherwise each
+ // fetchKey is handed to a factory that builds a client from the key itself.
+ if (!StringUtils.isBlank(sasToken)) {
+ LOGGER.debug("Setting up immutable endpoint, token and container");
+ blobClientFactory = new SingleBlobContainerFactory(endpoint, sasToken, container);
+ } else {
+ LOGGER.debug("Setting up blobclientfactory to receive the full sas url for the blob");
+ blobClientFactory = new SASUrlFactory();
+ }
}
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
throws TikaConfigException {
- mustNotBeEmpty("sasToken", this.sasToken);
- mustNotBeEmpty("endpoint", this.endpoint);
- mustNotBeEmpty("container", this.container);
+ //if the user has set one of these, they need to have set all of them;
+ //when all three are blank nothing is checked here, which is the
+ //configuration used when each fetchKey is a full SAS url
+ if (!StringUtils.isBlank(this.sasToken) ||
+ !StringUtils.isBlank(this.endpoint) || !StringUtils.isBlank(this.container)) {
+ mustNotBeEmpty("sasToken", this.sasToken);
+ mustNotBeEmpty("endpoint", this.endpoint);
+ mustNotBeEmpty("container", this.container);
+ }
+ }
+
+ /**
+ * Strategy for turning a fetchKey into a {@link BlobClient}. How the
+ * fetchKey is interpreted is up to the implementation.
+ */
+ private interface BlobClientFactory {
+ // Returns the client used to read the blob identified by fetchKey.
+ BlobClient getClient(String fetchKey);
+ }
+
+ /**
+ * Factory bound to one endpoint, SAS token and container. The container
+ * client is created once in the constructor; each fetchKey is passed to
+ * that container client's getBlobClient.
+ */
+ private static class SingleBlobContainerFactory implements BlobClientFactory {
+ private final BlobContainerClient containerClient;
+
+ private SingleBlobContainerFactory(String endpoint, String sasToken, String container) {
+ //TODO -- allow authentication via other methods
+ BlobServiceClientBuilder serviceBuilder = new BlobServiceClientBuilder();
+ serviceBuilder.endpoint(endpoint);
+ serviceBuilder.sasToken(sasToken);
+ this.containerClient = serviceBuilder.buildClient().getBlobContainerClient(container);
+ }
+
+ @Override
+ public BlobClient getClient(String fetchKey) {
+ BlobClient client = containerClient.getBlobClient(fetchKey);
+ return client;
+ }
+ }
+
+ /**
+ * Factory for the mode where nothing is configured up front: every
+ * fetchKey must be the complete SAS URL of the blob, and a new client is
+ * built from it on each call.
+ */
+ private static class SASUrlFactory implements BlobClientFactory {
+
+ @Override
+ public BlobClient getClient(String fetchKey) {
+ // A SAS URL is not a storage-account connection string (which is a list of
+ // key=value settings), so it must not go through connectionString(...).
+ // endpoint(...) is the builder call that accepts a URL and parses the
+ // container name, blob name and SAS token out of it.
+ return new BlobClientBuilder().endpoint(fetchKey).buildClient();
+ }
}
}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/resources/tika-config-az-blob.xml b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/resources/tika-config-az-blob.xml
index 94dd697..e2d74b2 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/resources/tika-config-az-blob.xml
+++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/resources/tika-config-az-blob.xml
@@ -20,7 +20,9 @@
<fetcher class="org.apache.tika.pipes.fetcher.azblob.AZBlobFetcher">
<params>
<name>az-blob</name>
- <!-- these have to be non-null -->
+ <!-- Either configure all three of these and send in the path to the blob
+ as the fetchKey, OR leave all three empty and send in the full SAS URL
+ for the blob as the fetchKey. -->
<endpoint></endpoint>
<container></container>
<sasToken></sasToken>