You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by nd...@apache.org on 2024/03/28 09:06:04 UTC

(tika) 01/01: TIKA-4229

This is an automated email from the ASF dual-hosted git repository.

ndipiazza pushed a commit to branch TIKA-4229-add-microsoft-graph-fetcher
in repository https://gitbox.apache.org/repos/asf/tika.git

commit eb4e0c12fbec2772a0ba406d2ab8ac6200b0148a
Author: Nicholas DiPiazza <nd...@apache.org>
AuthorDate: Thu Mar 28 04:04:33 2024 -0500

    TIKA-4229
    
    initial attempt to add microsoft graph fetcher
---
 .../tika/pipes/fetcher/config/AbstractConfig.java  |   4 +
 tika-pipes/tika-fetchers/pom.xml                   |   1 +
 .../tika-fetcher-microsoft-graph/pom.xml           | 151 +++++++++++++++++++++
 .../microsoftgraph/MicrosoftGraphFetcher.java      | 140 +++++++++++++++++++
 .../config/AadCredentialConfigBase.java            |  40 ++++++
 .../Client2CertificateCredentialsConfig.java       |  50 +++++++
 .../config/ClientCertificateCredentialsConfig.java |  40 ++++++
 .../config/ClientSecretCredentialsConfig.java      |  30 ++++
 .../config/MsGraphFetcherConfig.java               |  65 +++++++++
 .../microsoftgraph/MicrosoftGraphFetcherTest.java  | 100 ++++++++++++++
 .../src/test/resources/log4j2.xml                  |  32 +++++
 11 files changed, 653 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java
new file mode 100644
index 000000000..536fc44b1
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java
@@ -0,0 +1,4 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetcher.config;
+
+/**
+ * Empty abstract base type for fetcher configuration classes.
+ * It declares no fields or methods of its own.
+ */
+public abstract class AbstractConfig {
+}
diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml
index 7830a74d6..8b957e8cf 100644
--- a/tika-pipes/tika-fetchers/pom.xml
+++ b/tika-pipes/tika-fetchers/pom.xml
@@ -36,6 +36,7 @@
     <module>tika-fetcher-s3</module>
     <module>tika-fetcher-gcs</module>
     <module>tika-fetcher-az-blob</module>
+    <module>tika-fetcher-microsoft-graph</module>
   </modules>
 
   <dependencies>
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml
new file mode 100644
index 000000000..e40c8354f
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml
@@ -0,0 +1,151 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>tika-fetchers</artifactId>
+        <groupId>org.apache.tika</groupId>
+        <version>3.0.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>tika-fetcher-microsoft-graph</artifactId>
+    <name>Microsoft Graph Tika Pipes Fetcher</name>
+
+    <properties>
+        <maven.compiler.source>11</maven.compiler.source>
+        <maven.compiler.target>11</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <azure-identity.version>1.11.0</azure-identity.version>
+        <microsoft-graph.version>6.4.0</microsoft-graph.version>
+        <microsoft-kiota-serialization-json.version>1.1.1</microsoft-kiota-serialization-json.version>
+        <junit-jupiter-engine.version>5.9.2</junit-jupiter-engine.version>
+        <wiremock.version>3.3.1</wiremock.version>
+        <mockito-junit-jupiter.version>5.3.1</mockito-junit-jupiter.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.microsoft.graph</groupId>
+            <artifactId>microsoft-graph</artifactId>
+            <version>${microsoft-graph.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.azure</groupId>
+            <artifactId>azure-identity</artifactId>
+            <version>${azure-identity.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-engine</artifactId>
+            <version>${junit-jupiter-engine.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-core</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-junit-jupiter</artifactId>
+            <version>${mockito-junit-jupiter.version}</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifestEntries>
+                            <!-- Must be unique per module: mirrors this module's base package
+                                 instead of reusing the S3 fetcher's module name. -->
+                            <Automatic-Module-Name>org.apache.tika.pipes.fetchers.microsoftgraph</Automatic-Module-Name>
+                        </manifestEntries>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>${maven.shade.version}</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*</exclude>
+                                        <exclude>LICENSE.txt</exclude>
+                                        <exclude>NOTICE.txt</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <transformers>
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/LICENSE</resource>
+                                    <file>target/classes/META-INF/LICENSE</file>
+                                </transformer>
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/NOTICE</resource>
+                                    <file>target/classes/META-INF/NOTICE</file>
+                                </transformer>
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
+                                    <resource>META-INF/DEPENDENCIES</resource>
+                                    <file>target/classes/META-INF/DEPENDENCIES</file>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
+
+    <scm>
+        <tag>3.0.0-BETA-rc1</tag>
+    </scm>
+</project>
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java
new file mode 100644
index 000000000..771790692
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.microsoftgraph;
+
+import com.azure.identity.ClientCertificateCredentialBuilder;
+import com.azure.identity.ClientSecretCredentialBuilder;
+import com.microsoft.graph.serviceclient.GraphServiceClient;
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.fetcher.AbstractFetcher;
+import org.apache.tika.pipes.fetchers.microsoftgraph.config.ClientCertificateCredentialsConfig;
+import org.apache.tika.pipes.fetchers.microsoftgraph.config.ClientSecretCredentialsConfig;
+import org.apache.tika.pipes.fetchers.microsoftgraph.config.MsGraphFetcherConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+
+/**
+ * Fetches files from the Microsoft Graph API.
+ * <p>
+ * Fetch keys have the form {@code <siteDriveId>,<driveItemId>}.
+ * <p>
+ * A failed download is retried once for every entry of the configured throttle
+ * delays, sleeping that many seconds before each retry. Without throttle delays
+ * a single attempt is made.
+ */
+public class MicrosoftGraphFetcher extends AbstractFetcher implements Initializable {
+    private static final Logger LOGGER = LoggerFactory.getLogger(MicrosoftGraphFetcher.class);
+    private static final long MILLIS_PER_SECOND = 1000L;
+    private static final long[] NO_RETRIES = new long[0];
+
+    private GraphServiceClient graphClient;
+    private MsGraphFetcherConfig msGraphFetcherConfig;
+    private long[] throttleSeconds;
+
+    /**
+     * Creates a fetcher without configuration. {@link #initialize(Map)} cannot build
+     * a Graph client for such an instance and {@link #checkInitialization} reports it.
+     */
+    public MicrosoftGraphFetcher() {
+
+    }
+
+    /**
+     * Creates a fetcher that builds its Graph client from the given configuration.
+     *
+     * @param msGraphFetcherConfig credentials and scopes used by {@link #initialize(Map)}
+     */
+    public MicrosoftGraphFetcher(MsGraphFetcherConfig msGraphFetcherConfig) {
+        this.msGraphFetcherConfig = msGraphFetcherConfig;
+    }
+
+    /**
+     * Set seconds to throttle retries as a comma-delimited list, e.g.: 30,60,120,600
+     *
+     * @param commaDelimitedLongs comma-delimited whole numbers of seconds; whitespace
+     *                            around each value is ignored
+     * @throws TikaConfigException if the value is {@code null} or an entry is not a valid long
+     */
+    @Field
+    public void setThrottleSeconds(String commaDelimitedLongs) throws TikaConfigException {
+        if (commaDelimitedLongs == null) {
+            throw new TikaConfigException("throttleSeconds must not be null");
+        }
+        String[] longStrings = commaDelimitedLongs.split(",");
+        long[] seconds = new long[longStrings.length];
+        for (int i = 0; i < longStrings.length; i++) {
+            String token = longStrings[i].trim();
+            try {
+                seconds[i] = Long.parseLong(token);
+            } catch (NumberFormatException e) {
+                // Keep the cause so the offending token can be traced from the stack trace.
+                throw new TikaConfigException(
+                        "Invalid throttleSeconds entry '" + token + "' in '" + commaDelimitedLongs + "'", e);
+            }
+        }
+        setThrottleSeconds(seconds);
+    }
+
+    /**
+     * Sets the delays, in seconds, to wait before each retry of a failed fetch.
+     *
+     * @param throttleSeconds one delay per retry; {@code null} or empty disables retries
+     */
+    public void setThrottleSeconds(long[] throttleSeconds) {
+        // Copy so later changes to the caller's array cannot alter the retry schedule.
+        this.throttleSeconds = throttleSeconds == null ? null : throttleSeconds.clone();
+    }
+
+    /**
+     * Builds the Graph client from the configured credentials and scopes.
+     * <p>
+     * When no configuration is present, or the credentials are of an unsupported type,
+     * no client is created; {@link #checkInitialization} reports that state.
+     *
+     * @param map initialization parameters; not read by this implementation
+     */
+    @Override
+    public void initialize(Map<String, Param> map) {
+        if (msGraphFetcherConfig == null) {
+            return;
+        }
+        // Fall back to the delays carried by the config when none were set on the fetcher itself.
+        if (throttleSeconds == null && msGraphFetcherConfig.getThrottleSeconds() != null) {
+            throttleSeconds = msGraphFetcherConfig.getThrottleSeconds().clone();
+        }
+        String[] scopes = msGraphFetcherConfig.getScopes() == null
+                ? new String[0]
+                : msGraphFetcherConfig.getScopes().toArray(new String[0]);
+        Object configuredCredentials = msGraphFetcherConfig.getCredentials();
+        if (configuredCredentials instanceof ClientCertificateCredentialsConfig) {
+            ClientCertificateCredentialsConfig credentials =
+                    (ClientCertificateCredentialsConfig) configuredCredentials;
+            graphClient = new GraphServiceClient(new ClientCertificateCredentialBuilder()
+                    .clientId(credentials.getClientId())
+                    .tenantId(credentials.getTenantId())
+                    .pfxCertificate(new ByteArrayInputStream(credentials.getCertificateBytes()))
+                    .clientCertificatePassword(credentials.getCertificatePassword())
+                    .build(), scopes);
+        } else if (configuredCredentials instanceof ClientSecretCredentialsConfig) {
+            ClientSecretCredentialsConfig credentials =
+                    (ClientSecretCredentialsConfig) configuredCredentials;
+            graphClient = new GraphServiceClient(
+                    new ClientSecretCredentialBuilder()
+                            .tenantId(credentials.getTenantId())
+                            .clientId(credentials.getClientId())
+                            .clientSecret(credentials.getClientSecret()).build(), scopes);
+        }
+    }
+
+    /**
+     * Verifies that a Graph client is available.
+     *
+     * @param initializableProblemHandler not used by this implementation
+     * @throws TikaConfigException if no Graph client has been created
+     */
+    @Override
+    public void checkInitialization(InitializableProblemHandler initializableProblemHandler)
+            throws TikaConfigException {
+        if (graphClient == null) {
+            throw new TikaConfigException(
+                    "No Microsoft Graph client available: a MsGraphFetcherConfig with client "
+                            + "certificate or client secret credentials is required");
+        }
+    }
+
+    /**
+     * Downloads the content of a drive item.
+     *
+     * @param fetchKey {@code <siteDriveId>,<driveItemId>}
+     * @param metadata not read or modified by this implementation
+     * @return the content stream returned by the Graph client
+     * @throws TikaException if the fetcher has no client, the fetch key is malformed, the
+     *                       thread is interrupted while waiting to retry, or every attempt fails
+     * @throws IOException   declared by the fetcher contract; not thrown directly here
+     */
+    @Override
+    public InputStream fetch(String fetchKey, Metadata metadata) throws TikaException, IOException {
+        if (graphClient == null) {
+            throw new TikaException("Microsoft Graph client has not been initialized");
+        }
+        // Validate the key once, up front: a malformed key can never succeed on retry.
+        String[] fetchKeySplit = fetchKey == null ? new String[0] : fetchKey.split(",");
+        if (fetchKeySplit.length < 2 || fetchKeySplit[0].isEmpty() || fetchKeySplit[1].isEmpty()) {
+            throw new TikaException(
+                    "Invalid fetch key, expected <siteDriveId>,<driveItemId> but got: " + fetchKey);
+        }
+        String siteDriveId = fetchKeySplit[0];
+        String driveItemId = fetchKeySplit[1];
+
+        long[] delays = throttleSeconds == null ? NO_RETRIES : throttleSeconds;
+        Exception lastFailure = null;
+        // One initial attempt plus one retry per configured delay.
+        for (int attempt = 0; attempt <= delays.length; attempt++) {
+            try {
+                long start = System.currentTimeMillis();
+                InputStream is = graphClient.drives().byDriveId(siteDriveId)
+                        .items()
+                        .byDriveItemId(driveItemId)
+                        .content()
+                        .get();
+
+                long elapsed = System.currentTimeMillis() - start;
+                LOGGER.debug("Took {} ms to fetch {}", elapsed, fetchKey);
+                return is;
+            } catch (Exception e) {
+                LOGGER.warn("Exception fetching {} on attempt {}", fetchKey, attempt + 1, e);
+                lastFailure = e;
+            }
+            if (attempt == delays.length) {
+                // No retry follows the last attempt, so do not sleep.
+                break;
+            }
+            long delaySeconds = Math.max(0L, delays[attempt]);
+            LOGGER.warn("Sleeping for {} seconds before retry", delaySeconds);
+            try {
+                // Thread.sleep takes milliseconds; the configured values are seconds.
+                Thread.sleep(delaySeconds * MILLIS_PER_SECOND);
+            } catch (InterruptedException e) {
+                // Restore the flag and stop retrying so the caller can shut down promptly.
+                Thread.currentThread().interrupt();
+                throw new TikaException(
+                        "Interrupted while waiting to retry fetch of " + fetchKey, lastFailure);
+            }
+        }
+        throw new TikaException("Could not fetch " + fetchKey, lastFailure);
+    }
+}
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/AadCredentialConfigBase.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/AadCredentialConfigBase.java
new file mode 100644
index 000000000..e4204739c
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/AadCredentialConfigBase.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.microsoftgraph.config;
+
+/**
+ * Holds the tenant id and client id shared by the credential configurations
+ * that extend this class. Setters return this instance so calls can be chained.
+ */
+public abstract class AadCredentialConfigBase {
+    private String clientId;
+    private String tenantId;
+
+    /**
+     * @return the client id, or {@code null} if none has been set
+     */
+    public String getClientId() {
+        return this.clientId;
+    }
+
+    /**
+     * @param clientId the client id to store
+     * @return this instance
+     */
+    public AadCredentialConfigBase setClientId(String clientId) {
+        this.clientId = clientId;
+        return this;
+    }
+
+    /**
+     * @return the tenant id, or {@code null} if none has been set
+     */
+    public String getTenantId() {
+        return this.tenantId;
+    }
+
+    /**
+     * @param tenantId the tenant id to store
+     * @return this instance
+     */
+    public AadCredentialConfigBase setTenantId(String tenantId) {
+        this.tenantId = tenantId;
+        return this;
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/Client2CertificateCredentialsConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/Client2CertificateCredentialsConfig.java
new file mode 100644
index 000000000..d9128373e
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/Client2CertificateCredentialsConfig.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.microsoftgraph.config;
+
+/**
+ * Standalone holder for a tenant id, a client id and a client secret.
+ * Setters return this instance so calls can be chained.
+ * <p>
+ * NOTE(review): this class does not extend {@code AadCredentialConfigBase} and is
+ * not referenced by the other classes visible in this change; confirm it is still needed.
+ */
+public class Client2CertificateCredentialsConfig {
+    private String clientSecret;
+    private String clientId;
+    private String tenantId;
+
+    /**
+     * @return the tenant id, or {@code null} if none has been set
+     */
+    public String getTenantId() {
+        return this.tenantId;
+    }
+
+    /**
+     * @return the client id, or {@code null} if none has been set
+     */
+    public String getClientId() {
+        return this.clientId;
+    }
+
+    /**
+     * @return the client secret, or {@code null} if none has been set
+     */
+    public String getClientSecret() {
+        return this.clientSecret;
+    }
+
+    /**
+     * @param tenantId the tenant id to store
+     * @return this instance
+     */
+    public Client2CertificateCredentialsConfig setTenantId(String tenantId) {
+        this.tenantId = tenantId;
+        return this;
+    }
+
+    /**
+     * @param clientId the client id to store
+     * @return this instance
+     */
+    public Client2CertificateCredentialsConfig setClientId(String clientId) {
+        this.clientId = clientId;
+        return this;
+    }
+
+    /**
+     * @param clientSecret the client secret to store
+     * @return this instance
+     */
+    public Client2CertificateCredentialsConfig setClientSecret(String clientSecret) {
+        this.clientSecret = clientSecret;
+        return this;
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/ClientCertificateCredentialsConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/ClientCertificateCredentialsConfig.java
new file mode 100644
index 000000000..2927519f1
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/ClientCertificateCredentialsConfig.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.microsoftgraph.config;
+
+/**
+ * Credential configuration carrying a certificate, as raw bytes, and its password,
+ * in addition to the tenant and client ids inherited from the base class.
+ * Setters return this instance so calls can be chained.
+ */
+public class ClientCertificateCredentialsConfig extends AadCredentialConfigBase {
+    private String certificatePassword;
+    private byte[] certificateBytes;
+
+    /**
+     * @return the stored certificate byte array (the same array instance, not a copy),
+     *         or {@code null} if none has been set
+     */
+    public byte[] getCertificateBytes() {
+        return this.certificateBytes;
+    }
+
+    /**
+     * @return the certificate password, or {@code null} if none has been set
+     */
+    public String getCertificatePassword() {
+        return this.certificatePassword;
+    }
+
+    /**
+     * @param certificateBytes the certificate bytes to store; the array is kept by reference
+     * @return this instance
+     */
+    public ClientCertificateCredentialsConfig setCertificateBytes(byte[] certificateBytes) {
+        this.certificateBytes = certificateBytes;
+        return this;
+    }
+
+    /**
+     * @param certificatePassword the certificate password to store
+     * @return this instance
+     */
+    public ClientCertificateCredentialsConfig setCertificatePassword(String certificatePassword) {
+        this.certificatePassword = certificatePassword;
+        return this;
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/ClientSecretCredentialsConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/ClientSecretCredentialsConfig.java
new file mode 100644
index 000000000..2989af941
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/ClientSecretCredentialsConfig.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.microsoftgraph.config;
+
+/**
+ * Credential configuration carrying a client secret, in addition to the tenant
+ * and client ids inherited from the base class.
+ */
+public class ClientSecretCredentialsConfig extends AadCredentialConfigBase {
+    private String clientSecret;
+
+    /**
+     * @param clientSecret the client secret to store
+     * @return this instance, so calls can be chained
+     */
+    public ClientSecretCredentialsConfig setClientSecret(String clientSecret) {
+        this.clientSecret = clientSecret;
+        return this;
+    }
+
+    /**
+     * @return the client secret, or {@code null} if none has been set
+     */
+    public String getClientSecret() {
+        return this.clientSecret;
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MsGraphFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MsGraphFetcherConfig.java
new file mode 100644
index 000000000..46e365893
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MsGraphFetcherConfig.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.microsoftgraph.config;
+
+import org.apache.tika.pipes.fetcher.config.AbstractConfig;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Configuration for the Microsoft Graph fetcher: credentials, scopes, throttle
+ * delays and a spool-to-temp flag. Setters return this instance so calls can be chained.
+ */
+public class MsGraphFetcherConfig extends AbstractConfig {
+    private AadCredentialConfigBase credentials;
+    private List<String> scopes = new ArrayList<>();
+    private long[] throttleSeconds;
+    private boolean spoolToTemp;
+
+    /**
+     * @return the credentials, or {@code null} if none have been set
+     */
+    public AadCredentialConfigBase getCredentials() {
+        return this.credentials;
+    }
+
+    /**
+     * @param credentials the credentials to store
+     * @return this instance
+     */
+    public MsGraphFetcherConfig setCredentials(AadCredentialConfigBase credentials) {
+        this.credentials = credentials;
+        return this;
+    }
+
+    /**
+     * @return the stored scope list (the same list instance, not a copy); an empty
+     *         list until {@link #setScopes(List)} replaces it
+     */
+    public List<String> getScopes() {
+        return this.scopes;
+    }
+
+    /**
+     * @param scopes the scope list to store; kept by reference
+     * @return this instance
+     */
+    public MsGraphFetcherConfig setScopes(List<String> scopes) {
+        this.scopes = scopes;
+        return this;
+    }
+
+    /**
+     * @return the stored throttle array (the same array instance, not a copy),
+     *         or {@code null} if none has been set
+     */
+    public long[] getThrottleSeconds() {
+        return this.throttleSeconds;
+    }
+
+    /**
+     * @param throttleSeconds the throttle values to store; kept by reference
+     * @return this instance
+     */
+    public MsGraphFetcherConfig setThrottleSeconds(long[] throttleSeconds) {
+        this.throttleSeconds = throttleSeconds;
+        return this;
+    }
+
+    /**
+     * @return the spool-to-temp flag; {@code false} unless set
+     */
+    public boolean isSpoolToTemp() {
+        return this.spoolToTemp;
+    }
+
+    /**
+     * @param spoolToTemp the flag value to store
+     * @return this instance
+     */
+    public MsGraphFetcherConfig setSpoolToTemp(boolean spoolToTemp) {
+        this.spoolToTemp = spoolToTemp;
+        return this;
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcherTest.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcherTest.java
new file mode 100644
index 000000000..059a93265
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcherTest.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.microsoftgraph;
+
+import com.microsoft.graph.drives.DrivesRequestBuilder;
+import com.microsoft.graph.drives.item.DriveItemRequestBuilder;
+import com.microsoft.graph.drives.item.items.ItemsRequestBuilder;
+import com.microsoft.graph.drives.item.items.item.DriveItemItemRequestBuilder;
+import com.microsoft.graph.drives.item.items.item.content.ContentRequestBuilder;
+import com.microsoft.graph.serviceclient.GraphServiceClient;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.fetchers.microsoftgraph.config.ClientCertificateCredentialsConfig;
+import org.apache.tika.pipes.fetchers.microsoftgraph.config.MsGraphFetcherConfig;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.InjectMocks;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+import org.mockito.Spy;
+import org.mockito.junit.jupiter.MockitoExtension;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+
+/**
+ * Unit test for {@link MicrosoftGraphFetcher} that replaces the Graph client and its
+ * request-builder chain with Mockito mocks, so no network access is needed.
+ */
+@ExtendWith(MockitoExtension.class)
+class MicrosoftGraphFetcherTest {
+    private static final Logger LOGGER = LoggerFactory.getLogger(MicrosoftGraphFetcherTest.class);
+    // Placeholder credential values; they are only stored on the config object below.
+    static byte[] certificateBytes = "test cert file here".getBytes(StandardCharsets.UTF_8);
+    static String certificatePassword = "somepasswordhere";
+    static String clientId = "12312312-1234-1234-1234-112312312313";
+    static String tenantId = "32132132-4332-5432-4321-121231231232";
+    // The two halves of the fetch key used by the test.
+    static String siteDriveId = "99999999-1234-1111-1111-12312312312";
+    static String driveItemid = "asfsadfsadfsafdusahdfiuhfdsusadfjuafiagfaigf";
+
+    @Mock
+    GraphServiceClient graphClient;
+    @Spy
+    @SuppressWarnings("unused")
+    MsGraphFetcherConfig msGraphFetcherConfig = new MsGraphFetcherConfig()
+            .setCredentials(new ClientCertificateCredentialsConfig()
+                    .setCertificateBytes(certificateBytes)
+                    .setCertificatePassword(certificatePassword)
+                    .setClientId(clientId)
+                    .setTenantId(tenantId))
+            .setScopes(Collections.singletonList(".default"));
+
+    // One mock per step of graphClient.drives().byDriveId(..).items().byDriveItemId(..).content().
+    @Mock
+    DrivesRequestBuilder drivesRequestBuilder;
+
+    @Mock
+    DriveItemRequestBuilder driveItemRequestBuilder;
+
+    @Mock
+    ItemsRequestBuilder itemsRequestBuilder;
+
+    @Mock
+    DriveItemItemRequestBuilder driveItemItemRequestBuilder;
+
+    @Mock
+    ContentRequestBuilder contentRequestBuilder;
+
+    @InjectMocks
+    MicrosoftGraphFetcher microsoftGraphFetcher;
+
+    /**
+     * Stubs the request-builder chain to return a fixed byte stream and checks that
+     * {@code fetch} hands back exactly that content for the matching fetch key.
+     */
+    @Test
+    void fetch() throws Exception {
+        // NOTE(review): mocks are initialised by MockitoExtension and again by openMocks here.
+        // Presumably the second pass is what field-injects graphClient into the fetcher that
+        // was already constructed on the first pass; confirm before removing either.
+        try (AutoCloseable ignored = MockitoAnnotations.openMocks(this)) {
+            Mockito.when(graphClient.drives()).thenReturn(drivesRequestBuilder);
+            Mockito.when(drivesRequestBuilder.byDriveId(siteDriveId)).thenReturn(driveItemRequestBuilder);
+            Mockito.when(driveItemRequestBuilder.items()).thenReturn(itemsRequestBuilder);
+            Mockito.when(itemsRequestBuilder.byDriveItemId(driveItemid)).thenReturn(driveItemItemRequestBuilder);
+            Mockito.when(driveItemItemRequestBuilder.content()).thenReturn(contentRequestBuilder);
+            String content = "content";
+            Mockito.when(contentRequestBuilder.get()).thenReturn(new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)));
+            // The fetch key is "<siteDriveId>,<driveItemId>".
+            InputStream resultingInputStream = microsoftGraphFetcher.fetch(siteDriveId + "," + driveItemid, new Metadata());
+            Assertions.assertEquals(content, IOUtils.toString(resultingInputStream, StandardCharsets.UTF_8));
+        }
+    }
+}
\ No newline at end of file
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/resources/log4j2.xml b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/resources/log4j2.xml
new file mode 100644
index 000000000..c88e66e99
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/test/resources/log4j2.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<Configuration status="WARN">
+  <Appenders>
+    <Console name="Console" target="SYSTEM_ERR">
+      <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
+    </Console>
+  </Appenders>
+  <Loggers>
+    <Root level="info">
+      <AppenderRef ref="Console"/>
+    </Root>
+  </Loggers>
+</Configuration>
\ No newline at end of file