You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/10/26 17:26:50 UTC

[tika] branch main updated (6ea0106 -> 452aa91)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 6ea0106  TIKA-3583 -- align default timeouts between default config, hard coded default and documentation
     new 997757c  rename static variables to all caps
     new cd0c3ad  fix spellings after uppercasing
     new 452aa91  TIKA-3582 -- add parse timeouts per task in /rmeta and /tika. This timeout cannot be greater than taskTimeoutMillis.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |  6 ++-
 .../org/apache/tika/config/TikaTaskTimeout.java    | 28 ++++++-----
 .../tika/parser/external2/ExternalParser.java      |  5 +-
 .../src/test/java/org/apache/tika/TikaTest.java    |  7 +++
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 25 +++++++---
 .../tika/parser/ocr/TesseractOCRParserTest.java    | 19 ++++++++
 ...ika-config-psm0.xml => TIKA-3582-tesseract.xml} |  2 +-
 .../org/apache/tika/server/core/ServerStatus.java  |  6 ++-
 .../tika/server/core/ServerStatusWatcher.java      | 16 ++++---
 .../org/apache/tika/server/core/TaskStatus.java    |  8 ++--
 .../apache/tika/server/core/TikaServerProcess.java |  6 ++-
 .../TimeoutConfig.java}                            | 21 ++++----
 .../server/core/resource/DetectorResource.java     |  6 ++-
 .../tika/server/core/resource/TikaResource.java    | 56 +++++++++++++++-------
 .../server/core/resource/TranslateResource.java    |  7 ++-
 .../org.apache.tika.server.core.ParseContextConfig |  3 +-
 .../apache/tika/server/core/ServerStatusTest.java  |  2 +-
 .../server/core/TikaServerIntegrationTest.java     | 26 ++++++++++
 .../tika/server/core/TranslateResourceTest.java    |  3 +-
 19 files changed, 179 insertions(+), 73 deletions(-)
 copy tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/TextAndAttributeXMLParser.java => tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java (59%)
 copy tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/resources/test-configs/{tika-config-psm0.xml => TIKA-3582-tesseract.xml} (94%)
 copy tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/{CompositeParseContextConfig.java => config/TimeoutConfig.java} (67%)

[tika] 01/03: rename static variables to all caps

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 997757c04b1377fceeee055ffe7f951c7fabb5b2
Author: tballison <ta...@apache.org>
AuthorDate: Tue Oct 26 11:15:48 2021 -0400

    rename static variables to all caps
---
 .../tika/server/core/resource/TikaResource.java    | 30 +++++++++++-----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index b8042c7..220811d 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -93,10 +93,10 @@ public class TikaResource {
     private static final String META_PREFIX = "meta_";
     private static final Logger LOG = LoggerFactory.getLogger(TikaResource.class);
     private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_+\\.A-Z0-9 ]+$");
-    private static TikaConfig tikaConfig;
-    private static TikaServerConfig tikaServerConfig;
-    private static DigestingParser.Digester digester = null;
-    private static InputStreamFactory inputStreamFactory = null;
+    private static TikaConfig TIKA_CONFIG;
+    private static TikaServerConfig TIKA_SERVER_CONFIG;
+    private static DigestingParser.Digester DIGESTER = null;
+    private static InputStreamFactory INPUTSTREAM_FACTORY = null;
     private static ServerStatus SERVER_STATUS = null;
 
     private static ParseContextConfig PARSE_CONTEXT_CONFIG = new CompositeParseContextConfig();
@@ -105,26 +105,26 @@ public class TikaResource {
     public static void init(TikaConfig config, TikaServerConfig tikaServerConfg,
                             DigestingParser.Digester digestr,
                             InputStreamFactory iSF, ServerStatus serverStatus) {
-        tikaConfig = config;
-        tikaServerConfig = tikaServerConfg;
-        digester = digestr;
-        inputStreamFactory = iSF;
+        TIKA_CONFIG = config;
+        TIKA_SERVER_CONFIG = tikaServerConfg;
+        DIGESTER = digestr;
+        INPUTSTREAM_FACTORY = iSF;
         SERVER_STATUS = serverStatus;
     }
 
 
     @SuppressWarnings("serial")
     public static Parser createParser() {
-        final Parser parser = new AutoDetectParser(tikaConfig);
+        final Parser parser = new AutoDetectParser(TIKA_CONFIG);
 
-        if (digester != null) {
-            return new DigestingParser(parser, digester);
+        if (DIGESTER != null) {
+            return new DigestingParser(parser, DIGESTER);
         }
         return parser;
     }
 
     public static TikaConfig getConfig() {
-        return tikaConfig;
+        return TIKA_CONFIG;
     }
 
     public static String detectFilename(MultivaluedMap<String, String> httpHeaders) {
@@ -154,7 +154,7 @@ public class TikaResource {
     public static InputStream getInputStream(InputStream is, Metadata metadata,
                                              HttpHeaders headers) {
         try {
-            return inputStreamFactory.getInputStream(is, metadata, headers);
+            return INPUTSTREAM_FACTORY.getInputStream(is, metadata, headers);
         } catch (IOException e) {
             throw new TikaServerParseException(e);
         }
@@ -576,7 +576,7 @@ public class TikaResource {
                 metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
                 writeLimitReached = true;
             }
-            if (tikaServerConfig.isReturnStackTrace()) {
+            if (TIKA_SERVER_CONFIG.isReturnStackTrace()) {
                 if (cause != null) {
                     metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
                             ExceptionUtils.getStackTrace(cause));
@@ -588,7 +588,7 @@ public class TikaResource {
                 throw e;
             }
         } catch (OutOfMemoryError e) {
-            if (tikaServerConfig.isReturnStackTrace()) {
+            if (TIKA_SERVER_CONFIG.isReturnStackTrace()) {
                 metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
                         ExceptionUtils.getStackTrace(e));
             } else {

[tika] 03/03: TIKA-3582 -- add parse timeouts per task in /rmeta and /tika. This timeout cannot be greater than taskTimeoutMillis.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 452aa916b5eee1fea176211e6b5711e06412a50f
Author: tballison <ta...@apache.org>
AuthorDate: Tue Oct 26 13:26:44 2021 -0400

    TIKA-3582 -- add parse timeouts per task in /rmeta and /tika. This timeout cannot be greater than taskTimeoutMillis.
---
 CHANGES.txt                                        |  6 ++++-
 .../org/apache/tika/config/TikaTaskTimeout.java    | 31 +++++++++++-----------
 .../tika/parser/external2/ExternalParser.java      |  5 ++--
 .../src/test/java/org/apache/tika/TikaTest.java    |  7 +++++
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 25 ++++++++++++-----
 .../tika/parser/ocr/TesseractOCRParserTest.java    | 19 +++++++++++++
 .../resources/test-configs/TIKA-3582-tesseract.xml | 29 ++++++++++++++++++++
 .../org/apache/tika/server/core/ServerStatus.java  |  6 +++--
 .../tika/server/core/ServerStatusWatcher.java      | 16 ++++++-----
 .../org/apache/tika/server/core/TaskStatus.java    |  8 +++---
 .../apache/tika/server/core/TikaServerProcess.java |  6 +++--
 .../{TaskStatus.java => config/TimeoutConfig.java} | 29 ++++++++++----------
 .../server/core/resource/DetectorResource.java     |  6 ++++-
 .../tika/server/core/resource/TikaResource.java    | 21 ++++++++++++++-
 .../server/core/resource/TranslateResource.java    |  7 +++--
 .../org.apache.tika.server.core.ParseContextConfig |  3 ++-
 .../apache/tika/server/core/ServerStatusTest.java  |  2 +-
 .../server/core/TikaServerIntegrationTest.java     | 26 ++++++++++++++++++
 .../tika/server/core/TranslateResourceTest.java    |  3 ++-
 19 files changed, 194 insertions(+), 61 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 846ca6e..368faca 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,10 @@
 Release 2.1.1 - ???
 
-   * Add metadata item whether or not a PDF has a collection/
+   * Add timeout per task to be configured via headers
+     for tika-server's legacy endpoints /tika and /rmeta.
+     Note that this timeout cannot be greater than taskTimeoutMillis (TIKA-3582).
+
+   * Add metadata item for whether or not a PDF has a collection/
      is a Portfolio PDF (TIKA-3579).
 
    * Add detection of ESRI Layer files (TIKA-3570).
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java b/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java
similarity index 56%
copy from tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java
copy to tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java
index 07d1373..821be35 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java
@@ -14,26 +14,27 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.server.core;
+package org.apache.tika.config;
 
-import java.time.Instant;
-import java.util.Optional;
+import org.apache.tika.parser.ParseContext;
 
-public class TaskStatus {
-    final ServerStatus.TASK task;
-    final Instant started;
-    final Optional<String> fileName;
+public class TikaTaskTimeout {
 
-    TaskStatus(ServerStatus.TASK task, Instant started, String fileName) {
-        this.task = task;
-        this.started = started;
-        this.fileName = Optional.ofNullable(fileName);
-    }
+    private final long timeoutMillis;
 
+    public TikaTaskTimeout(long timeoutMillis) {
+        this.timeoutMillis = timeoutMillis;
+    }
 
-    @Override
-    public String toString() {
-        return "";
+    public long getTimeoutMillis() {
+        return timeoutMillis;
     }
 
+    public static long getTimeoutMillis(ParseContext context, long defaultTimeoutMillis) {
+        TikaTaskTimeout tikaTaskTimeout = context.get(TikaTaskTimeout.class);
+        if (tikaTaskTimeout == null) {
+            return defaultTimeoutMillis;
+        }
+        return tikaTaskTimeout.getTimeoutMillis();
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
index 9d71e7e..efcf7f0 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
@@ -39,6 +39,7 @@ import org.apache.tika.config.Field;
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.InitializableProblemHandler;
 import org.apache.tika.config.Param;
+import org.apache.tika.config.TikaTaskTimeout;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
@@ -125,9 +126,9 @@ public class ExternalParser extends AbstractParser implements Initializable {
                     thisCommandLine.add(c);
                 }
             }
-
+            long localTimeoutMillis = TikaTaskTimeout.getTimeoutMillis(context, timeoutMs);
             FileProcessResult result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine),
-                    timeoutMs, maxStdOut, maxStdErr);
+                    localTimeoutMillis, maxStdOut, maxStdErr);
             metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
             metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
             metadata.set(ExternalProcess.STD_OUT_LENGTH, result.getStdoutLength());
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 48fccd8..abd2398 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -241,6 +241,13 @@ public abstract class TikaTest {
         return getXML(filePath, AUTO_DETECT_PARSER, parseContext);
     }
 
+    protected XMLResult getXML(String filePath, Parser parser, Metadata metadata,
+                               ParseContext parseContext)
+            throws Exception {
+        return getXML(getResourceAsStream("/test-documents/" + filePath), parser,
+                metadata, parseContext);
+    }
+
     protected XMLResult getXML(String filePath, Metadata metadata, ParseContext parseContext)
             throws Exception {
         return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 2cc7aba..c52c21c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -61,6 +61,7 @@ import org.apache.tika.config.Field;
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.InitializableProblemHandler;
 import org.apache.tika.config.Param;
+import org.apache.tika.config.TikaTaskTimeout;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
@@ -282,7 +283,7 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
                                 "User has selected to preprocess images, " +
                                         "but I can't find ImageMagick." +
                                         "Backing off to original file.");
-                        doOCR(input.toFile(), tmpOCROutputFile, config);
+                        doOCR(input.toFile(), tmpOCROutputFile, config, parseContext);
                     } else {
                         // copy the contents of the original input file into a temporary file
                         // which will be preprocessed for OCR
@@ -291,11 +292,11 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
                             Path tmpFile = tmp.createTempFile();
                             Files.copy(input, tmpFile, StandardCopyOption.REPLACE_EXISTING);
                             imagePreprocessor.process(tmpFile, tmpFile, metadata, config);
-                            doOCR(tmpFile.toFile(), tmpOCROutputFile, config);
+                            doOCR(tmpFile.toFile(), tmpOCROutputFile, config, parseContext);
                         }
                     }
                 } else {
-                    doOCR(input.toFile(), tmpOCROutputFile, config);
+                    doOCR(input.toFile(), tmpOCROutputFile, config, parseContext);
                 }
 
                 String extension = config.getPageSegMode().equals("0") ? "osd" :
@@ -375,7 +376,7 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
      * @throws TikaException if the extraction timed out
      * @throws IOException   if an input error occurred
      */
-    private void doOCR(File input, File output, TesseractOCRConfig config)
+    private void doOCR(File input, File output, TesseractOCRConfig config, ParseContext parseContext)
             throws IOException, TikaException {
 
         ArrayList<String> cmd = new ArrayList<>(
@@ -403,10 +404,12 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
 
         Process process = null;
         String id = null;
+        long timeoutMillis = TikaTaskTimeout.getTimeoutMillis(parseContext,
+                config.getTimeoutSeconds() * 1000);
         try {
             process = pb.start();
             id = register(process);
-            runOCRProcess(process, config.getTimeoutSeconds());
+            runOCRProcess(process, timeoutMillis);
         } finally {
             if (process != null) {
                 process.destroyForcibly();
@@ -417,7 +420,8 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
         }
     }
 
-    private void runOCRProcess(Process process, int timeout) throws IOException, TikaException {
+    private void runOCRProcess(Process process, long timeoutMillis) throws IOException,
+            TikaException {
         process.getOutputStream().close();
         InputStream out = process.getInputStream();
         InputStream err = process.getErrorStream();
@@ -430,7 +434,7 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
 
         int exitValue = Integer.MIN_VALUE;
         try {
-            boolean finished = process.waitFor(timeout, TimeUnit.SECONDS);
+            boolean finished = process.waitFor(timeoutMillis, TimeUnit.MILLISECONDS);
             if (!finished) {
                 throw new TikaException("TesseractOCRParser timeout");
             }
@@ -685,6 +689,13 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
         defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr);
     }
 
+    /**
+     * Set default timeout in seconds.  This can be overridden per parse
+     * with {@link TikaTaskTimeout} sent in via the {@link ParseContext}
+     * at parse time.
+     *
+     * @param timeout default timeout in seconds
+     */
     @Field
     public void setTimeout(int timeout) {
         defaultConfig.setTimeoutSeconds(timeout);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 545f132..776e3dd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -34,6 +34,7 @@ import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaTaskTimeout;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -175,6 +176,24 @@ public class TesseractOCRParserTest extends TikaTest {
         }
     }
 
+
+    @Test
+    public void testTimeoutOverride() throws Exception {
+        assumeTrue(canRun(), "can run OCR");
+
+        try (InputStream is = getResourceAsStream("/test-configs/TIKA-3582-tesseract.xml")) {
+            TikaConfig config = new TikaConfig(is);
+            Parser p = new AutoDetectParser(config);
+            Metadata m = new Metadata();
+            ParseContext parseContext = new ParseContext();
+            parseContext.set(TikaTaskTimeout.class, new TikaTaskTimeout(50));
+            getXML("testRotated+10.png", p, m, parseContext);
+            fail("should have thrown a timeout");
+        } catch (TikaException e) {
+            assertContains("timeout", e.getMessage());
+        }
+    }
+
     @Test
     public void testPSM0() throws Exception {
         assumeTrue(canRun(), "can run OCR");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-3582-tesseract.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-3582-tesseract.xml
new file mode 100644
index 0000000..a569f93
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-3582-tesseract.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+            <params>
+                <param name="timeout" type="int">360</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatus.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatus.java
index 794dfb2..fff678a 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatus.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatus.java
@@ -35,20 +35,22 @@ public class ServerStatus {
     private Map<Long, TaskStatus> tasks = new HashMap<>();
     private STATUS status = STATUS.OPERATING;
     private volatile long lastStarted = Instant.now().toEpochMilli();
+
     public ServerStatus(String serverId, int numRestarts) {
         this(serverId, numRestarts, false);
     }
+
     public ServerStatus(String serverId, int numRestarts, boolean isLegacy) {
         this.serverId = serverId;
         this.numRestarts = numRestarts;
         this.isLegacy = isLegacy;
     }
 
-    public synchronized long start(TASK task, String fileName) {
+    public synchronized long start(TASK task, String fileName, long timeoutMillis) {
         long taskId = counter.incrementAndGet();
         Instant now = Instant.now();
         lastStarted = now.toEpochMilli();
-        tasks.put(taskId, new TaskStatus(task, now, fileName));
+        tasks.put(taskId, new TaskStatus(task, now, fileName, timeoutMillis));
         return taskId;
     }
 
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatusWatcher.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatusWatcher.java
index 26f47be..adfc90e 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatusWatcher.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/ServerStatusWatcher.java
@@ -32,6 +32,8 @@ import java.time.Instant;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.tika.server.core.config.TimeoutConfig;
+
 public class ServerStatusWatcher implements Runnable {
 
 
@@ -128,19 +130,19 @@ public class ServerStatusWatcher implements Runnable {
         Instant now = Instant.now();
         for (TaskStatus status : serverStatus.getTasks().values()) {
             long millisElapsed = Duration.between(status.started, now).toMillis();
-            if (millisElapsed > tikaServerConfig.getTaskTimeoutMillis()) {
+            if (millisElapsed > status.timeoutMillis) {
                 serverStatus.setStatus(ServerStatus.STATUS.TIMEOUT);
                 if (status.fileName.isPresent()) {
-                    LOG.error("Timeout task {}, millis elapsed {}, file {}" +
+                    LOG.error("Timeout task {}, millis elapsed {}, timeoutMillis {}, file id {}" +
                                     "consider increasing the allowable time with the " +
-                                    "<taskTimeoutMillis/> parameter", status.task.toString(),
-                            millisElapsed,
-                            status.fileName.get());
+                                    "<taskTimeoutMillis/> parameter or the {} header", status.task.toString(),
+                            millisElapsed, status.timeoutMillis,
+                            status.fileName.get(), TimeoutConfig.X_TIKA_TIMEOUT_MILLIS);
                 } else {
                     LOG.error("Timeout task {}, millis elapsed {}; " +
                                     "consider increasing the allowable time with the " +
-                                    "<taskTimeoutMillis/> parameter", status.task.toString(),
-                            millisElapsed);
+                                    "<taskTimeoutMillis/> parameter or the {} header", status.task.toString(),
+                            millisElapsed, TimeoutConfig.X_TIKA_TIMEOUT_MILLIS);
                 }
             }
         }
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java
index 07d1373..016c214 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java
@@ -23,17 +23,19 @@ public class TaskStatus {
     final ServerStatus.TASK task;
     final Instant started;
     final Optional<String> fileName;
+    final long timeoutMillis;
 
-    TaskStatus(ServerStatus.TASK task, Instant started, String fileName) {
+    TaskStatus(ServerStatus.TASK task, Instant started, String fileName, long timeoutMillis) {
         this.task = task;
         this.started = started;
         this.fileName = Optional.ofNullable(fileName);
+        this.timeoutMillis = timeoutMillis;
     }
 
 
     @Override
     public String toString() {
-        return "";
+        return "TaskStatus{" + "task=" + task + ", started=" + started + ", fileName=" + fileName +
+                ", timeoutMillis=" + timeoutMillis + '}';
     }
-
 }
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
index 2321686..ccb3933 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
@@ -325,7 +325,8 @@ public class TikaServerProcess {
                     .add(new SingletonResourceProvider(new DetectorResource(serverStatus)));
             resourceProviders.add(new SingletonResourceProvider(new LanguageResource()));
             resourceProviders
-                    .add(new SingletonResourceProvider(new TranslateResource(serverStatus)));
+                    .add(new SingletonResourceProvider(new TranslateResource(serverStatus,
+                            tikaServerConfig.getTaskTimeoutMillis())));
             resourceProviders.add(new SingletonResourceProvider(new TikaResource()));
             resourceProviders.add(new SingletonResourceProvider(new UnpackerResource()));
             resourceProviders.add(new SingletonResourceProvider(new TikaMimeTypes()));
@@ -355,7 +356,8 @@ public class TikaServerProcess {
                 } else if ("language".equals(endPoint)) {
                     resourceProviders.add(new SingletonResourceProvider(new LanguageResource()));
                 } else if ("translate".equals(endPoint)) {
-                    resourceProviders.add(new SingletonResourceProvider(new TranslateResource(serverStatus)));
+                    resourceProviders.add(new SingletonResourceProvider(new TranslateResource(
+                            serverStatus, tikaServerConfig.getTaskTimeoutMillis())));
                 } else if ("tika".equals(endPoint)) {
                     resourceProviders.add(new SingletonResourceProvider(new TikaResource()));
                 } else if ("unpack".equals(endPoint)) {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/config/TimeoutConfig.java
similarity index 51%
copy from tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java
copy to tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/config/TimeoutConfig.java
index 07d1373..9954b37 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TaskStatus.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/config/TimeoutConfig.java
@@ -14,26 +14,25 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.server.core;
+package org.apache.tika.server.core.config;
 
-import java.time.Instant;
-import java.util.Optional;
+import javax.ws.rs.core.MultivaluedMap;
 
-public class TaskStatus {
-    final ServerStatus.TASK task;
-    final Instant started;
-    final Optional<String> fileName;
+import org.apache.tika.config.TikaTaskTimeout;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.server.core.ParseContextConfig;
 
-    TaskStatus(ServerStatus.TASK task, Instant started, String fileName) {
-        this.task = task;
-        this.started = started;
-        this.fileName = Optional.ofNullable(fileName);
-    }
+public class TimeoutConfig implements ParseContextConfig {
 
+    public static final String X_TIKA_TIMEOUT_MILLIS = "X-Tika-Timeout-Millis";
 
     @Override
-    public String toString() {
-        return "";
+    public void configure(MultivaluedMap<String, String> httpHeaders, Metadata metadata,
+                          ParseContext context) {
+        if (httpHeaders.containsKey(X_TIKA_TIMEOUT_MILLIS)) {
+            long timeout = Long.parseLong(httpHeaders.getFirst(X_TIKA_TIMEOUT_MILLIS));
+            context.set(TikaTaskTimeout.class, new TikaTaskTimeout(timeout));
+        }
     }
-
 }
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java
index 62050d4..c89bf2c 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java
@@ -34,6 +34,7 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.server.core.ServerStatus;
 
 @Path("/detect")
@@ -56,7 +57,10 @@ public class DetectorResource {
         String filename = TikaResource.detectFilename(httpHeaders.getRequestHeaders());
         LOG.info("Detecting media type for Filename: {}", filename);
         met.add(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
-        long taskId = serverStatus.start(ServerStatus.TASK.DETECT, filename);
+        ParseContext parseContext = new ParseContext();
+        TikaResource.fillParseContext(httpHeaders.getRequestHeaders(), met, parseContext);
+        long timeoutMillis = TikaResource.getTaskTimeout(parseContext);
+        long taskId = serverStatus.start(ServerStatus.TASK.DETECT, filename, timeoutMillis);
 
         try (TikaInputStream tis = TikaInputStream
                 .get(TikaResource.getInputStream(is, met, httpHeaders))) {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 5b0a98d..2757cfb 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -340,7 +340,9 @@ public class TikaResource {
 
         checkIsOperating();
         String fileName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
-        long taskId = SERVER_STATUS.start(ServerStatus.TASK.PARSE, fileName);
+        long timeoutMillis = getTaskTimeout(parseContext);
+
+        long taskId = SERVER_STATUS.start(ServerStatus.TASK.PARSE, fileName, timeoutMillis);
         try {
             parser.parse(inputStream, handler, metadata, parseContext);
         } catch (SAXException e) {
@@ -363,6 +365,23 @@ public class TikaResource {
         }
     }
 
+    protected static long getTaskTimeout(ParseContext parseContext) {
+
+        TikaTaskTimeout tikaTaskTimeout = parseContext.get(TikaTaskTimeout.class);
+        long timeoutMillis = TIKA_SERVER_CONFIG.getTaskTimeoutMillis();
+
+        if (tikaTaskTimeout != null) {
+            if (tikaTaskTimeout.getTimeoutMillis() > TIKA_SERVER_CONFIG.getTaskTimeoutMillis()) {
+                throw new IllegalArgumentException("Can't request a timeout ( " +
+                        tikaTaskTimeout.getTimeoutMillis() +
+                        "ms) greater than the taskTimeoutMillis set in the server config (" +
+                        TIKA_SERVER_CONFIG.getTaskTimeoutMillis() + "ms)");
+            }
+            timeoutMillis = tikaTaskTimeout.getTimeoutMillis();
+        }
+        return timeoutMillis;
+    }
+
     public static void checkIsOperating() {
         //check that server is not in shutdown mode
         if (!SERVER_STATUS.isOperating()) {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TranslateResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TranslateResource.java
index 416f208..05d7443 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TranslateResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TranslateResource.java
@@ -47,12 +47,14 @@ public class TranslateResource {
     private final ServerStatus serverStatus;
     private Translator defaultTranslator;
     private ServiceLoader loader;
+    private final long timeoutMillis;
 
-    public TranslateResource(ServerStatus serverStatus) {
+    public TranslateResource(ServerStatus serverStatus, long timeoutMillis) {
         this.loader =
                 new ServiceLoader(ServiceLoader.class.getClassLoader(), LoadErrorHandler.WARN);
         this.defaultTranslator = TikaResource.getConfig().getTranslator();
         this.serverStatus = serverStatus;
+        this.timeoutMillis = timeoutMillis;
     }
 
     @PUT
@@ -82,6 +84,7 @@ public class TranslateResource {
 
         String sLang = language.getLanguage();
         LOG.info("LanguageIdentifier: detected source lang: [{}]", sLang);
+
         return doTranslate(content, translator, sLang, dLang);
     }
 
@@ -94,7 +97,7 @@ public class TranslateResource {
             LOG.info("Using default translator");
         }
         TikaResource.checkIsOperating();
-        long taskId = serverStatus.start(ServerStatus.TASK.TRANSLATE, null);
+        long taskId = serverStatus.start(ServerStatus.TASK.TRANSLATE, null, timeoutMillis);
         try {
             return translate.translate(content, sLang, dLang);
         } catch (OutOfMemoryError e) {
diff --git a/tika-server/tika-server-core/src/main/resources/META-INF/services/org.apache.tika.server.core.ParseContextConfig b/tika-server/tika-server-core/src/main/resources/META-INF/services/org.apache.tika.server.core.ParseContextConfig
index 7996345..b1bd8ce 100644
--- a/tika-server/tika-server-core/src/main/resources/META-INF/services/org.apache.tika.server.core.ParseContextConfig
+++ b/tika-server/tika-server-core/src/main/resources/META-INF/services/org.apache.tika.server.core.ParseContextConfig
@@ -13,4 +13,5 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 org.apache.tika.server.core.config.PasswordProviderConfig
-org.apache.tika.server.core.config.DocumentSelectorConfig
\ No newline at end of file
+org.apache.tika.server.core.config.DocumentSelectorConfig
+org.apache.tika.server.core.config.TimeoutConfig
\ No newline at end of file
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ServerStatusTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ServerStatusTest.java
index 6d8fb56..8888138 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ServerStatusTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ServerStatusTest.java
@@ -85,7 +85,7 @@ public class ServerStatusTest {
             int processed = 0;
             for (int i = 0; i < filesToProcess; i++) {
                 sleepRandom(200);
-                long taskId = serverStatus.start(ServerStatus.TASK.PARSE, null);
+                long taskId = serverStatus.start(ServerStatus.TASK.PARSE, null, 60000);
                 sleepRandom(100);
                 serverStatus.complete(taskId);
                 processed++;
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
index f9b21a0..c073657 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
@@ -44,6 +44,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.server.core.config.TimeoutConfig;
 import org.apache.tika.utils.ProcessUtils;
 
 public class TikaServerIntegrationTest extends IntegrationTestBase {
@@ -102,6 +103,31 @@ public class TikaServerIntegrationTest extends IntegrationTestBase {
     }
 
     @Test
+    public void testTaskTimeoutHeader() throws Exception {
+
+        startProcess(new String[]{"-config", getConfig(
+                "tika-config-server-basic.xml")});
+        awaitServerStartup();
+        String serverId = getServerId();
+        Response response = null;
+        try {
+            response = WebClient.create(endPoint + META_PATH)
+                    .accept("application/json")
+                    .header(TimeoutConfig.X_TIKA_TIMEOUT_MILLIS, 100)
+                    .put(ClassLoader.getSystemResourceAsStream(TEST_HEAVY_HANG));
+        } catch (Exception e) {
+            //a task timeout may or may not cause an exception depending
+            //on the timing
+        }
+        //give some time for the server to crash/terminate itself
+        Thread.sleep(2000);
+        testBaseline();
+        assertEquals(serverId, getServerId());
+        assertTrue(getNumRestarts() > 0);
+        assertTrue(getNumRestarts() < 3);
+    }
+
+    @Test
     public void testSameDeclaredServerIdAfterOOM() throws Exception {
         String serverId = "qwertyuiop";
         startProcess(
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TranslateResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TranslateResourceTest.java
index 4b77fdd..f54a837 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TranslateResourceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TranslateResourceTest.java
@@ -48,7 +48,8 @@ public class TranslateResourceTest extends CXFTestBase {
     protected void setUpResources(JAXRSServerFactoryBean sf) {
         sf.setResourceClasses(TranslateResource.class);
         sf.setResourceProvider(TranslateResource.class,
-                new SingletonResourceProvider(new TranslateResource(new ServerStatus("", 0))));
+                new SingletonResourceProvider(new TranslateResource(
+                        new ServerStatus("", 0), 60000)));
 
     }
 

[tika] 02/03: fix spellings after uppercasing

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit cd0c3ad575d7d87eb9f323bbf47d06db806fbaaa
Author: tballison <ta...@apache.org>
AuthorDate: Tue Oct 26 11:32:59 2021 -0400

    fix spellings after uppercasing
---
 .../java/org/apache/tika/server/core/resource/TikaResource.java  | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 220811d..5b0a98d 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -63,6 +63,7 @@ import org.xml.sax.SAXException;
 
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaTaskTimeout;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
@@ -103,12 +104,12 @@ public class TikaResource {
 
 
     public static void init(TikaConfig config, TikaServerConfig tikaServerConfg,
-                            DigestingParser.Digester digestr,
-                            InputStreamFactory iSF, ServerStatus serverStatus) {
+                            DigestingParser.Digester digester,
+                            InputStreamFactory inputStreamFactory, ServerStatus serverStatus) {
         TIKA_CONFIG = config;
         TIKA_SERVER_CONFIG = tikaServerConfg;
-        DIGESTER = digestr;
-        INPUTSTREAM_FACTORY = iSF;
+        DIGESTER = digester;
+        INPUTSTREAM_FACTORY = inputStreamFactory;
         SERVER_STATUS = serverStatus;
     }