You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/11/27 19:53:56 UTC

[tika] branch master updated: TIKA-2511 Cache TikaConfig in EmbeddedDocumentUtil for faster processing of files with lots of embedded files.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new ce4d948  TIKA-2511 Cache TikaConfig in EmbeddedDocumentUtil for faster processing of files with lots of embedded files.
ce4d948 is described below

commit ce4d94888557a08f0535038e1586f235af0065b3
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Nov 27 14:53:48 2017 -0500

    TIKA-2511 Cache TikaConfig in EmbeddedDocumentUtil for faster processing
    of files with lots of embedded files.
---
 CHANGES.txt                                        |  3 ++
 .../tika/extractor/EmbeddedDocumentUtil.java       | 56 +++++++++++++++-------
 2 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8c74689..38ec856 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.17 - ???
 
+  * Cache TikaConfig in EmbeddedDocumentUtil for better performance
+    in documents with a large number of attachments (TIKA-2511).
+
   * Extract media files from ooxml (TIKA-2510).
 
   * Standardize the way the Image and Video captioning 
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 5e1f054..c03a871 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -51,6 +51,7 @@ import org.xml.sax.SAXException;
  */
 public class EmbeddedDocumentUtil implements Serializable {
 
+
     private final ParseContext context;
     private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
     //these are lazily initialized and can be null
@@ -99,27 +100,38 @@ public class EmbeddedDocumentUtil implements Serializable {
     }
 
     public Detector getDetector() {
-        //be as lazy as possible and cache the detector
-        if (detector == null) {
-            detector = context.get(Detector.class);
-            if (detector == null) {
-                detector = getTikaConfig().getDetector();
-            }
+        //be as lazy as possible and cache
+        Detector localDetector = context.get(Detector.class);
+        if (localDetector != null) {
+            return localDetector;
         }
+        if (detector != null) {
+            return detector;
+        }
+
+        detector = getTikaConfig().getDetector();
         return detector;
     }
 
     public MimeTypes getMimeTypes() {
+        MimeTypes localMimeTypes = context.get(MimeTypes.class);
         //be as lazy as possible and cache the mimeTypes
-        if (mimeTypes == null) {
-            mimeTypes = context.get(MimeTypes.class);
-            if (mimeTypes == null) {
-                mimeTypes = getTikaConfig().getMimeRepository();
-            }
+        if (localMimeTypes != null) {
+            return localMimeTypes;
+        }
+        if (mimeTypes != null) {
+            return mimeTypes;
         }
+        mimeTypes = getTikaConfig().getMimeRepository();
         return mimeTypes;
     }
 
+    /**
+     * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext
+     * that was included during initialization, and then creating a new one
+     * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the
+     * ParseContext. This caches the default config so that it only has to be created once.
+     */
     public TikaConfig getTikaConfig() {
         //be as lazy as possible and cache the TikaConfig
         if (tikaConfig == null) {
@@ -133,22 +145,23 @@ public class EmbeddedDocumentUtil implements Serializable {
 
     public String getExtension(TikaInputStream is, Metadata metadata) {
         String mimeString = metadata.get(Metadata.CONTENT_TYPE);
-        TikaConfig config = getConfig();
+
+        //use the buffered mimetypes as default
+        MimeTypes localMimeTypes = getMimeTypes();
+
         MimeType mimeType = null;
-        MimeTypes types = config.getMimeRepository();
         boolean detected = false;
         if (mimeString != null) {
             try {
-                mimeType = types.forName(mimeString);
+                mimeType = localMimeTypes.forName(mimeString);
             } catch (MimeTypeException e) {
                 //swallow
             }
         }
         if (mimeType == null) {
-            Detector detector = config.getDetector();
             try {
-                MediaType mediaType = detector.detect(is, metadata);
-                mimeType = types.forName(mediaType.toString());
+                MediaType mediaType = getDetector().detect(is, metadata);
+                mimeType = localMimeTypes.forName(mediaType.toString());
                 detected = true;
                 is.reset();
             } catch (IOException e) {
@@ -167,6 +180,15 @@ public class EmbeddedDocumentUtil implements Serializable {
         return ".bin";
     }
 
+    /**
+     * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext
+     * that was included in the initialization, and then creating a new one
+     * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the
+     * ParseContext.
+     *
+     * @deprecated as of 1.17, use {@link #getTikaConfig()} instead
+     */
+    @Deprecated
     public TikaConfig getConfig() {
         TikaConfig config = context.get(TikaConfig.class);
         if (config == null) {

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].