You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/11/27 19:53:56 UTC
[tika] branch master updated: TIKA-2511 Cache TikaConfig in
EmbeddedDocumentUtil for faster processing of files with lots of embedded
files.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new ce4d948 TIKA-2511 Cache TikaConfig in EmbeddedDocumentUtil for faster processing of files with lots of embedded files.
ce4d948 is described below
commit ce4d94888557a08f0535038e1586f235af0065b3
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Nov 27 14:53:48 2017 -0500
TIKA-2511 Cache TikaConfig in EmbeddedDocumentUtil for faster processing
of files with lots of embedded files.
---
CHANGES.txt | 3 ++
.../tika/extractor/EmbeddedDocumentUtil.java | 56 +++++++++++++++-------
2 files changed, 42 insertions(+), 17 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 8c74689..38ec856 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.17 - ???
+ * Cache TikaConfig in EmbeddedDocumentUtil for better performance
+ in documents with a large number of attachments (TIKA-2511).
+
* Extract media files from ooxml (TIKA-2510).
* Standardize the way the Image and Video captioning
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 5e1f054..c03a871 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -51,6 +51,7 @@ import org.xml.sax.SAXException;
*/
public class EmbeddedDocumentUtil implements Serializable {
+
private final ParseContext context;
private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
//these are lazily initialized and can be null
@@ -99,27 +100,38 @@ public class EmbeddedDocumentUtil implements Serializable {
}
public Detector getDetector() {
- //be as lazy as possible and cache the detector
- if (detector == null) {
- detector = context.get(Detector.class);
- if (detector == null) {
- detector = getTikaConfig().getDetector();
- }
+ //be as lazy as possible and cache
+ Detector localDetector = context.get(Detector.class);
+ if (localDetector != null) {
+ return localDetector;
}
+ if (detector != null) {
+ return detector;
+ }
+
+ detector = getTikaConfig().getDetector();
return detector;
}
public MimeTypes getMimeTypes() {
+ MimeTypes localMimeTypes = context.get(MimeTypes.class);
//be as lazy as possible and cache the mimeTypes
- if (mimeTypes == null) {
- mimeTypes = context.get(MimeTypes.class);
- if (mimeTypes == null) {
- mimeTypes = getTikaConfig().getMimeRepository();
- }
+ if (localMimeTypes != null) {
+ return localMimeTypes;
+ }
+ if (mimeTypes != null) {
+ return mimeTypes;
}
+ mimeTypes = getTikaConfig().getMimeRepository();
return mimeTypes;
}
+ /**
+ * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext
+ * that was included during initialization, and then creating a new one
+ * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the
+ * ParseContext. This caches the default config so that it only has to be created once.
+ */
public TikaConfig getTikaConfig() {
//be as lazy as possible and cache the TikaConfig
if (tikaConfig == null) {
@@ -133,22 +145,23 @@ public class EmbeddedDocumentUtil implements Serializable {
public String getExtension(TikaInputStream is, Metadata metadata) {
String mimeString = metadata.get(Metadata.CONTENT_TYPE);
- TikaConfig config = getConfig();
+
+ //use the buffered mimetypes as default
+ MimeTypes localMimeTypes = getMimeTypes();
+
MimeType mimeType = null;
- MimeTypes types = config.getMimeRepository();
boolean detected = false;
if (mimeString != null) {
try {
- mimeType = types.forName(mimeString);
+ mimeType = localMimeTypes.forName(mimeString);
} catch (MimeTypeException e) {
//swallow
}
}
if (mimeType == null) {
- Detector detector = config.getDetector();
try {
- MediaType mediaType = detector.detect(is, metadata);
- mimeType = types.forName(mediaType.toString());
+ MediaType mediaType = getDetector().detect(is, metadata);
+ mimeType = localMimeTypes.forName(mediaType.toString());
detected = true;
is.reset();
} catch (IOException e) {
@@ -167,6 +180,15 @@ public class EmbeddedDocumentUtil implements Serializable {
return ".bin";
}
+ /**
+ * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext
+ * that was included in the initialization, and then creating a new one
+ * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the
+ * ParseContext.
+ *
+ * @deprecated as of 1.17, use {@link #getTikaConfig()} instead
+ */
+ @Deprecated
public TikaConfig getConfig() {
TikaConfig config = context.get(TikaConfig.class);
if (config == null) {
--
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].