You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/10 03:05:14 UTC
[1/3] tika git commit: TIKA-2173 - first steps. Need to integrate
parameter configuration into 2.x before I can do the rest
Repository: tika
Updated Branches:
refs/heads/2.x bcd59cee7 -> ab009aeb7
TIKA-2173 - first steps. Need to integrate parameter configuration into 2.x before I can do the rest
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7422218e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7422218e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7422218e
Branch: refs/heads/2.x
Commit: 7422218eb6e76a4f5744cd85c53d08e629fe7976
Parents: bcd59ce
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 9 12:49:32 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 9 12:49:32 2016 -0500
----------------------------------------------------------------------
.../src/test/java/org/apache/tika/TikaTest.java | 9 +++++
.../apache/tika/parser/pdf/AccessChecker.java | 19 ++++++++++
.../org/apache/tika/parser/pdf/PDFParser.java | 37 +++++++++++++++++++-
.../apache/tika/parser/pdf/PDFParserConfig.java | 16 ++++++---
.../apache/tika/parser/pdf/PDFParserTest.java | 27 ++++++++++++++
.../tika/parser/pdf/tika-inline-config.xml | 20 +++++++++++
6 files changed, 123 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index cfed800..0f6303e 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -217,6 +217,15 @@ public abstract class TikaTest {
}
}
+    /**
+     * Parses a test document with a {@link RecursiveParserWrapper} around the given parser
+     * and returns the metadata list collected by the wrapper.
+     *
+     * @param filePath     name of the resource, resolved under <code>/test-documents/</code>
+     * @param parserToWrap parser that the recursive wrapper delegates to
+     * @return the list returned by {@link RecursiveParserWrapper#getMetadata()} after the parse
+     * @throws Exception if the resource cannot be read or the parse fails
+     */
+    protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception {
+        //XML handlers; -1 is presumably "no write limit" -- TODO confirm against BasicContentHandlerFactory
+        BasicContentHandlerFactory handlerFactory =
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
+        RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap, handlerFactory);
+        String resourcePath = "/test-documents/" + filePath;
+        try (InputStream stream = getResourceAsStream(resourcePath)) {
+            recursiveParser.parse(stream, new DefaultHandler(), new Metadata(), new ParseContext());
+        }
+        return recursiveParser.getMetadata();
+    }
+
/**
* Basic text extraction.
* <p>
http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
index 775e590..0bb6590 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
@@ -78,4 +78,23 @@ public class AccessChecker implements Serializable {
throw new AccessPermissionException("Content extraction is not allowed.");
}
}
+
+    /**
+     * Two checkers are equal when they are of exactly the same class and agree on both
+     * the <code>needToCheck</code> and <code>allowAccessibility</code> flags.
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (o == this) {
+            return true;
+        }
+        if (o == null || o.getClass() != getClass()) {
+            return false;
+        }
+        AccessChecker other = (AccessChecker) o;
+        return needToCheck == other.needToCheck
+                && allowAccessibility == other.allowAccessibility;
+    }
+
+    /**
+     * Hash built from the same two flags that {@link #equals(Object)} compares.
+     */
+    @Override
+    public int hashCode() {
+        int needToCheckBit = needToCheck ? 1 : 0;
+        int allowAccessibilityBit = allowAccessibility ? 1 : 0;
+        return 31 * needToCheckBit + allowAccessibilityBit;
+    }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 763c82b..185af6a 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -613,10 +613,45 @@ public class PDFParser extends AbstractParser {
*
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
- public void setSortByPosition(boolean v) {
+ void setSortByPosition(boolean v) {
defaultConfig.setSortByPosition(v);
}
+/* void setOcrStrategy(String ocrStrategyString) {
+ defaultConfig.setOcrStrategy(ocrStrategyString);
+ }
+
+ void setOcrImageType(String imageType) {
+ defaultConfig.setOcrImageType(imageType);
+ }
+
+ void setOcrDPI(int dpi) {
+ defaultConfig.setOcrDPI(dpi);
+ }
+*/
+ /**
+ * Forwards the flag to {@link PDFParserConfig#setExtractInlineImages(boolean)} on the default config.
+ * NOTE(review): presumably called by parameter-based configuration (the param of the same
+ * name in tika-inline-config.xml) -- confirm once that mechanism is integrated.
+ */
+ void setExtractInlineImages(boolean extractInlineImages) {
+ defaultConfig.setExtractInlineImages(extractInlineImages);
+ }
+
+ /**
+ * Forwards the flag to {@link PDFParserConfig#setCatchIntermediateIOExceptions(boolean)}
+ * on the default config. Note that this setter's name omits the "IO" of the config setter.
+ */
+ void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) {
+ defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions);
+ }
+
+ /**
+ * Forwards the flag to {@link PDFParserConfig#setExtractAcroFormContent(boolean)} on the default config.
+ */
+ void setExtractAcroFormContent(boolean extractAcroFormContent) {
+ defaultConfig.setExtractAcroFormContent(extractAcroFormContent);
+ }
+
+ /**
+ * Forwards the flag to {@link PDFParserConfig#setIfXFAExtractOnlyXFA(boolean)} on the default config.
+ */
+ void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
+ defaultConfig.setIfXFAExtractOnlyXFA(ifXFAExtractOnlyXFA);
+ }
+
+ /**
+ * Replaces the default config's access checker with a new {@link AccessChecker}
+ * constructed from the given flag.
+ */
+ void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) {
+ defaultConfig.setAccessChecker(new AccessChecker(allowExtractionForAccessibility));
+ }
+
+ /**
+ * Forwards the flag to {@link PDFParserConfig#setExtractUniqueInlineImagesOnly(boolean)}
+ * on the default config.
+ */
+ void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
+ defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly);
+ }
//can return null!
private Document loadDOM(PDMetadata pdMetadata, ParseContext context) {
http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 296b191..cf43864 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -113,7 +113,7 @@ public class PDFParserConfig implements Serializable {
//with a streams. If this is set to true, Tika's
//parser catches these exceptions, reports them in the metadata
//and then throws the first stored exception after the parse has completed.
- private boolean isCatchIntermediateIOExceptions = true;
+ private boolean catchIntermediateIOExceptions = true;
public PDFParserConfig() {
init(this.getClass().getResourceAsStream("PDFParser.properties"));
@@ -427,12 +427,20 @@ public class PDFParserConfig implements Serializable {
/**
* See {@link #setCatchIntermediateIOExceptions(boolean)}
* @return whether or not to catch IOExceptions
+ * @deprecated use {@link #getCatchIntermediateIOExceptions()}
*/
public boolean isCatchIntermediateIOExceptions() {
- return isCatchIntermediateIOExceptions;
+ return catchIntermediateIOExceptions;
}
/**
+ * See {@link #setCatchIntermediateIOExceptions(boolean)}
+ * @return whether or not to catch IOExceptions
+ */
+ public boolean getCatchIntermediateIOExceptions() {
+ return catchIntermediateIOExceptions;
+ }
+ /**
* The PDFBox parser will throw an IOException if there is
* a problem with a stream. If this is set to <code>true</code>,
* Tika's PDFParser will catch these exceptions and try to parse
@@ -441,7 +449,7 @@ public class PDFParserConfig implements Serializable {
* @param catchIntermediateIOExceptions
*/
public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) {
- isCatchIntermediateIOExceptions = catchIntermediateIOExceptions;
+ this.catchIntermediateIOExceptions = catchIntermediateIOExceptions;
}
/**
@@ -608,7 +616,7 @@ public class PDFParserConfig implements Serializable {
", ocrImageType=" + ocrImageType +
", ocrImageFormatName='" + ocrImageFormatName + '\'' +
", accessChecker=" + accessChecker +
- ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions +
+ ", isCatchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
'}';
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index dda0712..2292157 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.pdf;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -34,6 +35,7 @@ import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
@@ -48,6 +50,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
@@ -59,6 +62,7 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.junit.AfterClass;
import org.junit.BeforeClass;
+import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -1220,6 +1224,29 @@ public class PDFParserTest extends TikaTest {
assertContains("Tika - Content", content);
}
+    /**
+     * Loads tika-inline-config.xml, parses testOCR.pdf with the resulting parser and then
+     * checks that the PDFParser registered for application/pdf carries the values
+     * given in the config's params. Ignored until parameter configuration is available.
+     */
+    @Test
+    @Ignore("until we add parameter mods")
+    public void testConfiguringMoreParams() throws Exception {
+        try (InputStream configIs = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-inline-config.xml")) {
+            assertNotNull("tika-inline-config.xml must be on the test classpath", configIs);
+            TikaConfig tikaConfig = new TikaConfig(configIs);
+            AutoDetectParser p = new AutoDetectParser(tikaConfig);
+            //make absolutely certain the functionality works!
+            List<Metadata> metadata = getRecursiveMetadata("testOCR.pdf", p);
+            assertEquals(2, metadata.size());
+            Map<MediaType, Parser> parsers = p.getParsers();
+            Parser composite = parsers.get(MediaType.application("pdf"));
+            Parser pdfParser = ((CompositeParser)composite).getParsers().get(MediaType.application("pdf"));
+            assertTrue(pdfParser instanceof PDFParser);
+            PDFParserConfig pdfParserConfig = ((PDFParser)pdfParser).getPDFParserConfig();
+            assertEquals(new AccessChecker(true), pdfParserConfig.getAccessChecker());
+            //boolean expectations read better (and fail more clearly) as assertTrue/assertFalse
+            assertTrue("extractInlineImages", pdfParserConfig.getExtractInlineImages());
+            assertFalse("extractUniqueInlineImagesOnly", pdfParserConfig.getExtractUniqueInlineImagesOnly());
+            //assertEquals(314159, pdfParserConfig.getOcrDPI());
+            assertFalse("catchIntermediateIOExceptions", pdfParserConfig.getCatchIntermediateIOExceptions());
+        }
+    }
+
private void assertException(String path, Parser parser, ParseContext context, Class expected) {
boolean noEx = false;
InputStream is = getResourceAsStream(path);
http://git-wip-us.apache.org/repos/asf/tika/blob/7422218e/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml b/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
new file mode 100644
index 0000000..9436604
--- /dev/null
+++ b/tika-test-resources/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <param name="extractInlineImages" type="bool">true</param>
+        <param name="allowExtractionForAccessibility" type="bool">true</param>
+        <!-- declared once only; the original listed this param twice with the same value -->
+        <param name="catchIntermediateExceptions" type="bool">false</param>
+        <param name="extractUniqueInlineImagesOnly" type="bool">false</param>
+        <param name="ocrDPI" type="int">314159</param>
+        <!-- we really should throw an exception for this!! -->
+        <param name="someRandomThingOrOther" type="bool">true</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
[3/3] tika git commit: TIKA-2159 -- first step
Posted by ta...@apache.org.
TIKA-2159 -- first step
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ab009aeb
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ab009aeb
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ab009aeb
Branch: refs/heads/2.x
Commit: ab009aeb7bb9966972d78827136567e90cfae67c
Parents: f2661f9
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 9 22:05:02 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 9 22:05:02 2016 -0500
----------------------------------------------------------------------
.../tika/extractor/EmbeddedDocumentUtil.java | 168 +++++++++++++++++++
.../tika/parser/jdbc/AbstractDBParser.java | 7 -
.../tika/parser/jdbc/JDBCTableReader.java | 52 ++----
.../tika/parser/pdf/AbstractPDF2XHTML.java | 64 ++++---
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 31 ++--
.../org/apache/tika/parser/pdf/PDFParser.java | 7 +-
.../parser/apple/AppleSingleFileParser.java | 8 +-
.../org/apache/tika/parser/mbox/MboxParser.java | 5 +-
.../tika/parser/mbox/OutlookPSTParser.java | 5 +-
.../microsoft/AbstractPOIFSExtractor.java | 46 ++---
.../tika/parser/microsoft/HSLFExtractor.java | 42 ++---
.../tika/parser/microsoft/OfficeParser.java | 11 +-
.../tika/parser/microsoft/TNEFParser.java | 11 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 12 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 9 +-
.../tika/parser/rtf/RTFEmbObjHandler.java | 61 +------
.../tika/parser/pkg/CompressorParser.java | 7 +-
.../apache/tika/parser/pkg/PackageParser.java | 6 +-
.../org/apache/tika/parser/pkg/RarParser.java | 13 +-
.../tika/parser/xml/FictionBookParser.java | 21 +--
.../tika/parser/mail/MailContentHandler.java | 30 +---
.../tika/parser/mail/RFC822ParserTest.java | 25 ++-
22 files changed, 337 insertions(+), 304 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
new file mode 100644
index 0000000..3ceba90
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.utils.ExceptionUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class to handle common issues with embedded documents.
+ * <p/>
+ * Use statically if all that is needed is getting the EmbeddedDocumentExtractor.
+ * Otherwise, instantiate an instance.
+ * <p/>
+ * Note: This is not thread safe. Make sure to instantiate one per thread.
+ */
+public class EmbeddedDocumentUtil implements Serializable {
+
+    //NOTE(review): this class is Serializable but holds an EmbeddedDocumentExtractor;
+    //confirm that the extractor implementations in use are serializable as well.
+    private final ParseContext context;
+    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+    //these are lazily initialized and can be null
+    private TikaConfig tikaConfig;
+    private MimeTypes mimeTypes;
+    private Detector detector;
+
+    /**
+     * Binds this utility to a parse context and resolves the embedded document
+     * extractor once, via {@link #getEmbeddedDocumentExtractor(ParseContext)}.
+     *
+     * @param context context consulted for the extractor, config, detector,
+     *                mime types and password provider
+     */
+    public EmbeddedDocumentUtil(ParseContext context) {
+        this.context = context;
+        this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context);
+    }
+
+    /**
+     * Returns the {@link EmbeddedDocumentExtractor} stored in the context, or a new
+     * {@link ParsingEmbeddedDocumentExtractor} built on the context if none is stored.
+     *
+     * @param context context to look the extractor up in
+     * @return the extractor to use; never <code>null</code>
+     */
+    public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
+        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
+        if (extractor == null) {
+            extractor = new ParsingEmbeddedDocumentExtractor(context);
+        }
+        return extractor;
+    }
+
+    /**
+     * @return the {@link PasswordProvider} stored in the context, as returned by the
+     * context lookup (no fallback is applied here)
+     */
+    public PasswordProvider getPasswordProvider() {
+        return context.get(PasswordProvider.class);
+    }
+
+    /**
+     * @return the {@link Detector} from the context if one is stored there, otherwise
+     * the detector of {@link #getTikaConfig()}; the result is cached
+     */
+    public Detector getDetector() {
+        //be as lazy as possible and cache the detector
+        if (detector == null) {
+            detector = context.get(Detector.class);
+            if (detector == null) {
+                detector = getTikaConfig().getDetector();
+            }
+        }
+        return detector;
+    }
+
+    /**
+     * @return the {@link MimeTypes} from the context if one is stored there, otherwise
+     * the mime repository of {@link #getTikaConfig()}; the result is cached
+     */
+    public MimeTypes getMimeTypes() {
+        //be as lazy as possible and cache the mimeTypes
+        if (mimeTypes == null) {
+            mimeTypes = context.get(MimeTypes.class);
+            if (mimeTypes == null) {
+                mimeTypes = getTikaConfig().getMimeRepository();
+            }
+        }
+        return mimeTypes;
+    }
+
+    /**
+     * @return the {@link TikaConfig} from the context if one is stored there, otherwise
+     * {@link TikaConfig#getDefaultConfig()}; the result is cached
+     */
+    public TikaConfig getTikaConfig() {
+        //be as lazy as possible and cache the TikaConfig
+        if (tikaConfig == null) {
+            tikaConfig = context.get(TikaConfig.class);
+            if (tikaConfig == null) {
+                tikaConfig = TikaConfig.getDefaultConfig();
+            }
+        }
+        return tikaConfig;
+    }
+
+    /**
+     * Works out a file extension for an embedded document.
+     * <p/>
+     * The mime type named by the Content-Type entry of the metadata is looked up first.
+     * If there is no such entry, or the lookup fails, the detector is run on the stream.
+     * When detection was needed and produced a type, the Content-Type entry of the
+     * metadata is overwritten with it.
+     * <p/>
+     * The mime types and detector come from {@link #getMimeTypes()} and
+     * {@link #getDetector()}, so instances supplied through the ParseContext are
+     * honored and the cached values are reused.
+     *
+     * @param is       stream of the embedded document; only read if detection is needed
+     * @param metadata metadata of the embedded document; may be updated as described above
+     * @return the extension reported by the resolved mime type, or <code>".bin"</code>
+     * if no mime type could be resolved
+     */
+    public String getExtension(TikaInputStream is, Metadata metadata) {
+        String mimeString = metadata.get(Metadata.CONTENT_TYPE);
+        MimeTypes types = getMimeTypes();
+        MimeType mimeType = null;
+        boolean detected = false;
+        if (mimeString != null) {
+            try {
+                mimeType = types.forName(mimeString);
+            } catch (MimeTypeException e) {
+                //swallow: fall back to detection below
+            }
+        }
+        if (mimeType == null) {
+            try {
+                MediaType mediaType = getDetector().detect(is, metadata);
+                mimeType = types.forName(mediaType.toString());
+                detected = true;
+                //NOTE(review): presumably rewinds to the mark left by the detector -- confirm
+                is.reset();
+            } catch (IOException | MimeTypeException e) {
+                //swallow: best effort only, the caller gets ".bin" or the detected type
+            }
+        }
+        if (mimeType != null) {
+            if (detected) {
+                //set or correct the mime type
+                metadata.set(Metadata.CONTENT_TYPE, mimeType.toString());
+            }
+            return mimeType.getExtension();
+        }
+        return ".bin";
+    }
+
+    /**
+     * Same lookup as {@link #getTikaConfig()}, which this method now delegates to so that
+     * there is a single, cached source for the config. Kept for callers of this name.
+     *
+     * @return the config from the context, or the default config if none is stored
+     */
+    public TikaConfig getConfig() {
+        return getTikaConfig();
+    }
+
+    /**
+     * Adds the filtered stack trace of <code>t</code> to the metadata under
+     * {@link TikaCoreProperties#TIKA_META_EXCEPTION_WARNING}.
+     *
+     * @param t throwable to record
+     * @param m metadata to add the stack trace to
+     */
+    public static void recordException(Throwable t, Metadata m) {
+        String ex = ExceptionUtils.getFilteredStackTrace(t);
+        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ex);
+    }
+
+    /**
+     * @param m metadata of the embedded document
+     * @return the answer of the underlying extractor's <code>shouldParseEmbedded</code>
+     */
+    public boolean shouldParseEmbedded(Metadata m) {
+        return getEmbeddedDocumentExtractor().shouldParseEmbedded(m);
+    }
+
+    //the extractor resolved in the constructor
+    private EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
+        return embeddedDocumentExtractor;
+    }
+
+    /**
+     * Hands the embedded document to the underlying extractor unchanged.
+     *
+     * @param inputStream stream of the embedded document
+     * @param handler     handler that receives the extractor's output
+     * @param metadata    metadata of the embedded document
+     * @param outputHtml  passed through to the extractor
+     * @throws IOException  if the extractor reports an I/O problem
+     * @throws SAXException if the extractor reports a SAX problem
+     */
+    public void parseEmbedded(InputStream inputStream, ContentHandler handler,
+                              Metadata metadata, boolean outputHtml) throws IOException, SAXException {
+        embeddedDocumentExtractor.parseEmbedded(inputStream, handler, metadata, outputHtml);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
index bba14a0..d613dc5 100644
--- a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
+++ b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
@@ -26,8 +26,6 @@ import java.util.Set;
import org.apache.commons.io.IOExceptionWithCause;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.metadata.Database;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -46,11 +44,6 @@ abstract class AbstractDBParser extends AbstractParser {
private Connection connection;
- protected static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
- return context.get(EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
- }
-
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return null;
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
index ec2470f..f6691e0 100644
--- a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
+++ b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
@@ -21,7 +21,6 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.ByteArrayInputStream;
import java.io.IOException;
-import java.io.InputStream;
import java.sql.Blob;
import java.sql.Clob;
import java.sql.Connection;
@@ -38,14 +37,11 @@ import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Database;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -65,14 +61,11 @@ class JDBCTableReader {
int maxClobLength = 1000000;
ResultSet results = null;
int rows = 0;
- private TikaConfig tikaConfig = null;
- private Detector detector = null;
- private MimeTypes mimeTypes = null;
-
+ private final EmbeddedDocumentUtil embeddedDocumentUtil;
public JDBCTableReader(Connection connection, String tableName, ParseContext context) {
this.connection = connection;
this.tableName = tableName;
- this.tikaConfig = context.get(TikaConfig.class);
+ embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
}
public boolean nextRow(ContentHandler handler, ParseContext context) throws IOException, SAXException {
@@ -204,8 +197,9 @@ class JDBCTableReader {
//is there a more efficient way to go from a Reader to an InputStream?
String s = clob.getSubString(0, readSize);
- EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context);
- ex.parseEmbedded(new ByteArrayInputStream(s.getBytes(UTF_8)), handler, m, true);
+ if (embeddedDocumentUtil.shouldParseEmbedded(m)) {
+ embeddedDocumentUtil.parseEmbedded(new ByteArrayInputStream(s.getBytes(UTF_8)), handler, m, true);
+ }
}
protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex,
@@ -216,8 +210,7 @@ class JDBCTableReader {
m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum));
m.set(Database.PREFIX + "IS_BLOB", "true");
Blob blob = null;
- InputStream is = null;
- EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context);
+ TikaInputStream is = null;
try {
blob = getBlob(resultSet, columnIndex, m);
if (blob == null) {
@@ -229,20 +222,14 @@ class JDBCTableReader {
((AttributesImpl) attrs).addAttribute("", "column_name", "column_name", "CDATA", columnName);
((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA", Integer.toString(rowNum));
handler.startElement("", "span", "span", attrs);
- MediaType mediaType = getDetector().detect(is, new Metadata());
- String extension = "";
- try {
- MimeType mimeType = getMimeTypes().forName(mediaType.toString());
- m.set(Metadata.CONTENT_TYPE, mimeType.toString());
- extension = mimeType.getExtension();
- } catch (MimeTypeException e) {
- //swallow
- }
+ String extension = embeddedDocumentUtil.getExtension(is, m);
+
m.set(TikaMetadataKeys.RESOURCE_NAME_KEY,
//just in case something screwy is going on with the column name
FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + extension)));
-
- ex.parseEmbedded(is, handler, m, true);
+ if (embeddedDocumentUtil.shouldParseEmbedded(m)) {
+ embeddedDocumentUtil.parseEmbedded(is, handler, m, true);
+ }
} finally {
if (blob != null) {
@@ -315,24 +302,15 @@ class JDBCTableReader {
protected TikaConfig getTikaConfig() {
- if (tikaConfig == null) {
- tikaConfig = TikaConfig.getDefaultConfig();
- }
- return tikaConfig;
+ return embeddedDocumentUtil.getTikaConfig();
}
protected Detector getDetector() {
- if (detector != null) return detector;
-
- detector = getTikaConfig().getDetector();
- return detector;
+ return embeddedDocumentUtil.getDetector();
}
protected MimeTypes getMimeTypes() {
- if (mimeTypes != null) return mimeTypes;
-
- mimeTypes = getTikaConfig().getMimeRepository();
- return mimeTypes;
+ return embeddedDocumentUtil.getMimeTypes();
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 44e7032..c175138 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -66,7 +66,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -101,7 +101,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
final PDDocument pdDocument;
final XHTMLContentHandler xhtml;
private final ParseContext context;
- private final Metadata metadata;
+ final Metadata metadata;
+ final EmbeddedDocumentExtractor embeddedDocumentExtractor;
final PDFParserConfig config;
private int pageIndex = 0;
@@ -113,6 +114,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
this.context = context;
this.metadata = metadata;
this.config = config;
+ embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
@Override
@@ -125,15 +127,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
writeParagraphStart();
}
- EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
- EmbeddedDocumentExtractor extractor =
- context.get(EmbeddedDocumentExtractor.class);
- if (extractor == null) {
- extractor = new ParsingEmbeddedDocumentExtractor(context);
- }
- return extractor;
- }
-
private void extractEmbeddedDocuments(PDDocument document)
throws IOException, SAXException, TikaException {
PDDocumentNameDictionary namesDictionary =
@@ -170,31 +163,28 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return;
}
- EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
PDComplexFileSpecification spec = ent.getValue();
- extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
+ extractMultiOSPDEmbeddedFiles(ent.getKey(), spec);
}
}
private void extractMultiOSPDEmbeddedFiles(String displayName,
- PDComplexFileSpecification spec,
- EmbeddedDocumentExtractor extractor) throws IOException,
+ PDComplexFileSpecification spec) throws IOException,
SAXException, TikaException {
if (spec == null) {
return;
}
//current strategy is to pull all, not just first non-null
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile());
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac());
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos());
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix());
}
private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
- String fileName, PDEmbeddedFile file,
- EmbeddedDocumentExtractor extractor)
+ String fileName, PDEmbeddedFile file)
throws SAXException, IOException, TikaException {
if (file == null) {
@@ -205,22 +195,30 @@ class AbstractPDF2XHTML extends PDFTextStripper {
fileName = (fileName == null) ? displayName : fileName;
// TODO: other metadata?
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
- metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
- metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
- metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+ embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
- metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
+ embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
- if (extractor.shouldParseEmbedded(metadata)) {
+ if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
TikaInputStream stream = null;
try {
- stream = TikaInputStream.get(file.createInputStream());
- extractor.parseEmbedded(
+
+ InputStream rawStream = null;
+ try {
+ rawStream = file.createInputStream();
+ } catch (IOException e) {
+ EmbeddedDocumentUtil.recordException(e, metadata);
+ return;
+ }
+ stream = TikaInputStream.get(rawStream);
+ embeddedDocumentExtractor.parseEmbedded(
stream,
new EmbeddedContentHandler(xhtml),
- metadata, false);
+ embeddedMetadata, false);
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
@@ -291,14 +289,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
protected void endPage(PDPage page) throws IOException {
try {
- EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
for (PDAnnotation annotation : page.getAnnotations()) {
if (annotation instanceof PDAnnotationFileAttachment) {
PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
try {
- extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
+ extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec);
} catch (SAXException e) {
throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
} catch (TikaException e) {
@@ -457,6 +454,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return;
} catch (XMLStreamException |IOException e) {
//if there was an xml parse exception in xfa, try the AcroForm
+ EmbeddedDocumentUtil.recordException(e, metadata);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index ac9823e..0ae8137 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -46,7 +46,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
@@ -188,15 +188,15 @@ class PDF2XHTML extends AbstractPDF2XHTML {
PDImageXObject image = (PDImageXObject) object;
- Metadata metadata = new Metadata();
+ Metadata embeddedMetadata = new Metadata();
String extension = image.getSuffix();
- if (extension == null) {
- metadata.set(Metadata.CONTENT_TYPE, "image/png");
+ if (extension == null || extension.equals("png")) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
extension = "png";
} else if (extension.equals("jpg")) {
- metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
} else if (extension.equals("tiff")) {
- metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
extension = "tif";
} else {
//TODO: determine if we need to add more image types
@@ -208,7 +208,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
imageNumber = inlineImageCounter++;
}
String fileName = "image" + imageNumber + "."+extension;
- metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+ embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
// Output the img tag
AttributesImpl attr = new AttributesImpl();
@@ -226,20 +226,23 @@ class PDF2XHTML extends AbstractPDF2XHTML {
processedInlineImages.put(cosStream, imageNumber);
}
- metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
- EmbeddedDocumentExtractor extractor =
- getEmbeddedDocumentExtractor();
- if (extractor.shouldParseEmbedded(metadata)) {
+ if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
try {
//TODO: handle image.getMetadata()?
- writeToBuffer(image, extension, buffer);
- extractor.parseEmbedded(
+ try {
+ writeToBuffer(image, extension, buffer);
+ } catch (IOException e) {
+ EmbeddedDocumentUtil.recordException(e, metadata);
+ return;
+ }
+ embeddedDocumentExtractor.parseEmbedded(
new ByteArrayInputStream(buffer.toByteArray()),
new EmbeddedContentHandler(xhtml),
- metadata, false);
+ embeddedMetadata, false);
} catch (IOException e) {
handleCatchableIOE(e);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 185af6a..a2e2a74 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -46,6 +46,7 @@ import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.AccessPermissions;
import org.apache.tika.metadata.Metadata;
@@ -207,7 +208,7 @@ public class PDFParser extends AbstractParser {
//now go for the XMP
- Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), context);
+ Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
XMPMetadata xmp = null;
if (dom != null) {
@@ -654,7 +655,7 @@ public class PDFParser extends AbstractParser {
}
//can return null!
- private Document loadDOM(PDMetadata pdMetadata, ParseContext context) {
+ private Document loadDOM(PDMetadata pdMetadata, Metadata parentMetadata, ParseContext context) {
if (pdMetadata == null) {
return null;
}
@@ -663,7 +664,7 @@ public class PDFParser extends AbstractParser {
documentBuilder.setErrorHandler((ErrorHandler)null);
return documentBuilder.parse(is);
} catch (IOException|SAXException|TikaException e) {
- //swallow
+ EmbeddedDocumentUtil.recordException(e, parentMetadata);
}
return null;
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index 0f3c044..fa41554 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -29,7 +29,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -79,11 +79,7 @@ public class AppleSingleFileParser extends AbstractParser {
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
-
- if (ex == null) {
- ex = new ParsingEmbeddedDocumentExtractor(context);
- }
+ EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
short numEntries = readThroughNumEntries(stream);
long bytesRead = 26;
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index 83e26da..a82d74e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -37,7 +37,7 @@ import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -81,8 +81,7 @@ public class MboxParser extends AbstractParser {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, TikaException, SAXException {
- EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
+ EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
String charsetName = "windows-1252";
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index 5883bd5..dee17db 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -32,7 +32,7 @@ import com.pff.PSTFolder;
import com.pff.PSTMessage;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -69,8 +69,7 @@ public class OutlookPSTParser extends AbstractParser {
throws IOException, SAXException, TikaException {
// Use the delegate parser to parse the contained document
- EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
+ EmbeddedDocumentExtractor embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index a71be5b..725ce8b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -33,8 +33,7 @@ import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.DetectorProxy;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -50,11 +49,8 @@ import org.xml.sax.SAXException;
abstract class AbstractPOIFSExtractor {
private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
- private final EmbeddedDocumentExtractor extractor;
+ private final EmbeddedDocumentUtil embeddedDocumentUtil;
private PasswordProvider passwordProvider;
- private TikaConfig tikaConfig;
- private MimeTypes mimeTypes;
- private Detector detector;
private Metadata metadata;
private final Detector zipDetectorProxy;
@@ -63,42 +59,28 @@ abstract class AbstractPOIFSExtractor {
}
protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) {
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
-
- if (ex == null) {
- this.extractor = new ParsingEmbeddedDocumentExtractor(context);
- } else {
- this.extractor = ex;
- }
+ embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
this.passwordProvider = context.get(PasswordProvider.class);
- this.tikaConfig = context.get(TikaConfig.class);
- this.mimeTypes = context.get(MimeTypes.class);
- this.detector = context.get(Detector.class);
this.metadata = metadata;
this.zipDetectorProxy = new DetectorProxy("org.apache.tika.parser.pkg.ZipContainerDetector", getClass().getClassLoader());
}
// Note - these cache, but avoid creating the default TikaConfig if not needed
protected TikaConfig getTikaConfig() {
- if (tikaConfig == null) {
- tikaConfig = TikaConfig.getDefaultConfig();
- }
- return tikaConfig;
+ return embeddedDocumentUtil.getTikaConfig();
}
protected Detector getDetector() {
- if (detector != null) return detector;
-
- detector = getTikaConfig().getDetector();
- return detector;
+ return embeddedDocumentUtil.getDetector();
}
+ /**
+ * @deprecated use {@link #embeddedDocumentUtil}
+ * @return mimetypes
+ */
protected MimeTypes getMimeTypes() {
- if (mimeTypes != null) return mimeTypes;
-
- mimeTypes = getTikaConfig().getMimeRepository();
- return mimeTypes;
+ return embeddedDocumentUtil.getMimeTypes();
}
/**
@@ -139,8 +121,8 @@ abstract class AbstractPOIFSExtractor {
metadata.set(Metadata.CONTENT_TYPE, mediaType);
}
- if (extractor.shouldParseEmbedded(metadata)) {
- extractor.parseEmbedded(resource, xhtml, metadata, outputHtml);
+ if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
+ embeddedDocumentUtil.parseEmbedded(resource, xhtml, metadata, outputHtml);
}
} finally {
resource.close();
@@ -249,7 +231,7 @@ abstract class AbstractPOIFSExtractor {
}
// Should we parse it?
- if (extractor.shouldParseEmbedded(metadata)) {
+ if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
if (embedded == null) {
// Make a TikaInputStream that just
// passes the root directory of the
@@ -258,7 +240,7 @@ abstract class AbstractPOIFSExtractor {
embedded = TikaInputStream.get(new byte[0]);
embedded.setOpenContainer(dir);
}
- extractor.parseEmbedded(embedded, xhtml, metadata, true);
+ embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
}
} finally {
if (embedded != null) {
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index ce0ede7..64ec813 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -17,12 +17,12 @@
package org.apache.tika.parser.microsoft;
import java.io.IOException;
+import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.common.usermodel.Hyperlink;
-import org.apache.poi.hslf.exceptions.HSLFException;
import org.apache.poi.hslf.model.Comment;
import org.apache.poi.hslf.model.HeadersFooters;
import org.apache.poi.hslf.model.OLEShape;
@@ -41,6 +41,7 @@ import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TaggedIOException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -51,8 +52,11 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
public class HSLFExtractor extends AbstractPOIFSExtractor {
- public HSLFExtractor(ParseContext context) {
+ private final Metadata metadata;
+
+ public HSLFExtractor(ParseContext context, Metadata metadata) {
super(context);
+ this.metadata = metadata;
}
protected void parse(
@@ -330,17 +334,17 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
break;
}
- try (TikaInputStream picIs = TikaInputStream.get(pic.getData())){
+ byte[] data = null;
+ try {
+ data = pic.getData();
+ } catch (Exception e) {
+ EmbeddedDocumentUtil.recordException(e, metadata);
+ continue;
+ }
+ try (TikaInputStream picIs = TikaInputStream.get(data)){
handleEmbeddedResource(
picIs, null, null,
mediaType, xhtml, false);
- } catch (HSLFException e) {
- if (e.getMessage() != null && e.getMessage().contains("incorrect data check")) {
- //TIKA-2157
- //swallow
- } else {
- throw e;
- }
}
}
}
@@ -378,8 +382,14 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
attributes.addAttribute("", "id", "id", "CDATA", objID);
xhtml.startElement("div", attributes);
xhtml.endElement("div");
-
- try (TikaInputStream stream = TikaInputStream.get(data.getData())) {
+ InputStream dataStream = null;
+ try {
+ dataStream = data.getData();
+ } catch (Exception e) {
+ EmbeddedDocumentUtil.recordException(e, metadata);
+ continue;
+ }
+ try (TikaInputStream stream = TikaInputStream.get(dataStream)) {
String mediaType = null;
if ("Excel.Chart.8".equals(oleShape.getProgID())) {
mediaType = "application/vnd.ms-excel";
@@ -397,13 +407,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
mediaType, xhtml, false);
}
} catch (TaggedIOException e) {
- if ("incorrect data check".equals(e.getMessage())) {
- //TIKA-2130
- //some embedded objects can't be uncompressed correctly
- //swallow
- } else {
- throw e;
- }
+ EmbeddedDocumentUtil.recordException(e, metadata);
}
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index f7f1c4a..5218dfa 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -43,7 +43,7 @@ import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -130,11 +130,8 @@ public class OfficeParser extends AbstractParser {
parse(root, context, metadata, xhtml);
//now try to get macros
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
- if (ex == null) {
- ex = new ParsingEmbeddedDocumentExtractor(context);
- }
- extractMacros(root.getNFileSystem(), xhtml, ex);
+ extractMacros(root.getNFileSystem(), xhtml,
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
} finally {
IOUtils.closeQuietly(mustCloseFs);
}
@@ -169,7 +166,7 @@ public class OfficeParser extends AbstractParser {
new WordExtractor(context, metadata).parse(root, xhtml);
break;
case POWERPOINT:
- new HSLFExtractor(context).parse(root, xhtml);
+ new HSLFExtractor(context, metadata).parse(root, xhtml);
break;
case WORKBOOK:
case XLR:
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
index 879546b..484f0c5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
@@ -30,7 +30,7 @@ import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
import org.apache.poi.hsmf.datatypes.MAPIProperty;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -68,13 +68,8 @@ public class TNEFParser extends AbstractParser {
throws IOException, SAXException, TikaException {
// We work by recursing, so get the appropriate bits
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
- EmbeddedDocumentExtractor embeddedExtractor;
- if (ex == null) {
- embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
- } else {
- embeddedExtractor = ex;
- }
+ EmbeddedDocumentExtractor embeddedExtractor =
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
// Ask POI to process the file for us
HMEFMessage msg = new HMEFMessage(stream);
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 1f16a3c..f9ba8a6 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -39,7 +39,7 @@ import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -75,15 +75,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
this.extractor = extractor;
-
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
-
- if (ex == null) {
- embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
- } else {
- embeddedExtractor = ex;
- }
-
+ embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
/**
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 8334c67..364d81e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -28,7 +28,7 @@ import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -87,16 +87,13 @@ public class WordMLParser extends AbstractXML2003Parser {
Metadata metadata, ParseContext context) {
EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
- if (ex == null) {
- ex = new ParsingEmbeddedDocumentExtractor(context);
- }
-
return new TeeContentHandler(
super.getContentHandler(ch, metadata, context),
new WordMLHandler(ch),
new HyperlinkHandler(ch,
WORD_ML_URL),
- new PictHandler(ch, ex));
+ new PictHandler(ch,
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)));
}
@Override
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
index 1334906..dbdc842 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
@@ -23,18 +23,11 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.xml.sax.ContentHandler;
@@ -64,9 +57,7 @@ class RTFEmbObjHandler {
private static final String EMPTY_STRING = "";
private final ContentHandler handler;
-
-
- private final ParseContext context;
+ private final EmbeddedDocumentUtil embeddedDocumentUtil;
private final ByteArrayOutputStream os;
//high hex cached for writing hexpair chars (data)
private int hi = -1;
@@ -81,7 +72,7 @@ class RTFEmbObjHandler {
private EMB_STATE state = EMB_STATE.NADA;
protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
this.handler = handler;
- this.context = context;
+ this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
os = new ByteArrayOutputStream();
}
@@ -170,18 +161,14 @@ class RTFEmbObjHandler {
* @throws TikaException
*/
protected void handleCompletedObject() throws IOException, SAXException, TikaException {
- EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
- if (embeddedExtractor == null) {
- embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
- }
byte[] bytes = os.toByteArray();
if (state == EMB_STATE.OBJDATA) {
RTFObjDataParser objParser = new RTFObjDataParser();
try {
byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount);
- extractObj(objBytes, handler, embeddedExtractor, metadata);
+ extractObj(objBytes, handler, metadata);
} catch (IOException e) {
//swallow. If anything goes wrong, ignore.
}
@@ -192,7 +179,7 @@ class RTFEmbObjHandler {
metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath));
}
metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
- extractObj(bytes, handler, embeddedExtractor, metadata);
+ extractObj(bytes, handler, metadata);
} else if (state == EMB_STATE.NADA) {
//swallow...no start for pict or embed?!
@@ -200,8 +187,7 @@ class RTFEmbObjHandler {
reset();
}
- private void extractObj(byte[] bytes, ContentHandler handler,
- EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata)
+ private void extractObj(byte[] bytes, ContentHandler handler, Metadata metadata)
throws SAXException, IOException, TikaException {
if (bytes == null) {
@@ -210,11 +196,10 @@ class RTFEmbObjHandler {
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
- if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
TikaInputStream stream = TikaInputStream.get(bytes);
if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
- String extension = getExtension(stream, metadata);
- stream.reset();
+ String extension = embeddedDocumentUtil.getExtension(stream, metadata);
if (inObject && state == EMB_STATE.PICT) {
metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension);
metadata.set(RTFMetadata.THUMBNAIL, "true");
@@ -224,7 +209,7 @@ class RTFEmbObjHandler {
}
}
try {
- embeddedExtractor.parseEmbedded(
+ embeddedDocumentUtil.parseEmbedded(
stream,
new EmbeddedContentHandler(handler),
metadata, false);
@@ -234,34 +219,6 @@ class RTFEmbObjHandler {
}
}
- private String getExtension(TikaInputStream is, Metadata metadata) {
- String cType = metadata.get(Metadata.CONTENT_TYPE);
- TikaConfig config = getConfig();
- if (cType == null) {
- Detector detector = config.getDetector();
- try {
- MediaType mediaType = detector.detect(is, metadata);
- MimeTypes types = config.getMimeRepository();
- MimeType mime = types.forName(mediaType.toString());
- metadata.set(Metadata.CONTENT_TYPE, mime.toString());
- return mime.getExtension();
- } catch (IOException e) {
- //swallow
- } catch (MimeTypeException e) {
-
- }
- }
- return ".bin";
- }
-
- private TikaConfig getConfig() {
- TikaConfig config = context.get(TikaConfig.class);
- if (config == null) {
- config = TikaConfig.getDefaultConfig();
- }
- return config;
- }
-
/**
* reset state after each object.
* Do not reset unknown file number.
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 84b3b11..c4cd8de 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -38,7 +38,7 @@ import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
@@ -157,9 +157,8 @@ public class CompressorParser extends AbstractParser {
}
// Use the delegate parser to parse the compressed document
- EmbeddedDocumentExtractor extractor = context.get(
- EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
+ EmbeddedDocumentExtractor extractor =
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (extractor.shouldParseEmbedded(entrydata)) {
extractor.parseEmbedded(cis, xhtml, entrydata, true);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 443eb9e..370efe6 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -43,7 +43,7 @@ import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -168,9 +168,7 @@ public class PackageParser extends AbstractParser {
metadata.set(CONTENT_TYPE, type.toString());
}
// Use the delegate parser to parse the contained document
- EmbeddedDocumentExtractor extractor = context.get(
- EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
+ EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
index 99508b0..cf80e47 100644
--- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
+++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
@@ -21,10 +21,13 @@ import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
+import com.github.junrar.Archive;
+import com.github.junrar.exception.RarException;
+import com.github.junrar.rarfile.FileHeader;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -35,10 +38,6 @@ import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import com.github.junrar.Archive;
-import com.github.junrar.exception.RarException;
-import com.github.junrar.rarfile.FileHeader;
-
/**
* Parser for Rar files.
*/
@@ -61,9 +60,7 @@ public class RarParser extends AbstractParser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- EmbeddedDocumentExtractor extractor = context.get(
- EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
+ EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Archive rar = null;
try (TemporaryResources tmp = new TemporaryResources()) {
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
index e79bbfc..bf06a08 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
@@ -16,9 +16,14 @@
*/
package org.apache.tika.parser.xml;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
@@ -28,11 +33,6 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Set;
-
public class FictionBookParser extends XMLParser {
private static final long serialVersionUID = 4195954546491524374L;
@@ -43,13 +43,8 @@ public class FictionBookParser extends XMLParser {
@Override
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
-
- if (ex == null) {
- ex = new ParsingEmbeddedDocumentExtractor(context);
- }
-
- return new BinaryElementsDataHandler(ex, handler);
+ return new BinaryElementsDataHandler(
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
}
private static class BinaryElementsDataHandler extends DefaultHandler {
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 458ed01..00cc6d8 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -47,16 +47,12 @@ import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.stream.BodyDescriptor;
import org.apache.james.mime4j.stream.Field;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -141,29 +137,7 @@ class MailContentHandler implements ContentHandler {
// to handle/process the parts/attachments
// Was an EmbeddedDocumentExtractor explicitly supplied?
- this.extractor = context.get(EmbeddedDocumentExtractor.class);
-
- // If there's no EmbeddedDocumentExtractor, then try using a normal parser
- // This will ensure that the contents are made available to the user, so
- // the see the text, but without fine-grained control/extraction
- // (This also maintains backward compatibility with older versions!)
- if (this.extractor == null) {
- // If the user gave a parser, use that, if not the default
- Parser parser = context.get(AutoDetectParser.class);
- if (parser == null) {
- parser = context.get(Parser.class);
- }
- if (parser == null) {
- TikaConfig tikaConfig = context.get(TikaConfig.class);
- if (tikaConfig == null) {
- tikaConfig = TikaConfig.getDefaultConfig();
- }
- parser = new AutoDetectParser(tikaConfig.getParser());
- }
- ParseContext ctx = new ParseContext();
- ctx.set(Parser.class, parser);
- extractor = new ParsingEmbeddedDocumentExtractor(ctx);
- }
+ this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
public void body(BodyDescriptor body, InputStream is) throws MimeException,
http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 4a506be..877f40f 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -77,9 +77,11 @@ public class RFC822ParserTest extends TikaTest {
Metadata metadata = new Metadata();
InputStream stream = getStream("test-documents/testRFC822");
ContentHandler handler = mock(DefaultHandler.class);
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, new AutoDetectParser());
try {
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, context);
verify(handler).startDocument();
//just one body
verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
@@ -105,9 +107,10 @@ public class RFC822ParserTest extends TikaTest {
Metadata metadata = new Metadata();
InputStream stream = getStream("test-documents/testRFC822-multipart");
ContentHandler handler = mock(XHTMLContentHandler.class);
-
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, new AutoDetectParser());
try {
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, context);
verify(handler).startDocument();
int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
@@ -124,7 +127,7 @@ public class RFC822ParserTest extends TikaTest {
stream = getStream("test-documents/testRFC822-multipart");
handler = new BodyContentHandler();
try {
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, context);
//tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
String bodyText = handler.toString();
assertTrue(bodyText.contains("body 1"));
@@ -141,9 +144,11 @@ public class RFC822ParserTest extends TikaTest {
Metadata metadata = new Metadata();
InputStream stream = getStream("test-documents/testRFC822_quoted");
ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, new AutoDetectParser());
try {
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, context);
//tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
String bodyText = handler.toString();
assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
@@ -161,9 +166,11 @@ public class RFC822ParserTest extends TikaTest {
Metadata metadata = new Metadata();
InputStream stream = getStream("test-documents/testRFC822_base64");
ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, new AutoDetectParser());
try {
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, context);
//tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
assertContains("Here is some text, with international characters, voil\u00E0!", handler.toString());
} catch (Exception e) {
@@ -256,8 +263,10 @@ public class RFC822ParserTest extends TikaTest {
Metadata metadata = new Metadata();
InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, new AutoDetectParser());
- parser.parse(stream, handler, metadata, new ParseContext());
+ parser.parse(stream, handler, metadata, context);
assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
@@ -282,6 +291,7 @@ public class RFC822ParserTest extends TikaTest {
Parser parser = new RFC822Parser();
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
+ context.set(Parser.class, new AutoDetectParser());
InputStream stream = getStream("test-documents/testRFC822_encrypted_zip");
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, context);
@@ -337,6 +347,7 @@ public class RFC822ParserTest extends TikaTest {
Parser parser = new RFC822Parser();
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
+ context.set(Parser.class, new AutoDetectParser());
InputStream stream = getStream("test-documents/testRFC822_normal_zip");
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, context);
[2/3] tika git commit: TIKA-2174 add jpx and jp2 to Tesseract
Posted by ta...@apache.org.
TIKA-2174 add jpx and jp2 to Tesseract
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f2661f99
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f2661f99
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f2661f99
Branch: refs/heads/2.x
Commit: f2661f997e69fcaf388561f122b306021928a5d4
Parents: 7422218
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 9 12:51:51 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 9 12:51:51 2016 -0500
----------------------------------------------------------------------
.../main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java | 4 +++-
.../java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 2 +-
2 files changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/f2661f99/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 2203a7f..a83d419 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -95,7 +95,9 @@ public class TesseractOCRParser extends AbstractParser {
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<MediaType>(Arrays.asList(new MediaType[] {
MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
- MediaType.image("x-ms-bmp"), MediaType.image("gif")
+ MediaType.image("x-ms-bmp"), MediaType.image("gif"),
+ MediaType.image("jp2"),
+ MediaType.image("jpx")
})));
private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();
http://git-wip-us.apache.org/repos/asf/tika/blob/f2661f99/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 501364b..8d7e9a9 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -91,7 +91,7 @@ public class TesseractOCRParserTest extends TikaTest {
// Assuming that Tesseract is on the path, the parser should report 7 supported media types, PNG among them.
assumeTrue(canRun());
- assertEquals(5, parser.getSupportedTypes(parseContext).size());
+ assertEquals(7, parser.getSupportedTypes(parseContext).size());
assertTrue(parser.getSupportedTypes(parseContext).contains(png));
// DefaultParser will now select the TesseractOCRParser.