You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/03 18:09:20 UTC
[tika] branch main updated: Add an interface for rendering engines (#555)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a02c195d1 Add an interface for rendering engines (#555)
a02c195d1 is described below
commit a02c195d162e165c9d3a9ff5938245b96a42bfd1
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue May 3 14:09:14 2022 -0400
Add an interface for rendering engines (#555)
* TIKA-3571 -- add an interface for rendering engines
---
.../java/org/apache/tika/config/TikaConfig.java | 139 ++++++++++++--
.../tika/extractor/EmbeddedDocumentUtil.java | 4 +-
.../java/org/apache/tika/metadata/Rendering.java | 27 +++
.../apache/tika/metadata/TikaCoreProperties.java | 5 +-
.../java/org/apache/tika/parser/DefaultParser.java | 34 +++-
.../org/apache/tika/parser/RenderingParser.java | 24 +++
.../apache/tika/renderer/CompositeRenderer.java | 102 ++++++++++
.../tika/renderer/PageBasedRenderResults.java | 50 +++++
.../org/apache/tika/renderer/PageRangeRequest.java | 60 ++++++
.../org/apache/tika/renderer/RenderRequest.java | 27 +++
.../org/apache/tika/renderer/RenderResult.java | 62 ++++++
.../org/apache/tika/renderer/RenderResults.java | 53 ++++++
.../java/org/apache/tika/renderer/Renderer.java | 62 ++++++
.../org/apache/tika/renderer/RenderingState.java | 26 +++
.../org/apache/tika/renderer/RenderingTracker.java | 31 +++
.../tika-parser-pdf-module/pom.xml | 7 +
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 212 +++++++++++++++------
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 15 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 18 +-
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 13 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 163 +++++++++++++---
.../apache/tika/parser/pdf/PDFParserConfig.java | 212 +++++++++++----------
.../tika/parser/pdf/TextOnlyPDFRenderer.java | 106 +++++++++++
.../apache/tika/renderer/pdf/MuPDFRenderer.java | 149 +++++++++++++++
.../tika/renderer/pdf/PDDocumentRenderer.java | 27 +++
.../apache/tika/renderer/pdf/PDFBoxRenderer.java | 198 +++++++++++++++++++
.../tika/renderer/pdf/PDFRenderingState.java | 45 +++++
.../apache/tika/parser/pdf/PDFRenderingTest.java | 109 +++++++++++
.../tika/parser/pdf/tika-rendering-config.xml | 25 +++
.../apache/tika/parser/crypto/TSDParserTest.java | 8 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 27 +++
.../configs/tika-rendering-mupdf-config.xml | 25 +++
32 files changed, 1837 insertions(+), 228 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 1606262f6..e7c212f87 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -74,7 +74,10 @@ import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.RenderingParser;
import org.apache.tika.parser.multiple.AbstractMultipleParser;
+import org.apache.tika.renderer.CompositeRenderer;
+import org.apache.tika.renderer.Renderer;
import org.apache.tika.utils.AnnotationUtils;
import org.apache.tika.utils.XMLReaderUtils;
@@ -95,6 +98,7 @@ public class TikaConfig {
private final MimeTypes mimeTypes;
private final ExecutorService executorService;
private final EncodingDetector encodingDetector;
+ private final Renderer renderer;
private final MetadataFilter metadataFilter;
private final AutoDetectParserConfig autoDetectParserConfig;
@@ -155,12 +159,14 @@ public class TikaConfig {
TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader();
EncodingDetectorXmlLoader encodingDetectorXmlLoader = new EncodingDetectorXmlLoader();
+ RendererXmlLoader rendererXmlLoader = new RendererXmlLoader();
updateXMLReaderUtils(element);
this.mimeTypes = typesFromDomElement(element);
this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
this.encodingDetector = encodingDetectorXmlLoader.loadOverall(element, mimeTypes, loader);
+ this.renderer = rendererXmlLoader.loadOverall(element, mimeTypes, loader);
- ParserXmlLoader parserLoader = new ParserXmlLoader(encodingDetector);
+ ParserXmlLoader parserLoader = new ParserXmlLoader(encodingDetector, renderer);
this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
this.translator = translatorLoader.loadOverall(element, mimeTypes, loader);
this.executorService = executorLoader.loadOverall(element, mimeTypes, loader);
@@ -187,7 +193,8 @@ public class TikaConfig {
this.mimeTypes = getDefaultMimeTypes(loader);
this.detector = getDefaultDetector(mimeTypes, serviceLoader);
this.encodingDetector = getDefaultEncodingDetector(serviceLoader);
- this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector);
+ this.renderer = getDefaultRenderer(serviceLoader);
+ this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector, renderer);
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = new NoOpFilter();
@@ -223,7 +230,8 @@ public class TikaConfig {
this.serviceLoader = new ServiceLoader();
this.mimeTypes = getDefaultMimeTypes(getContextClassLoader());
this.encodingDetector = getDefaultEncodingDetector(serviceLoader);
- this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector);
+ this.renderer = getDefaultRenderer(serviceLoader);
+ this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector, renderer);
this.detector = getDefaultDetector(mimeTypes, serviceLoader);
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
@@ -237,6 +245,7 @@ public class TikaConfig {
serviceLoader = serviceLoaderFromDomElement(element, tmpServiceLoader.getLoader());
DetectorXmlLoader detectorLoader = new DetectorXmlLoader();
EncodingDetectorXmlLoader encodingDetectorLoader = new EncodingDetectorXmlLoader();
+ RendererXmlLoader rendererLoader = new RendererXmlLoader();
TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader();
@@ -244,8 +253,9 @@ public class TikaConfig {
this.encodingDetector =
encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader);
+ this.renderer = rendererLoader.loadOverall(element, mimeTypes, serviceLoader);
- ParserXmlLoader parserLoader = new ParserXmlLoader(encodingDetector);
+ ParserXmlLoader parserLoader = new ParserXmlLoader(encodingDetector, renderer);
this.parser = parserLoader.loadOverall(element, mimeTypes, serviceLoader);
this.detector = detectorLoader.loadOverall(element, mimeTypes, serviceLoader);
this.translator = translatorLoader.loadOverall(element, mimeTypes, serviceLoader);
@@ -273,9 +283,12 @@ public class TikaConfig {
return new DefaultEncodingDetector(loader);
}
+ protected static CompositeRenderer getDefaultRenderer(ServiceLoader loader) {
+ return new CompositeRenderer(loader);
+ }
private static CompositeParser getDefaultParser(MimeTypes types, ServiceLoader loader,
- EncodingDetector encodingDetector) {
- return new DefaultParser(types.getMediaTypeRegistry(), loader, encodingDetector);
+ EncodingDetector encodingDetector, Renderer renderer) {
+ return new DefaultParser(types.getMediaTypeRegistry(), loader, encodingDetector, renderer);
}
private static Translator getDefaultTranslator(ServiceLoader loader) {
@@ -811,9 +824,11 @@ public class TikaConfig {
private static class ParserXmlLoader extends XmlLoader<CompositeParser, Parser> {
private final EncodingDetector encodingDetector;
+ private final Renderer renderer;
- private ParserXmlLoader(EncodingDetector encodingDetector) {
+ private ParserXmlLoader(EncodingDetector encodingDetector, Renderer renderer) {
this.encodingDetector = encodingDetector;
+ this.renderer = renderer;
}
boolean supportsComposite() {
@@ -860,7 +875,7 @@ public class TikaConfig {
@Override
CompositeParser createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
- return getDefaultParser(mimeTypes, loader, encodingDetector);
+ return getDefaultParser(mimeTypes, loader, encodingDetector, renderer);
}
@Override
@@ -880,6 +895,15 @@ public class TikaConfig {
MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
// Try the possible default and composite parser constructors
+ if (parser == null) {
+ try {
+ c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
+ Collection.class, EncodingDetector.class, Renderer.class);
+ parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, renderer);
+ } catch (NoSuchMethodException me) {
+ //swallow
+ }
+ }
if (parser == null) {
try {
c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
@@ -948,12 +972,18 @@ public class TikaConfig {
Parser newInstance(Class<? extends Parser> loadedClass)
throws IllegalAccessException, InstantiationException, NoSuchMethodException,
InvocationTargetException {
+ Parser parser = null;
if (AbstractEncodingDetectorParser.class.isAssignableFrom(loadedClass)) {
Constructor ctor = loadedClass.getConstructor(EncodingDetector.class);
- return (Parser) ctor.newInstance(encodingDetector);
+ parser = (Parser) ctor.newInstance(encodingDetector);
} else {
- return loadedClass.newInstance();
+ parser = loadedClass.newInstance();
}
+
+ if (parser instanceof RenderingParser) {
+ ((RenderingParser)parser).setRenderer(renderer);
+ }
+ return parser;
}
@Override
@@ -1306,7 +1336,7 @@ public class TikaConfig {
c = encodingDetectorClass.getConstructor(List.class);
encodingDetector = c.newInstance(childEncodingDetectors);
} catch (NoSuchMethodException me) {
- LOG.debug("couldn't find constructor for EncodingDetecto(List) for {}",
+ LOG.debug("couldn't find constructor for EncodingDetector(List) for {}",
encodingDetectorClass);
}
}
@@ -1320,4 +1350,91 @@ public class TikaConfig {
}
}
+ private static class RendererXmlLoader
+ extends XmlLoader<Renderer, Renderer> {
+
+ boolean supportsComposite() {
+ return true;
+ }
+
+ String getParentTagName() {
+ return "renderers";
+ }
+
+ String getLoaderTagName() {
+ return "renderer";
+ }
+
+ @Override
+ Class<? extends Renderer> getLoaderClass() {
+ return Renderer.class;
+ }
+
+
+ @Override
+ boolean isComposite(Renderer loaded) {
+ return loaded instanceof CompositeRenderer;
+ }
+
+ @Override
+ boolean isComposite(Class<? extends Renderer> loadedClass) {
+ return CompositeRenderer.class.isAssignableFrom(loadedClass);
+ }
+
+ @Override
+ Renderer preLoadOne(Class<? extends Renderer> loadedClass, String classname,
+ MimeTypes mimeTypes) throws TikaException {
+ // Check for classes which can't be set in config
+ // Continue with normal loading
+ return null;
+ }
+
+ @Override
+ Renderer createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
+ return getDefaultRenderer(loader);
+ }
+
+ @Override
+ Renderer createComposite(List<Renderer> renderers,
+ MimeTypes mimeTypes, ServiceLoader loader) {
+ return new CompositeRenderer(renderers);
+ }
+
+ @Override
+ Renderer createComposite(Class<? extends Renderer> rendererClass,
+ List<Renderer> childRenderers,
+ Set<Class<? extends Renderer>> excludeRenderers,
+ Map<String, Param> params, MimeTypes mimeTypes,
+ ServiceLoader loader)
+ throws InvocationTargetException, IllegalAccessException, InstantiationException {
+ Renderer renderer = null;
+ Constructor<? extends Renderer> c;
+
+ // Try the possible default and composite renderer constructors
+ if (renderer == null) {
+ try {
+ c = rendererClass.getConstructor(ServiceLoader.class, Collection.class);
+ renderer = c.newInstance(loader, excludeRenderers);
+ } catch (NoSuchMethodException me) {
+ LOG.debug("couldn't find constructor for service loader + collection for {}",
+ renderer);
+ }
+ }
+ if (renderer == null) {
+ try {
+ c = rendererClass.getConstructor(List.class);
+ renderer = c.newInstance(childRenderers);
+ } catch (NoSuchMethodException me) {
+ LOG.debug("couldn't find constructor for Renderer(List) for {}",
+ rendererClass);
+ }
+ }
+ return renderer;
+ }
+
+ @Override
+ Renderer decorate(Renderer created, Element element) {
+ return created; // No decoration of Renderers
+ }
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 5854aba28..ccac4f1db 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -92,7 +92,9 @@ public class EmbeddedDocumentUtil implements Serializable {
context.set(Parser.class, new AutoDetectParser(tikaConfig));
}
}
- return new ParsingEmbeddedDocumentExtractor(context);
+ EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context);
+ context.set(EmbeddedDocumentExtractor.class, ex);
+ return ex;
}
/**
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
new file mode 100644
index 000000000..73788fef3
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.tika.metadata;
+
+public interface Rendering {
+ String RENDERING_PREFIX = "rendering:";
+
+ Property PAGE_NUMBER = Property.externalInteger(RENDERING_PREFIX + "page_number");
+ Property RENDERED_BY = Property.externalTextBag(RENDERING_PREFIX + "Rendered-By");
+ Property RENDERED_MS = Property.externalReal(RENDERING_PREFIX + "rendering-time-ms");
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index ba138c54f..21581a482 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -277,7 +277,7 @@ public interface TikaCoreProperties {
Property EMBEDDED_RESOURCE_TYPE = Property.internalClosedChoise(EMBEDDED_RESOURCE_TYPE_KEY,
EmbeddedResourceType.ATTACHMENT.toString(), EmbeddedResourceType.INLINE.toString(),
EmbeddedResourceType.METADATA.toString(), EmbeddedResourceType.MACRO.toString(),
- EmbeddedResourceType.THUMBNAIL.toString());
+ EmbeddedResourceType.THUMBNAIL.toString(), EmbeddedResourceType.RENDERING.toString());
Property HAS_SIGNATURE = Property.internalBoolean("hasSignature");
@@ -302,6 +302,7 @@ public interface TikaCoreProperties {
MACRO, //any code that is intended to be run by the application
METADATA, //e.g. xmp, xfa
FONT,//embedded font files
- THUMBNAIL//TODO: set this in parsers that handle thumbnails
+ THUMBNAIL, //TODO: set this in parsers that handle thumbnails
+ RENDERING //if a file has been rendered
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
index 2abeeed52..336adee93 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
@@ -27,6 +27,8 @@ import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.renderer.CompositeRenderer;
+import org.apache.tika.renderer.Renderer;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
@@ -46,25 +48,27 @@ public class DefaultParser extends CompositeParser {
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
Collection<Class<? extends Parser>> excludeParsers,
- EncodingDetector encodingDetector) {
- super(registry, getDefaultParsers(loader, encodingDetector, excludeParsers));
+ EncodingDetector encodingDetector, Renderer renderer) {
+ super(registry, getDefaultParsers(loader, encodingDetector, renderer, excludeParsers));
this.loader = loader;
}
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
Collection<Class<? extends Parser>> excludeParsers) {
super(registry,
- getDefaultParsers(loader, new DefaultEncodingDetector(loader), excludeParsers));
+ getDefaultParsers(loader, new DefaultEncodingDetector(loader),
+ new CompositeRenderer(loader), excludeParsers));
this.loader = loader;
}
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
- EncodingDetector encodingDetector) {
- this(registry, loader, Collections.EMPTY_SET, encodingDetector);
+ EncodingDetector encodingDetector, Renderer renderer) {
+ this(registry, loader, Collections.EMPTY_SET, encodingDetector, renderer);
}
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) {
- this(registry, loader, Collections.EMPTY_SET, new DefaultEncodingDetector(loader));
+ this(registry, loader, Collections.EMPTY_SET, new DefaultEncodingDetector(loader),
+ new CompositeRenderer(loader));
}
public DefaultParser(MediaTypeRegistry registry, ClassLoader loader) {
@@ -94,6 +98,7 @@ public class DefaultParser extends CompositeParser {
*/
private static List<Parser> getDefaultParsers(ServiceLoader loader,
EncodingDetector encodingDetector,
+ Renderer renderer,
Collection<Class<? extends Parser>>
excludeParsers) {
List<Parser> parsers =
@@ -104,6 +109,11 @@ public class DefaultParser extends CompositeParser {
setEncodingDetector(p, encodingDetector);
}
}
+ if (renderer != null) {
+ for (Parser p : parsers) {
+ setRenderer(p, renderer);
+ }
+ }
ServiceLoaderUtils.sortLoadedClasses(parsers);
return parsers;
}
@@ -122,6 +132,18 @@ public class DefaultParser extends CompositeParser {
}
}
+ private static void setRenderer(Parser p, Renderer renderer) {
+ if (p instanceof RenderingParser) {
+ ((RenderingParser) p).setRenderer(renderer);
+ } else if (p instanceof CompositeParser) {
+ for (Parser child : ((CompositeParser) p).getAllComponentParsers()) {
+ setRenderer(child, renderer);
+ }
+ } else if (p instanceof ParserDecorator) {
+ setRenderer(((ParserDecorator) p).getWrappedParser(), renderer);
+ }
+ }
+
@Override
public Map<MediaType, Parser> getParsers(ParseContext context) {
Map<MediaType, Parser> map = super.getParsers(context);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java
new file mode 100644
index 000000000..0daae6be1
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */package org.apache.tika.parser;
+
+import org.apache.tika.renderer.Renderer;
+
+public interface RenderingParser {
+
+ void setRenderer(Renderer renderer);
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
new file mode 100644
index 000000000..a98d39c97
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */package org.apache.tika.renderer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.ServiceLoaderUtils;
+
+public class CompositeRenderer implements Renderer, Initializable {
+
+ private Map<MediaType, Renderer> rendererMap = new HashMap<>();
+
+ public CompositeRenderer(ServiceLoader serviceLoader) {
+ this(getDefaultRenderers(serviceLoader));
+ }
+
+ public CompositeRenderer(List<Renderer> renderers) {
+ Map<MediaType, Renderer> tmp = new ConcurrentHashMap<>();
+ ParseContext empty = new ParseContext();
+ for (Renderer renderer : renderers) {
+ for (MediaType mt : renderer.getSupportedTypes(empty)) {
+ tmp.put(mt, renderer);
+ }
+ }
+ rendererMap = Collections.unmodifiableMap(tmp);
+ }
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return rendererMap.keySet();
+ }
+
+ @Override
+ public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+ RenderRequest... requests) throws IOException, TikaException {
+
+ String mediaTypeString = metadata.get(TikaCoreProperties.TYPE);
+ if (mediaTypeString == null) {
+ throw new TikaException("need to specify file type in metadata");
+ }
+ MediaType mt = MediaType.parse(mediaTypeString);
+ if (mt == null) {
+ throw new TikaException("can't parse mediaType: " + mediaTypeString);
+ }
+ Renderer renderer = rendererMap.get(mt);
+ if (renderer == null) {
+ throw new TikaException("I regret I can't find a renderer for " + mt);
+ }
+ return renderer.render(is, metadata, parseContext, requests);
+ }
+
+ public Renderer getLeafRenderer(MediaType mt) {
+ return rendererMap.get(mt);
+ }
+ @Override
+ public void initialize(Map<String, Param> params) throws TikaConfigException {
+
+ }
+
+ @Override
+ public void checkInitialization(InitializableProblemHandler problemHandler)
+ throws TikaConfigException {
+
+ }
+
+ private static List<Renderer> getDefaultRenderers(ServiceLoader loader) {
+ List<Renderer> staticRenderers =
+ loader.loadStaticServiceProviders(Renderer.class);
+
+ ServiceLoaderUtils.sortLoadedClasses(staticRenderers);
+ return staticRenderers;
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
new file mode 100644
index 000000000..0c238b60d
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Rendering;
+
+public class PageBasedRenderResults extends RenderResults {
+
+ Map<Integer, List<RenderResult>> results = new HashMap<>();
+
+ public PageBasedRenderResults(TemporaryResources tmp) {
+ super(tmp);
+ }
+ public void add(RenderResult result) {
+ Integer page = result.getMetadata().getInt(Rendering.PAGE_NUMBER);
+ if (page != null) {
+ List<RenderResult> pageResults = results.get(page);
+ if (pageResults == null) {
+ pageResults = new ArrayList<>();
+ results.put(page, pageResults);
+ }
+ pageResults.add(result);
+ }
+ super.add(result);
+ }
+
+ public List<RenderResult> getPage(int pageNumber) {
+ return results.get(pageNumber);
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java
new file mode 100644
index 000000000..2534d7032
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.util.Objects;
+
+/**
+ * The range of pages to render. These are 1-based, and "to" is inclusive.
+ */
+public class PageRangeRequest implements RenderRequest {
+
+ public static PageRangeRequest RENDER_ALL = new PageRangeRequest(1, -1);
+
+ private final int from;
+ private final int to;
+
+ public PageRangeRequest(int from, int to) {
+ this.from = from;
+ this.to = to;
+ }
+
+ public int getFrom() {
+ return from;
+ }
+
+ public int getTo() {
+ return to;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ PageRangeRequest that = (PageRangeRequest) o;
+ return from == that.from && to == that.to;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(from, to);
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
new file mode 100644
index 000000000..3277d866a
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+/**
+ * Empty interface for requests to a renderer. Different
+ * file formats and different use cases will have different types of requests.
+ * For page-based formats, it could be a page range (render the full pages from 2 to 5);
+ * or it could be a single page with an x-y bounding box. For video files,
+ * it could be a temporal offset or a temporal offset with an x-y bounding box.
+ */
+public interface RenderRequest {
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
new file mode 100644
index 000000000..888b0dd4c
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.nio.file.Path;
+
+import org.apache.tika.metadata.Metadata;
+
+public class RenderResult {
+
+ public enum STATUS {
+ SUCCESS,
+ EXCEPTION,
+ TIMEOUT
+ }
+ private final STATUS status;
+
+ private final int id;
+ private final Path path;
+ //TODO: we're relying on metadata to bring in a bunch of info.
+ //Might be cleaner to add specific parameters for page number, embedded path, etc.?
+ private final Metadata metadata;
+
+ public RenderResult(STATUS status, int id, Path path, Metadata metadata) {
+ this.status = status;
+ this.id = id;
+ this.path = path;
+ this.metadata = metadata;
+ }
+
+ public Path getPath() {
+ return path;
+ }
+
+ public Metadata getMetadata() {
+ return metadata;
+ }
+
+ public STATUS getStatus() {
+ return status;
+ }
+
+ public int getId() {
+ return id;
+ }
+
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
new file mode 100644
index 000000000..12d60d3da
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.io.TemporaryResources;
+
+/**
+ * Collects {@link RenderResult}s and ties the lifetime of their rendered files to a
+ * {@link TemporaryResources}: closing this object closes the temporary resources,
+ * which in turn deletes the file behind every result that was added.
+ */
+public class RenderResults implements Closeable {
+
+    private final List<RenderResult> results = new ArrayList<>();
+
+    private final TemporaryResources tmp;
+
+    /**
+     * @param tmp temporary resources that own the cleanup of the rendered files;
+     *            closed when this object is closed
+     */
+    public RenderResults(TemporaryResources tmp) {
+        this.tmp = tmp;
+    }
+
+    /**
+     * Adds a result and registers its file, if it has one, for deletion on {@link #close()}.
+     *
+     * @param result the result to add
+     */
+    public void add(RenderResult result) {
+        //a result may carry no file at all (e.g. a failed rendering built with a null
+        //path); registering a delete for it would make close() throw a
+        //NullPointerException
+        if (result.getPath() != null) {
+            tmp.addResource(new Closeable() {
+                @Override
+                public void close() throws IOException {
+                    //the file may already have been removed by whoever created it;
+                    //a missing file must not make close() fail
+                    Files.deleteIfExists(result.getPath());
+                }
+            });
+        }
+        results.add(result);
+    }
+
+    /**
+     * @return the results in the order they were added; this is the backing list,
+     * not a copy
+     */
+    public List<RenderResult> getResults() {
+        return results;
+    }
+
+    /**
+     * Closes the underlying {@link TemporaryResources}.
+     *
+     * @throws IOException if closing the temporary resources fails
+     */
+    @Override
+    public void close() throws IOException {
+        tmp.close();
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
new file mode 100644
index 000000000..bc4261f52
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Interface for a renderer. The initial design targets PDF pages, but this should be
+ * flexible enough to run on portions of PDF pages as well as on other document types.
+ */
+public interface Renderer extends Serializable {
+
+
+
+ /**
+ * Returns the set of media types supported by this renderer when used
+ * with the given parse context.
+ *
+ * @param context parse context
+ * @return immutable set of media types
+ * @since Apache Tika 2.5.0
+ */
+ Set<MediaType> getSupportedTypes(ParseContext context);
+
+ /**
+ * Renders the given document according to the given requests.
+ * <p>
+ * The returned {@link RenderResults} is {@link java.io.Closeable}; closing it
+ * deletes the files behind the results that were added to it.
+ *
+ * @param is stream of the document to render
+ * @param metadata metadata for the document to render
+ * @param parseContext parse context
+ * @param requests what to render, e.g. a {@link PageRangeRequest}
+ * @return the results of the rendering
+ * @throws IOException declared by this method; the conditions are implementation-specific
+ * @throws TikaException declared by this method; the conditions are implementation-specific
+ */
+ RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+ RenderRequest ... requests) throws IOException,
+ TikaException;
+
+ /*
+ At some point, we might need/want to add something like this, where for a given
+ page the requestor or the parser determines that they only want to render e.g. a
+ box within a page.
+
+ RenderResults render(InputStream is, int page, Coordinates coordinates, Metadata metadata,
+ ParseContext parseContext) throws IOException,
+ TikaException;
+
+ */
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
new file mode 100644
index 000000000..ed8250065
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+/**
+ * Tracks rendering state for a single file (embedded or otherwise).
+ * The instance in the ParseContext should be reset at the beginning of a parse
+ * and then replaced at the end of the parse.
+ * <p>
+ * This class currently declares no state of its own.
+ */
+public class RenderingState {
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
new file mode 100644
index 000000000..49c775e69
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+/**
+ * Put one of these in the ParseContext to hand out unique ids for rendered
+ * images, including those in embedded documents. A single instance should be
+ * used for the full parse of a main document and its embedded documents.
+ */
+public class RenderingTracker {
+
+    //last id handed out; the first call to getNextId() returns 1
+    private int lastId = 0;
+
+    /**
+     * @return the next id; each call returns a value one greater than the previous call
+     */
+    public synchronized int getNextId() {
+        lastId += 1;
+        return lastId;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
index 5440476bd..be05f67b1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
@@ -74,6 +74,13 @@
<artifactId>jaxb-runtime</artifactId>
<version>${jaxb.version}</version>
</dependency>
+ <!-- incompatible with Apache license, only use in testing -->
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-core</artifactId>
+ <version>${imageio.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 93dfbd119..344756dd0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -92,7 +92,6 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -105,10 +104,19 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Font;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.renderer.CompositeRenderer;
+import org.apache.tika.renderer.PageBasedRenderResults;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.RenderingTracker;
+import org.apache.tika.renderer.pdf.PDDocumentRenderer;
+import org.apache.tika.renderer.pdf.PDFRenderingState;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -156,10 +164,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int unmappedUnicodeCharsPerPage = 0;
int totalCharsPerPage = 0;
- AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context,
+ AbstractPDF2XHTML(PDDocument pdDocument, XHTMLContentHandler xhtml, ParseContext context,
Metadata metadata, PDFParserConfig config) throws IOException {
this.pdDocument = pdDocument;
- this.xhtml = new XHTMLContentHandler(handler, metadata);
+ this.xhtml = xhtml;
this.context = context;
this.metadata = metadata;
this.config = config;
@@ -293,9 +301,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
private void parseMetadata(InputStream stream, Metadata embeddedMetadata)
throws IOException, SAXException {
try {
- embeddedDocumentExtractor
- .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
- true);
+ embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, true);
} catch (IOException e) {
handleCatchableIOE(e);
}
@@ -318,8 +325,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
- private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
- PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException {
+ private void extractFilesfromEFTree(PDNameTreeNode efTree,
+ Map<String, PDComplexFileSpecification> embeddedFileNames,
+ int depth) throws IOException {
if (depth > MAX_RECURSION_DEPTH) {
throw new IOException("Hit max recursion depth");
}
@@ -434,9 +442,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
xhtml.endElement("div");
try {
- embeddedDocumentExtractor
- .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
- false);
+ embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, false);
} finally {
IOUtils.closeQuietly(stream);
}
@@ -477,43 +484,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
"Please set the OCR_STRATEGY to NO_OCR or configure your" +
"OCR parser correctly");
} else if (ocrStrategy == AUTO) {
- //silently skip
+ //silently skip if there's no parser to run ocr
return;
}
}
- PDFRenderer renderer =
- config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT ?
- new NoTextPDFRenderer(pdDocument) : new PDFRenderer(pdDocument);
-
try (TemporaryResources tmp = new TemporaryResources()) {
- int dpi = config.getOcrDPI();
- Path tmpFile = null;
- try {
- BufferedImage image =
- renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
- tmpFile = tmp.createTempFile();
- try (OutputStream os = Files.newOutputStream(tmpFile)) {
- //TODO: get output format from TesseractConfig
- ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), os, dpi,
- config.getOcrImageQuality());
- }
- } catch (SecurityException e) {
- //throw SecurityExceptions immediately
- throw e;
- } catch (IOException | RuntimeException e) {
- //image rendering can throw a variety of runtime exceptions, not just
- // IOExceptions...
- //need to have a wide catch
- metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
- ExceptionUtils.getStackTrace(e));
- return;
- }
- try (InputStream is = TikaInputStream.get(tmpFile)) {
- metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+ RenderResult renderResult = renderCurrentPage(context, tmp);
+ Metadata renderMetadata = renderResult.getMetadata();
+ try (InputStream is = TikaInputStream.get(renderResult.getPath())) {
+ renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
ocrImageMediaType.toString());
ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
- metadata, context);
+ renderMetadata, context);
}
} catch (IOException e) {
handleCatchableIOE(e);
@@ -522,6 +505,121 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+    /**
+     * Renders the current page, reusing an earlier rendering of that page where possible.
+     * <p>
+     * If the parse context holds no {@code PDFRenderingState}, the page is rendered
+     * via {@code noContextRenderCurrentPage}. Otherwise, if the rendering state already
+     * holds exactly one result for the current page, that result is returned. Failing
+     * that, the configured renderer is used when one supports PDF and the OCR rendering
+     * strategy is {@code ALL}; in every other case this falls back to
+     * {@code noContextRenderCurrentPage}.
+     *
+     * @param parseContext parse context to look up the rendering state in; also passed
+     *                     on to the renderer
+     * @param tmpResources temporary resources used by the fallback rendering
+     * @return the render result for the current page
+     * @throws IOException   if rendering fails with an IOException
+     * @throws TikaException if the renderer throws a TikaException
+     */
+    private RenderResult renderCurrentPage(ParseContext parseContext,
+                                           TemporaryResources tmpResources)
+            throws IOException, TikaException {
+        PDFRenderingState renderingState = parseContext.get(PDFRenderingState.class);
+        if (renderingState == null) {
+            //the result has to be returned here: falling through would dereference
+            //the null rendering state below
+            return noContextRenderCurrentPage(parseContext, tmpResources);
+        }
+        //if the full document has already been rendered, then reuse that file
+        //TODO: we need to prevent this if only a portion of the page or portions
+        //of the page have been rendered.
+        //TODO: we should also figure out how to not reuse the rendering if
+        //the user wants to render twice (say, full color to display to users, but
+        //grayscale for (notionally?) better OCR).
+        PageBasedRenderResults results =
+                (PageBasedRenderResults) renderingState.getRenderResults();
+        if (results != null) {
+            List<RenderResult> pageResults = results.getPage(getCurrentPageNo());
+            if (pageResults.size() == 1) {
+                return pageResults.get(0);
+            }
+        }
+        Renderer thisRenderer = getPDFRenderer(config.getRenderer());
+        //if there's a configured renderer and if the rendering strategy is "all"
+        if (thisRenderer != null &&
+                config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.ALL) {
+            PageRangeRequest pageRangeRequest =
+                    new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
+            if (thisRenderer instanceof PDDocumentRenderer) {
+                //hand over the already parsed PDDocument as the open container
+                //of an otherwise empty stream
+                try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+                    Metadata m = new Metadata();
+                    m.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+                    tis.setOpenContainer(pdDocument);
+                    return thisRenderer.render(tis, m, parseContext, pageRangeRequest)
+                            .getResults().get(0);
+                }
+            } else {
+                Metadata m = new Metadata();
+                m.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+                PDFRenderingState state = context.get(PDFRenderingState.class);
+                if (state == null) {
+                    throw new IllegalArgumentException("RenderingState must not be null");
+                }
+                return thisRenderer
+                        .render(state.getTikaInputStream(), m, parseContext, pageRangeRequest)
+                        .getResults().get(0);
+            }
+        } else {
+            return noContextRenderCurrentPage(parseContext, tmpResources);
+        }
+    }
+
+    /**
+     * Picks the renderer to use for PDFs out of the configured renderer.
+     *
+     * @param renderer the configured renderer; may be {@code null}
+     * @return the leaf renderer for PDF if {@code renderer} is a composite, {@code renderer}
+     * itself if it supports PDF, {@code null} otherwise
+     */
+    private Renderer getPDFRenderer(Renderer renderer) {
+        //instanceof is false for null, so a null renderer falls through to the last return
+        if (renderer instanceof CompositeRenderer) {
+            CompositeRenderer composite = (CompositeRenderer) renderer;
+            return composite.getLeafRenderer(PDFParser.MEDIA_TYPE);
+        }
+        boolean supportsPdf = renderer != null &&
+                renderer.getSupportedTypes(context).contains(PDFParser.MEDIA_TYPE);
+        return supportsPdf ? renderer : null;
+    }
+
+
+    /**
+     * Renders the current page directly with PDFBox and writes the image to a temp file.
+     * <p>
+     * The PDFBox renderer is chosen by the configured OCR rendering strategy. The id of
+     * the result comes from the {@link RenderingTracker} in the parse context; one is
+     * created and stored there if none is present. If rendering or writing the image
+     * fails, the stack trace is added to the metadata of the document being parsed and
+     * a result with status {@code EXCEPTION} and a {@code null} path is returned.
+     *
+     * @param parseContext parse context that holds, or receives, the rendering tracker
+     * @param tmpResources temporary resources used to create the image file
+     * @return the render result; its path is {@code null} if rendering failed
+     * @throws IOException       declared, but IOExceptions from rendering are caught
+     *                           and reported in the result
+     * @throws TikaException     declared, but not thrown by this implementation
+     * @throws SecurityException rethrown immediately, unlike other runtime exceptions
+     */
+    private RenderResult noContextRenderCurrentPage(ParseContext parseContext,
+                                                    TemporaryResources tmpResources)
+            throws IOException, TikaException {
+        PDFRenderer pdfBoxRenderer = null;
+        switch (config.getOcrRenderingStrategy()) {
+            case NO_TEXT:
+                pdfBoxRenderer = new NoTextPDFRenderer(pdDocument);
+                break;
+            case TEXT_ONLY:
+                pdfBoxRenderer = new TextOnlyPDFRenderer(pdDocument);
+                break;
+            case ALL:
+                pdfBoxRenderer = new PDFRenderer(pdDocument);
+                break;
+        }
+
+        int dpi = config.getOcrDPI();
+        Metadata renderMetadata = new Metadata();
+        //pageIndex is zero-based, the reported page number is one-based
+        renderMetadata.set(Rendering.PAGE_NUMBER, pageIndex + 1);
+
+        RenderingTracker tracker = parseContext.get(RenderingTracker.class);
+        if (tracker == null) {
+            tracker = new RenderingTracker();
+            parseContext.set(RenderingTracker.class, tracker);
+        }
+        int renderId = tracker.getNextId();
+
+        Path imageFile;
+        try {
+            BufferedImage pageImage =
+                    pdfBoxRenderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
+            imageFile = tmpResources.createTempFile();
+            try (OutputStream out = Files.newOutputStream(imageFile)) {
+                //TODO: get output format from TesseractConfig
+                ImageIOUtil.writeImage(pageImage, config.getOcrImageFormatName(), out, dpi,
+                        config.getOcrImageQuality());
+            }
+        } catch (SecurityException e) {
+            //throw SecurityExceptions immediately
+            throw e;
+        } catch (IOException | RuntimeException e) {
+            //image rendering can throw a variety of runtime exceptions, not just
+            // IOExceptions...
+            //need to have a wide catch
+            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
+                    ExceptionUtils.getStackTrace(e));
+
+            return new RenderResult(RenderResult.STATUS.EXCEPTION, renderId, null,
+                    renderMetadata);
+        }
+        return new RenderResult(RenderResult.STATUS.SUCCESS, renderId, imageFile,
+                renderMetadata);
+    }
+
@Override
protected void endPage(PDPage page) throws IOException {
metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
@@ -547,7 +645,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (fann.getFile() instanceof PDComplexFileSpecification) {
handlePDComplexFileSpec(fann.getAttachmentName(),
"annotationFileAttachment",
- (PDComplexFileSpecification)fann.getFile());
+ (PDComplexFileSpecification) fann.getFile());
}
} else if (annotation instanceof PDAnnotationWidget) {
handleWidget((PDAnnotationWidget) annotation);
@@ -560,11 +658,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//subtype is U3D or PRC or model/ (prefix for model mime type)
metadata.set(PDF.HAS_3D, true);
}
- for (COSDictionary fileSpec :
- findFileSpecs(annotation.getCOSObject())) {
+ for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) {
PDComplexFileSpecification cfs = new PDComplexFileSpecification(fileSpec);
- handlePDComplexFileSpec(cfs.getFilename(),
- annotationSubtype, cfs);
+ handlePDComplexFileSpec(cfs.getFilename(), annotationSubtype, cfs);
}
}
// TODO: remove once PDFBOX-1143 is fixed:
@@ -619,13 +715,16 @@ class AbstractPDF2XHTML extends PDFTextStripper {
boolean unmappedExceedsLimit = false;
if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
// There are enough characters to not have to do OCR. Check number of unmapped characters
- final float percentUnmapped = (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
- final float unmappedCharacterLimit = config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
- unmappedExceedsLimit = (unmappedCharacterLimit < 1)
- ? percentUnmapped > unmappedCharacterLimit
- : unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
+ final float percentUnmapped =
+ (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
+ final float unmappedCharacterLimit =
+ config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
+ unmappedExceedsLimit = (unmappedCharacterLimit < 1) ?
+ percentUnmapped > unmappedCharacterLimit :
+ unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
}
- if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() || unmappedExceedsLimit) {
+ if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() ||
+ unmappedExceedsLimit) {
doOCROnCurrentPage(AUTO);
}
}
@@ -664,14 +763,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return PDFDOMUtil.findType(cosDict, types, MAX_RECURSION_DEPTH);
}
- private void handlePDComplexFileSpec(String attachmentName,
- String annotationType,
+ private void handlePDComplexFileSpec(String attachmentName, String annotationType,
PDComplexFileSpecification fileSpec) throws IOException {
try {
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "source", "source", "CDATA", annotationType);
- extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec,
- attributes);
+ extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec, attributes);
} catch (SAXException e) {
throw new IOException("file embedded in annotation sax exception", e);
} catch (TikaException e) {
@@ -1130,8 +1227,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
enum ActionTrigger {
- AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS,
- ANNOTATION_CURSOR_EXIT,
+ AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS, ANNOTATION_CURSOR_EXIT,
ANNOTATION_LOSE_INPUT_FOCUS, ANNOTATION_MOUSE_CLICK, ANNOTATION_MOUSE_RELEASED,
ANNOTATION_PAGE_CLOSED, ANNOTATION_PAGE_NO_LONGER_VISIBLE, ANNOTATION_PAGE_OPENED,
ANNOTATION_PAGE_VISIBLE, ANNOTATION_RECEIVES_FOCUS, ANNOTATION_WIDGET,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
index 7493253bb..2658a484a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -23,12 +23,12 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
@@ -37,9 +37,9 @@ import org.apache.tika.parser.ParseContext;
*/
class OCR2XHTML extends AbstractPDF2XHTML {
- private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context,
+ private OCR2XHTML(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
Metadata metadata, PDFParserConfig config) throws IOException {
- super(document, handler, context, metadata, config);
+ super(document, xhtml, context, metadata, config);
}
/**
@@ -47,17 +47,18 @@ class OCR2XHTML extends AbstractPDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param document PDF document
- * @param handler SAX content handler
+ * @param xhtml SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument document, ContentHandler handler, ParseContext context,
- Metadata metadata, PDFParserConfig config)
+ public static void process(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
+ Metadata metadata,
+ PDFParserConfig config)
throws SAXException, TikaException {
OCR2XHTML ocr2XHTML = null;
try {
- ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config);
+ ocr2XHTML = new OCR2XHTML(document, xhtml, context, metadata, config);
ocr2XHTML.writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 93d1b7e81..602a8823e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -34,12 +34,12 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
@@ -63,9 +63,9 @@ class PDF2XHTML extends AbstractPDF2XHTML {
private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
private AtomicInteger inlineImageCounter = new AtomicInteger(0);
- PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+ PDF2XHTML(PDDocument document, XHTMLContentHandler xhtml, ParseContext context, Metadata metadata,
PDFParserConfig config) throws IOException {
- super(document, handler, context, metadata, config);
+ super(document, xhtml, context, metadata, config);
}
/**
@@ -73,12 +73,12 @@ class PDF2XHTML extends AbstractPDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param document PDF document
- * @param handler SAX content handler
+ * @param xhtml SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument document, ContentHandler handler, ParseContext context,
+ public static void process(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
Metadata metadata, PDFParserConfig config)
throws SAXException, TikaException {
PDF2XHTML pdf2XHTML = null;
@@ -88,9 +88,9 @@ class PDF2XHTML extends AbstractPDF2XHTML {
// handler.
if (config.isDetectAngles()) {
pdf2XHTML =
- new AngleDetectingPDF2XHTML(document, handler, context, metadata, config);
+ new AngleDetectingPDF2XHTML(document, xhtml, context, metadata, config);
} else {
- pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
+ pdf2XHTML = new PDF2XHTML(document, xhtml, context, metadata, config);
}
config.configure(pdf2XHTML);
@@ -225,10 +225,10 @@ class PDF2XHTML extends AbstractPDF2XHTML {
private static class AngleDetectingPDF2XHTML extends PDF2XHTML {
- private AngleDetectingPDF2XHTML(PDDocument document, ContentHandler handler,
+ private AngleDetectingPDF2XHTML(PDDocument document, XHTMLContentHandler xhtml,
ParseContext context, Metadata metadata,
PDFParserConfig config) throws IOException {
- super(document, handler, context, metadata, config);
+ super(document, xhtml, context, metadata, config);
}
@Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index 5ddf581d9..3e4e1bf64 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -41,12 +41,12 @@ import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructur
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.apache.pdfbox.text.TextPosition;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
* <p>This was added in Tika 1.24 as an alpha version of a text extractor
@@ -88,10 +88,10 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
//this stores state as we recurse through the structure tag tree
private State state = new State();
- private PDFMarkedContent2XHTML(PDDocument document, ContentHandler handler,
+ private PDFMarkedContent2XHTML(PDDocument document, XHTMLContentHandler xhtml,
ParseContext context, Metadata metadata, PDFParserConfig config)
throws IOException {
- super(document, handler, context, metadata, config);
+ super(document, xhtml, context, metadata, config);
}
/**
@@ -99,19 +99,20 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param pdDocument PDF document
- * @param handler SAX content handler
+ * @param xhtml SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument pdDocument, ContentHandler handler, ParseContext context,
+ public static void process(PDDocument pdDocument, XHTMLContentHandler xhtml,
+ ParseContext context,
Metadata metadata, PDFParserConfig config)
throws SAXException, TikaException {
PDFMarkedContent2XHTML pdfMarkedContent2XHTML = null;
try {
pdfMarkedContent2XHTML =
- new PDFMarkedContent2XHTML(pdDocument, handler, context, metadata, config);
+ new PDFMarkedContent2XHTML(pdDocument, xhtml, context, metadata, config);
} catch (IOException e) {
throw new TikaException("couldn't initialize PDFMarkedContent2XHTML", e);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 3835179b1..28f796157 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -54,6 +54,7 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.AccessPermissions;
import org.apache.tika.metadata.Metadata;
@@ -64,6 +65,13 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RenderingParser;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.pdf.PDFBoxRenderer;
+import org.apache.tika.renderer.pdf.PDFRenderingState;
import org.apache.tika.sax.XHTMLContentHandler;
/**
@@ -96,7 +104,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
* If your PDFs contain marked content or tags, consider
* {@link PDFParserConfig#setExtractMarkedContent(boolean)}
*/
-public class PDFParser extends AbstractParser implements Initializable {
+public class PDFParser extends AbstractParser implements RenderingParser, Initializable {
/**
* Metadata key for giving the document password to the parser.
@@ -105,7 +113,7 @@ public class PDFParser extends AbstractParser implements Initializable {
* @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
*/
public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
- private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+ protected static final MediaType MEDIA_TYPE = MediaType.application("pdf");
/**
* Serial version UID
*/
@@ -128,12 +136,20 @@ public class PDFParser extends AbstractParser implements Initializable {
if (localConfig.isSetKCMS()) {
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
}
-
+ initRenderer(localConfig);
PDDocument pdfDocument = null;
String password = "";
+ PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class);
try {
- TikaInputStream tstream = TikaInputStream.cast(stream);
+ TikaInputStream tstream;
+ if (shouldSpool(localConfig)) {
+ tstream = TikaInputStream.get(stream);
+ tstream.getPath();
+ context.set(PDFRenderingState.class, new PDFRenderingState(tstream));
+ } else {
+ tstream = TikaInputStream.cast(stream);
+ }
password = getPassword(metadata, context);
MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
if (localConfig.getMaxMainMemoryBytes() >= 0) {
@@ -149,41 +165,104 @@ public class PDFParser extends AbstractParser implements Initializable {
pdfDocument = getPDDocument(new CloseShieldInputStream(stream), password,
memoryUsageSetting, metadata, context);
}
- metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
-
- metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
+ boolean hasXFA = hasXFA(pdfDocument, metadata);
+ boolean hasMarkedContent = hasMarkedContent(pdfDocument, metadata);
extractMetadata(pdfDocument, metadata, context);
AccessChecker checker = localConfig.getAccessChecker();
checker.check(metadata);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ tstream.setOpenContainer(pdfDocument);
+ handleRendering(pdfDocument, tstream, xhtml, metadata, context, localConfig);
if (handler != null) {
- boolean hasXFA = hasXFA(pdfDocument);
- metadata.set(PDF.HAS_XFA, Boolean.toString(hasXFA));
- boolean hasMarkedContent = hasMarkedContent(pdfDocument);
- metadata.set(PDF.HAS_MARKED_CONTENT, Boolean.toString(hasMarkedContent));
- boolean hasCollection = hasCollection(pdfDocument);
- metadata.set(PDF.HAS_COLLECTION, Boolean.toString(hasCollection));
if (shouldHandleXFAOnly(hasXFA, localConfig)) {
- handleXFAOnly(pdfDocument, handler, metadata, context);
+ handleXFAOnly(pdfDocument, xhtml, metadata, context);
} else if (localConfig.getOcrStrategy()
.equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
- OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ OCR2XHTML.process(pdfDocument, xhtml, context, metadata, localConfig);
} else if (hasMarkedContent && localConfig.isExtractMarkedContent()) {
PDFMarkedContent2XHTML
- .process(pdfDocument, handler, context, metadata, localConfig);
+ .process(pdfDocument, xhtml, context, metadata,
+ localConfig);
} else {
- PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ PDF2XHTML.process(pdfDocument, xhtml, context, metadata,
+ localConfig);
}
}
} catch (InvalidPasswordException e) {
metadata.set(PDF.IS_ENCRYPTED, "true");
throw new EncryptedDocumentException(e);
} finally {
- if (pdfDocument != null) {
- pdfDocument.close();
+ PDFRenderingState currState = context.get(PDFRenderingState.class);
+ try {
+ if (currState != null && currState.getRenderResults() != null) {
+ currState.getRenderResults().close();
+ }
+ if (pdfDocument != null) {
+ pdfDocument.close();
+ }
+ } finally {
+ //replace the one that was here
+ context.set(PDFRenderingState.class, incomingRenderingState);
}
}
}
+ /**
+  * Decides whether {@code parse} should spool the incoming stream, i.e. wrap it with
+  * {@code TikaInputStream.get(stream)} and call {@code getPath()} before loading the PDF.
+  *
+  * @param localConfig the config in effect for the current parse
+  * @return {@code true} if pages are to be rendered or if the OCR strategy is anything
+  *         other than {@code NO_OCR}; {@code false} otherwise
+  */
+ private boolean shouldSpool(PDFParserConfig localConfig) {
+     //TODO: test that this is not AUTO with no OCR parser installed
+     return localConfig.getImageStrategy() == PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES ||
+             localConfig.getOcrStrategy() != PDFParserConfig.OCR_STRATEGY.NO_OCR;
+ }
+
+ /**
+  * Renders all pages of the PDF when the image strategy is {@code RENDERED_PAGES} and
+  * passes every successfully rendered page to the embedded document extractor.
+  * <p>
+  * A {@link SecurityException} is always rethrown; any other exception is recorded on
+  * the parent metadata and does not abort the parse.
+  *
+  * @param pdDocument     the loaded document (not used directly by this method)
+  * @param tstream        the stream handed to the renderer
+  * @param xhtml          handler that receives the embedded-document events
+  * @param parentMetadata metadata of the containing PDF; exceptions are recorded here
+  * @param context        parse context; must already hold a {@link PDFRenderingState}
+  * @param config         the config in effect for the current parse
+  */
+ private void handleRendering(PDDocument pdDocument, TikaInputStream tstream,
+                              ContentHandler xhtml, Metadata parentMetadata,
+                              ParseContext context,
+                              PDFParserConfig config) {
+     if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES) {
+         return;
+     }
+     RenderResults rendered;
+     try {
+         rendered = renderPDF(tstream, context, config);
+     } catch (SecurityException e) {
+         throw e;
+     } catch (Exception e) {
+         EmbeddedDocumentUtil.recordException(e, parentMetadata);
+         return;
+     }
+     //keep the results on the rendering state; parse() closes them in its finally block
+     context.get(PDFRenderingState.class).setRenderResults(rendered);
+     EmbeddedDocumentExtractor extractor =
+             EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+
+     for (RenderResult page : rendered.getResults()) {
+         if (page.getStatus() != RenderResult.STATUS.SUCCESS) {
+             continue;
+         }
+         if (!extractor.shouldParseEmbedded(page.getMetadata())) {
+             continue;
+         }
+         try (InputStream is = TikaInputStream.get(page.getPath())) {
+             extractor.parseEmbedded(is, xhtml, page.getMetadata(), false);
+         } catch (SecurityException e) {
+             throw e;
+         } catch (Exception e) {
+             EmbeddedDocumentUtil.recordException(e, parentMetadata);
+         }
+     }
+ }
+
+ /**
+  * Asks the configured renderer to render every page ({@code PageRangeRequest.RENDER_ALL}).
+  * A fresh {@link Metadata} object carrying only the PDF media type is passed to the renderer.
+  *
+  * @param tstream      the stream to render
+  * @param parseContext the parse context forwarded to the renderer
+  * @param localConfig  the config that supplies the renderer
+  * @return the results returned by the renderer
+  * @throws IOException   if the renderer throws it
+  * @throws TikaException if the renderer throws it
+  */
+ private RenderResults renderPDF(TikaInputStream tstream,
+                                 ParseContext parseContext, PDFParserConfig localConfig)
+         throws IOException, TikaException {
+     Metadata renderMetadata = new Metadata();
+     renderMetadata.set(TikaCoreProperties.TYPE, MEDIA_TYPE.toString());
+     Renderer renderer = localConfig.getRenderer();
+     return renderer.render(tstream, renderMetadata, parseContext,
+             PageRangeRequest.RENDER_ALL);
+ }
+
protected PDDocument getPDDocument(InputStream inputStream, String password,
MemoryUsageSetting memoryUsageSetting, Metadata metadata,
@@ -197,7 +276,14 @@ public class PDFParser extends AbstractParser implements Initializable {
return PDDocument.load(path.toFile(), password, memoryUsageSetting);
}
+ /**
+  * Determines whether the document has marked content and records the answer
+  * under {@code PDF.HAS_MARKED_CONTENT}.
+  *
+  * @return the value that was written to the metadata
+  */
+ private boolean hasMarkedContent(PDDocument pdDocument, Metadata metadata) {
+     final boolean marked = hasMarkedContent(pdDocument);
+     metadata.set(PDF.HAS_MARKED_CONTENT, marked);
+     return marked;
+ }
+
private boolean hasMarkedContent(PDDocument pdDocument) {
+ boolean hasMarkedContent;
PDStructureTreeRoot root = pdDocument.getDocumentCatalog().getStructureTreeRoot();
if (root == null) {
return false;
@@ -219,6 +305,12 @@ public class PDFParser extends AbstractParser implements Initializable {
return false;
}
+ /**
+  * Determines whether the document has a collection and records the answer
+  * under {@code PDF.HAS_COLLECTION}.
+  *
+  * @return the value that was written to the metadata
+  */
+ private boolean hasCollection(PDDocument pdDocument, Metadata metadata) {
+     final boolean collection = hasCollection(pdDocument);
+     metadata.set(PDF.HAS_COLLECTION, collection);
+     return collection;
+ }
+
private boolean hasCollection(PDDocument pdfDocument) {
COSDictionary cosDict = pdfDocument.getDocumentCatalog().getCOSObject();
if (cosDict.containsKey(COSName.COLLECTION)) {
@@ -251,6 +343,7 @@ public class PDFParser extends AbstractParser implements Initializable {
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
throws TikaException {
+ metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
//first extract AccessPermissions
AccessPermission ap = document.getCurrentAccessPermission();
@@ -265,6 +358,8 @@ public class PDFParser extends AbstractParser implements Initializable {
Boolean.toString(ap.canModifyAnnotations()));
metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));
+ hasCollection(document, metadata);
+ metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));
if (document.getDocumentCatalog().getLanguage() != null) {
metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
@@ -371,21 +466,22 @@ public class PDFParser extends AbstractParser implements Initializable {
}
- private boolean hasXFA(PDDocument pdDocument) {
- return pdDocument.getDocumentCatalog() != null &&
+ private boolean hasXFA(PDDocument pdDocument, Metadata metadata) {
+ boolean hasXFA = pdDocument.getDocumentCatalog() != null &&
pdDocument.getDocumentCatalog().getAcroForm(null) != null &&
pdDocument.getDocumentCatalog().getAcroForm(null).hasXFA();
+ metadata.set(PDF.HAS_XFA, Boolean.toString(hasXFA));
+ return hasXFA;
}
private boolean shouldHandleXFAOnly(boolean hasXFA, PDFParserConfig config) {
return config.isIfXFAExtractOnlyXFA() && hasXFA;
}
- private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata,
+ private void handleXFAOnly(PDDocument pdDocument, XHTMLContentHandler xhtml, Metadata metadata,
ParseContext context)
throws SAXException, IOException, TikaException {
XFAExtractor ex = new XFAExtractor();
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try (InputStream is = new ByteArrayInputStream(
pdDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes())) {
@@ -622,6 +718,27 @@ public class PDFParser extends AbstractParser implements Initializable {
//no-op
}
+ /**
+  * Installs a default {@link PDFBoxRenderer} on the given config if it has no renderer yet.
+  * <p>
+  * The default renderer takes its DPI, image type and image format name from the OCR
+  * settings of the config being initialized, so that values overridden for the current
+  * parse are honored rather than silently replaced by the parser-wide defaults.
+  *
+  * @param config the config in effect for the current parse; modified in place
+  */
+ private void initRenderer(PDFParserConfig config) {
+     if (config.getRenderer() != null) {
+         return;
+     }
+     //set a default renderer if nothing was defined
+     PDFBoxRenderer pdfBoxRenderer = new PDFBoxRenderer();
+     pdfBoxRenderer.setDPI(config.getOcrDPI());
+     pdfBoxRenderer.setImageType(config.getOcrImageType());
+     pdfBoxRenderer.setImageFormatName(config.getOcrImageFormatName());
+     config.setRenderer(pdfBoxRenderer);
+ }
+
+ /**
+  * Stores the renderer on this parser's default config.
+  *
+  * @param renderer the renderer to store
+  */
+ @Override
+ public void setRenderer(Renderer renderer) {
+     defaultConfig.setRenderer(renderer);
+ }
+
+ /**
+  * Sets the image strategy on this parser's default config from its string form.
+  *
+  * @param imageStrategy strategy name, forwarded to
+  *                      {@link PDFParserConfig#setImageStrategy(String)}
+  */
+ public void setImageStrategy(String imageStrategy) {
+     defaultConfig.setImageStrategy(imageStrategy);
+ }
+
/**
* Copied from AcroformDefaultFixup minus generation of appearances and handling of orphan
* widgets, which we don't need.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index af0e9617b..fb8a315ae 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -21,6 +21,7 @@ import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.util.HashSet;
import java.util.Locale;
+import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -29,6 +30,7 @@ import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.renderer.Renderer;
/**
* Config for PDFParser.
@@ -112,6 +114,10 @@ public class PDFParserConfig implements Serializable {
private String ocrImageFormatName = "png";
private float ocrImageQuality = 1.0f;
+ /**
+ * Should the entire document be rendered?
+ */
+ private IMAGE_STRATEGY imageStrategy = IMAGE_STRATEGY.NONE;
private AccessChecker accessChecker = new AccessChecker();
//The PDFParser can throw IOExceptions if there is a problem
@@ -130,6 +136,8 @@ public class PDFParserConfig implements Serializable {
private boolean detectAngles = false;
+ private Renderer renderer;
+
/**
* @return whether or not to extract only inline image metadata and not render the images
*/
@@ -791,114 +799,76 @@ public class PDFParserConfig implements Serializable {
if (this == o) {
return true;
}
- if (!(o instanceof PDFParserConfig)) {
+ if (o == null || getClass() != o.getClass()) {
return false;
}
-
PDFParserConfig config = (PDFParserConfig) o;
-
- if (isEnableAutoSpace() != config.isEnableAutoSpace()) {
- return false;
- }
- if (isSuppressDuplicateOverlappingText() != config.isSuppressDuplicateOverlappingText()) {
- return false;
- }
- if (isExtractAnnotationText() != config.isExtractAnnotationText()) {
- return false;
- }
- if (isSortByPosition() != config.isSortByPosition()) {
- return false;
- }
- if (isExtractAcroFormContent() != config.isExtractAcroFormContent()) {
- return false;
- }
- if (isExtractBookmarksText() != config.isExtractBookmarksText()) {
- return false;
- }
- if (isExtractInlineImages() != config.isExtractInlineImages()) {
- return false;
- }
- if (isExtractUniqueInlineImagesOnly() != config.isExtractUniqueInlineImagesOnly()) {
- return false;
- }
- if (isIfXFAExtractOnlyXFA() != config.isIfXFAExtractOnlyXFA()) {
- return false;
- }
- if (getOcrDPI() != config.getOcrDPI()) {
- return false;
- }
- if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) {
- return false;
- }
- if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) {
- return false;
- }
- if (!getSpacingTolerance().equals(config.getSpacingTolerance())) {
- return false;
- }
- if (!getDropThreshold().equals(config.getDropThreshold())) {
- return false;
- }
- if (!getOcrStrategy().equals(config.getOcrStrategy())) {
- return false;
- }
- if (getOcrImageType() != config.getOcrImageType()) {
- return false;
- }
- if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) {
- return false;
- }
- if (isExtractActions() != config.isExtractActions()) {
- return false;
- }
- if (!getAccessChecker().equals(config.getAccessChecker())) {
- return false;
- }
- return getMaxMainMemoryBytes() == config.getMaxMainMemoryBytes();
+ return enableAutoSpace == config.enableAutoSpace &&
+ suppressDuplicateOverlappingText == config.suppressDuplicateOverlappingText &&
+ extractAnnotationText == config.extractAnnotationText &&
+ sortByPosition == config.sortByPosition &&
+ extractAcroFormContent == config.extractAcroFormContent &&
+ extractBookmarksText == config.extractBookmarksText &&
+ extractInlineImages == config.extractInlineImages &&
+ extractInlineImageMetadataOnly == config.extractInlineImageMetadataOnly &&
+ extractUniqueInlineImagesOnly == config.extractUniqueInlineImagesOnly &&
+ extractMarkedContent == config.extractMarkedContent &&
+ Float.compare(config.dropThreshold, dropThreshold) == 0 &&
+ ifXFAExtractOnlyXFA == config.ifXFAExtractOnlyXFA && ocrDPI == config.ocrDPI &&
+ Float.compare(config.ocrImageQuality, ocrImageQuality) == 0 &&
+ catchIntermediateIOExceptions == config.catchIntermediateIOExceptions &&
+ extractActions == config.extractActions &&
+ extractFontNames == config.extractFontNames &&
+ maxMainMemoryBytes == config.maxMainMemoryBytes && setKCMS == config.setKCMS &&
+ detectAngles == config.detectAngles &&
+ Objects.equals(userConfigured, config.userConfigured) &&
+ Objects.equals(averageCharTolerance, config.averageCharTolerance) &&
+ Objects.equals(spacingTolerance, config.spacingTolerance) &&
+ ocrStrategy == config.ocrStrategy &&
+ Objects.equals(ocrStrategyAuto, config.ocrStrategyAuto) &&
+ ocrRenderingStrategy == config.ocrRenderingStrategy &&
+ ocrImageType == config.ocrImageType &&
+ Objects.equals(ocrImageFormatName, config.ocrImageFormatName) &&
+ imageStrategy == config.imageStrategy &&
+ Objects.equals(accessChecker, config.accessChecker) &&
+ Objects.equals(renderer, config.renderer);
}
@Override
public int hashCode() {
- int result = (isEnableAutoSpace() ? 1 : 0);
- result = 31 * result + (isSuppressDuplicateOverlappingText() ? 1 : 0);
- result = 31 * result + (isExtractAnnotationText() ? 1 : 0);
- result = 31 * result + (isSortByPosition() ? 1 : 0);
- result = 31 * result + (isExtractAcroFormContent() ? 1 : 0);
- result = 31 * result + (isExtractBookmarksText() ? 1 : 0);
- result = 31 * result + (isExtractInlineImages() ? 1 : 0);
- result = 31 * result + (isExtractUniqueInlineImagesOnly() ? 1 : 0);
- result = 31 * result + getAverageCharTolerance().hashCode();
- result = 31 * result + getSpacingTolerance().hashCode();
- result = 31 * result + getDropThreshold().hashCode();
- result = 31 * result + (isIfXFAExtractOnlyXFA() ? 1 : 0);
- result = 31 * result + ocrStrategy.hashCode();
- result = 31 * result + getOcrDPI();
- result = 31 * result + getOcrImageType().hashCode();
- result = 31 * result + getOcrImageFormatName().hashCode();
- result = 31 * result + getAccessChecker().hashCode();
- result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
- result = 31 * result + (isExtractActions() ? 1 : 0);
- result = 31 * result + Long.valueOf(getMaxMainMemoryBytes()).hashCode();
- return result;
+ return Objects.hash(userConfigured, enableAutoSpace, suppressDuplicateOverlappingText,
+ extractAnnotationText, sortByPosition, extractAcroFormContent, extractBookmarksText,
+ extractInlineImages, extractInlineImageMetadataOnly, extractUniqueInlineImagesOnly,
+ extractMarkedContent, averageCharTolerance, spacingTolerance, dropThreshold,
+ ifXFAExtractOnlyXFA, ocrStrategy, ocrStrategyAuto, ocrRenderingStrategy, ocrDPI,
+ ocrImageType, ocrImageFormatName, ocrImageQuality, imageStrategy, accessChecker,
+ catchIntermediateIOExceptions, extractActions, extractFontNames, maxMainMemoryBytes,
+ setKCMS, detectAngles, renderer);
}
- @Override
- public String toString() {
- return "PDFParserConfig{" + "enableAutoSpace=" + enableAutoSpace +
- ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText +
- ", extractAnnotationText=" + extractAnnotationText + ", sortByPosition=" +
- sortByPosition + ", extractAcroFormContent=" + extractAcroFormContent +
- ", extractBookmarksText=" + extractBookmarksText + ", extractInlineImages=" +
- extractInlineImages + ", extractUniqueInlineImagesOnly=" +
- extractUniqueInlineImagesOnly + ", averageCharTolerance=" + averageCharTolerance +
- ", spacingTolerance=" + spacingTolerance + ", dropThreshold=" + dropThreshold +
- ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + ", ocrStrategy=" + ocrStrategy +
- ", ocrDPI=" + ocrDPI + ", ocrImageType=" + ocrImageType + ", ocrImageFormatName='" +
- ocrImageFormatName + '\'' + ", accessChecker=" + accessChecker +
- ", extractActions=" + extractActions + ", catchIntermediateIOExceptions=" +
- catchIntermediateIOExceptions + ", maxMainMemoryBytes=" + maxMainMemoryBytes + '}';
+ public void setRenderer(Renderer renderer) {
+ this.renderer = renderer;
}
+ /**
+  * @return the renderer stored on this config, or {@code null} if none has been set
+  */
+ public Renderer getRenderer() {
+     return this.renderer;
+ }
+
+ /**
+  * Parses the given name with {@code IMAGE_STRATEGY.parse} and applies the result.
+  *
+  * @param imageStrategy name of the strategy
+  * @throws IllegalArgumentException if the name is not recognized
+  */
+ public void setImageStrategy(String imageStrategy) {
+     IMAGE_STRATEGY parsed = PDFParserConfig.IMAGE_STRATEGY.parse(imageStrategy);
+     setImageStrategy(parsed);
+ }
+
+ /**
+  * Sets the image strategy and marks {@code imageStrategy} as user configured.
+  *
+  * @param imageStrategy the strategy to use
+  */
+ public void setImageStrategy(IMAGE_STRATEGY imageStrategy) {
+     this.imageStrategy = imageStrategy;
+     //remember that this value was set explicitly
+     userConfigured.add("imageStrategy");
+ }
+
+ /**
+  * @return the image strategy currently set on this config
+  */
+ public IMAGE_STRATEGY getImageStrategy() {
+     return this.imageStrategy;
+ }
+
+
+
public enum OCR_STRATEGY {
AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION;
@@ -960,18 +930,22 @@ public class PDFParserConfig implements Serializable {
}
public enum OCR_RENDERING_STRATEGY {
- NO_TEXT, ALL; //AUTO?
- // Would TEXT_ONLY be useful in instances where the unicode mappings
- // are corrupt/non-existent?
+ NO_TEXT, TEXT_ONLY, ALL; //AUTO?
private static OCR_RENDERING_STRATEGY parse(String s) {
if (s == null) {
- return NO_TEXT;
- } else if ("no_text".equals(s.toLowerCase(Locale.ROOT))) {
- return NO_TEXT;
- } else if ("all".equals(s.toLowerCase(Locale.ROOT))) {
return ALL;
}
+ String lc = s.toLowerCase(Locale.US);
+ switch (lc) {
+ case "text_only":
+ return TEXT_ONLY;
+ case "no_text":
+ return NO_TEXT;
+ case "all":
+ return ALL;
+ }
+
StringBuilder sb = new StringBuilder();
sb.append("I regret that I don't recognize '").append(s);
sb.append("' as an OCR_STRATEGY. I only recognize:");
@@ -986,4 +960,34 @@ public class PDFParserConfig implements Serializable {
throw new IllegalArgumentException(sb.toString());
}
}
+
+ /**
+  * Image handling strategies. {@code RENDERED_PAGES} is the value the PDFParser checks
+  * before rendering whole pages.
+  */
+ public enum IMAGE_STRATEGY {
+     NONE, RAW_IMAGES, RENDERED_PAGES;//TODO: add LOGICAL_IMAGES
+
+     /**
+      * Parses a strategy name, ignoring case. Both the camel-case spellings
+      * ({@code rawImages}, {@code renderedPages}) and the enum constant names
+      * ({@code RAW_IMAGES}, {@code RENDERED_PAGES}, {@code NONE}) are accepted.
+      *
+      * @param s the name to parse
+      * @return the matching strategy
+      * @throws IllegalArgumentException if {@code s} is {@code null} or not recognized
+      */
+     private static IMAGE_STRATEGY parse(String s) {
+         if (s != null) {
+             //the input is lower-cased before the comparison, so the labels
+             //must be lower case as well or they can never match
+             String lc = s.toLowerCase(Locale.US);
+             switch (lc) {
+                 case "rawimages":
+                 case "raw_images":
+                     return RAW_IMAGES;
+                 case "renderedpages":
+                 case "rendered_pages":
+                     return RENDERED_PAGES;
+                 case "none":
+                     return NONE;
+                 default:
+                     //fall through to exception
+                     break;
+             }
+         }
+         StringBuilder sb = new StringBuilder();
+         sb.append("I regret that I don't recognize '").append(s);
+         sb.append("' as an IMAGE_STRATEGY. I only recognize:");
+         int i = 0;
+         for (IMAGE_STRATEGY strategy : IMAGE_STRATEGY.values()) {
+             if (i++ > 0) {
+                 sb.append(", ");
+             }
+             sb.append(strategy.toString());
+         }
+         throw new IllegalArgumentException(sb.toString());
+     }
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java
new file mode 100644
index 000000000..f282d124c
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.Graphics2D;
+import java.awt.geom.Point2D;
+import java.io.IOException;
+
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.rendering.PageDrawer;
+import org.apache.pdfbox.rendering.PageDrawerParameters;
+
+/**
+ * This class extends the PDFRenderer to render only the textual
+ * elements
+ */
+public class TextOnlyPDFRenderer extends PDFRenderer {
+
+ public TextOnlyPDFRenderer(PDDocument document) {
+ super(document);
+ }
+
+ /**
+ * Returns a new PageDrawer instance, using the given parameters. May be overridden.
+ */
+ protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
+ PageDrawer pageDrawer = new TextOnlyPageDrawer(parameters);
+ pageDrawer.setAnnotationFilter(getAnnotationsFilter());
+ return pageDrawer;
+ }
+
+ private class TextOnlyPageDrawer extends PageDrawer {
+ public TextOnlyPageDrawer(PageDrawerParameters parameters) throws IOException {
+ super(parameters);
+ }
+
+ @Override
+ protected void transferClip(Graphics2D graphics) {
+
+ }
+
+ @Override
+ public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
+
+ }
+
+ @Override
+ public void strokePath() throws IOException {
+
+ }
+
+ @Override
+ public void fillPath(int windingRule) throws IOException {
+ }
+
+ @Override
+ public void fillAndStrokePath(int windingRule) throws IOException {
+ }
+
+ @Override
+ public void clip(int windingRule) {
+ }
+
+ @Override
+ public void lineTo(float x, float y) {
+ }
+
+ @Override
+ public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
+ }
+
+ @Override
+ public void closePath() {
+ }
+
+ @Override
+ public void endPath() {
+ }
+
+ @Override
+ public void drawImage(PDImage pdImage) throws IOException {
+
+ }
+
+ @Override
+ public void shadingFill(COSName shadingName) throws IOException {
+ }
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java
new file mode 100644
index 000000000..983934677
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Rendering;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.PageBasedRenderResults;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.RenderingTracker;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+ /**
+  * Renderer that runs the external {@code mutool convert} command to turn the pages
+  * of a PDF into PNG files in a temporary directory.
+  * <p>
+  * Only {@link PageRangeRequest}s are supported.
+  */
+ public class MuPDFRenderer implements Renderer {
+
+     Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("pdf"));
+
+     //TODO -- fix this
+     //compiled once; matches the file names requested from mutool via the -o template
+     private static final Pattern RENDERED_PAGE_NAME =
+             Pattern.compile("tika-mutool-render-(\\d+)\\.png");
+
+     @Override
+     public Set<MediaType> getSupportedTypes(ParseContext context) {
+         return SUPPORTED_TYPES;
+     }
+
+     /**
+      * Renders the requested pages of the PDF in the given stream.
+      * <p>
+      * The returned results own the temporary files; callers must close them. If rendering
+      * fails, the temporary resources created so far are released before the exception
+      * is propagated.
+      *
+      * @param is           stream containing the PDF
+      * @param metadata     metadata passed through to each request
+      * @param parseContext context; a {@link RenderingTracker} is added if none is present
+      * @param requests     the requests to process; each must be a {@link PageRangeRequest}
+      * @return the accumulated render results
+      * @throws IOException   on I/O failure
+      * @throws TikaException if a request is not a {@link PageRangeRequest} or if the
+      *                       mutool process exits with a non-zero value
+      */
+     @Override
+     public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+                                 RenderRequest... requests) throws IOException, TikaException {
+         TemporaryResources tmp = new TemporaryResources();
+         PageBasedRenderResults results = new PageBasedRenderResults(tmp);
+         try {
+             Path path = TikaInputStream.get(is, tmp).getPath();
+             for (RenderRequest request : requests) {
+                 renderRequest(path, metadata, parseContext, request, results, tmp);
+             }
+             return results;
+         } catch (IOException | TikaException | RuntimeException e) {
+             //nobody else will get a handle on the results, so clean up here
+             try {
+                 results.close();
+             } catch (Exception closeFailure) {
+                 e.addSuppressed(closeFailure);
+             }
+             throw e;
+         }
+     }
+
+     /**
+      * Runs mutool for a single request and adds one {@link RenderResult} per rendered
+      * page file found in the output directory.
+      */
+     private RenderResults renderRequest(Path pdf, Metadata metadata, ParseContext parseContext,
+                                         RenderRequest request, RenderResults results,
+                                         TemporaryResources tmp)
+             throws TikaException, IOException {
+         if (!(request instanceof PageRangeRequest)) {
+             throw new TikaException("I regret that this renderer can only handle " +
+                     "PageRangeRequests, not " + request.getClass());
+         }
+         PageRangeRequest rangeRequest = (PageRangeRequest) request;
+         RenderingTracker tracker = parseContext.get(RenderingTracker.class);
+         if (tracker == null) {
+             tracker = new RenderingTracker();
+             parseContext.set(RenderingTracker.class, tracker);
+         }
+
+         final Path dir = Files.createTempDirectory("tika-render-");
+         tmp.addResource(new Closeable() {
+             @Override
+             public void close() throws IOException {
+                 deleteDirectory(dir);
+             }
+         });
+         //TODO -- run mutool pages to get page sizes
+         //and then use that information in the -O to get proper scaling
+         //etc.
+         // This would also allow us to run on a single page at a time if that's of any interest
+         String[] args = createCommandLine(pdf, dir, rangeRequest);
+
+         ProcessBuilder builder = new ProcessBuilder();
+         builder.command(args);
+         //TODO: parameterize timeout
+         FileProcessResult result = ProcessUtils.execute(builder, 60000, 10, 1000);
+         if (result.getExitValue() != 0) {
+             throw new TikaException(result.getStderr());
+         }
+         //listFiles() returns null if the directory cannot be listed
+         File[] rendered = dir.toFile().listFiles();
+         if (rendered == null) {
+             throw new IOException("Couldn't list rendered pages in " + dir);
+         }
+         Matcher m = RENDERED_PAGE_NAME.matcher("");
+         for (File f : rendered) {
+             String n = f.getName();
+             if (m.reset(n).find()) {
+                 int pageIndex = Integer.parseInt(m.group(1));
+                 Metadata renderMetadata = new Metadata();
+                 renderMetadata.set(Rendering.PAGE_NUMBER, pageIndex);
+                 renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                         TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
+                 results.add(new RenderResult(RenderResult.STATUS.SUCCESS, tracker.getNextId(),
+                         f.toPath(), renderMetadata));
+             }
+         }
+
+         return results;
+     }
+
+     /**
+      * Deletes the directory together with any files still left in it, e.g. partial
+      * output of a failed run. Files that are already gone are ignored.
+      */
+     private static void deleteDirectory(Path dir) throws IOException {
+         File[] leftovers = dir.toFile().listFiles();
+         if (leftovers != null) {
+             for (File leftover : leftovers) {
+                 Files.deleteIfExists(leftover.toPath());
+             }
+         }
+         Files.deleteIfExists(dir);
+     }
+
+     /**
+      * Builds the argument vector for {@code mutool convert}.
+      * When the request is not {@code PageRangeRequest.RENDER_ALL}, the pages from
+      * {@code getFrom()} to {@code getTo()} (inclusive) are appended as a comma-separated list.
+      */
+     private String[] createCommandLine(Path pdf, Path dir, PageRangeRequest request) {
+         //TODO parameterize all the things; mutool path, colorspace and size and format and...
+         List<String> args = new ArrayList<>();
+         args.add("mutool");
+         args.add("convert");
+         //ProcessBuilder passes each element as its own argument without shell
+         //splitting, so the flag and its value must be separate elements
+         args.add("-O");
+         args.add("colorspace=gray");
+         args.add("-o");
+         args.add(
+                 ProcessUtils.escapeCommandLine(
+                         dir.toAbsolutePath().toString() + "/" + "tika-mutool-render-%d.png"));
+         args.add(ProcessUtils.escapeCommandLine(pdf.toAbsolutePath().toString()));
+         if (request != PageRangeRequest.RENDER_ALL) {
+             StringBuilder sb = new StringBuilder();
+             int cnt = 0;
+             for (int i = request.getFrom(); i <= request.getTo(); i++) {
+                 if (cnt++ > 0) {
+                     sb.append(",");
+                 }
+                 sb.append(i);
+             }
+             args.add(sb.toString());
+         }
+         return args.toArray(new String[0]);
+     }
+ }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
new file mode 100644
index 000000000..7cecd9a23
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import org.apache.tika.renderer.Renderer;
+
+/**
+ * Stub (marker) interface: it declares no methods of its own beyond what it
+ * inherits from {@link Renderer}. It exists for the PDFParser to use to figure
+ * out if it needs to pass on the PDDocument or create a temp file to be used
+ * by a file-based renderer down the road.
+ */
+public interface PDDocumentRenderer extends Renderer {
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
new file mode 100644
index 000000000..e5c5d8973
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import java.awt.image.BufferedImage;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.Rendering;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.PageBasedRenderResults;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.RenderingTracker;
+
+/**
+ * A {@link PDDocumentRenderer} that rasterizes PDF pages with PDFBox and writes every
+ * rendered page to its own temporary image file.
+ * <p>
+ * Unless changed through the setters, pages are rendered at 300 dpi as
+ * {@link ImageType#GRAY} and written in the "tiff" format. Page numbers in requests and
+ * in the {@link Rendering#PAGE_NUMBER} metadata are 1-based.
+ */
+public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
+
+    /**
+     * This is the amount of time it takes for PDFBox to render the page
+     */
+    public static final Property PDFBOX_RENDERING_TIME_MS =
+            Property.externalReal(Rendering.RENDERING_PREFIX + "pdfbox-rendering-ms");
+
+    /**
+     * This is the amount of time it takes for PDFBox/java to write the image after
+     * it has been rendered into a BufferedImage. Some formats take much longer
+     * to encode than others.
+     */
+    public static final Property PDFBOX_IMAGE_WRITING_TIME_MS =
+            Property.externalReal(Rendering.RENDERING_PREFIX + "pdfbox-image-writing-ms");
+
+    /**
+     * The media types this renderer handles: application/pdf only. Shared by all instances.
+     */
+    static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("pdf"));
+
+    private int dpi = 300;
+    private ImageType imageType = ImageType.GRAY;
+    private String imageFormatName = "tiff";
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Renders the requested pages of a PDF.
+     * <p>
+     * If the stream is a {@link TikaInputStream} whose open container is a
+     * {@link PDDocument}, that document is reused and left open. Otherwise the
+     * document is loaded from the stream and closed before this method returns.
+     *
+     * @param is           the PDF
+     * @param metadata     metadata of the document being rendered
+     * @param parseContext context; a {@link RenderingTracker} is read from it, or created
+     *                     and registered in it if absent
+     * @param requests     what to render; requests that are not {@link PageRangeRequest}s
+     *                     are ignored
+     * @return one result per page that was attempted
+     * @throws IOException if the document cannot be loaded or closed
+     */
+    @Override
+    public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+                                RenderRequest... requests) throws IOException, TikaException {
+
+        TikaInputStream tis = TikaInputStream.get(is);
+        Object openContainer = tis.getOpenContainer();
+        PDDocument pdDocument;
+        boolean mustClose = false;
+        // Only reuse the container when it really is a PDDocument; a blind cast would
+        // fail with a ClassCastException on any other container type.
+        if (openContainer instanceof PDDocument) {
+            pdDocument = (PDDocument) openContainer;
+        } else {
+            pdDocument = PDDocument.load(is);
+            mustClose = true;
+        }
+        try {
+            PageBasedRenderResults results =
+                    new PageBasedRenderResults(new TemporaryResources());
+            for (RenderRequest renderRequest : requests) {
+                processRequest(renderRequest, pdDocument, metadata, parseContext, results);
+            }
+            return results;
+        } finally {
+            // never close a document that is owned by the caller's stream
+            if (mustClose) {
+                pdDocument.close();
+            }
+        }
+    }
+
+    /**
+     * Translates a single request into a page range and renders it. RENDER_ALL covers
+     * pages 1 through the document's page count; any other {@link PageRangeRequest}
+     * supplies its own inclusive bounds.
+     */
+    private void processRequest(RenderRequest renderRequest, PDDocument pdDocument,
+                                Metadata metadata, ParseContext parseContext,
+                                PageBasedRenderResults results) {
+        if (renderRequest == PageRangeRequest.RENDER_ALL ||
+                renderRequest.equals(PageRangeRequest.RENDER_ALL)) {
+            renderRange(pdDocument, 1, pdDocument.getNumberOfPages(),
+                    metadata, parseContext, results);
+        } else if (renderRequest instanceof PageRangeRequest) {
+            PageRangeRequest range = (PageRangeRequest) renderRequest;
+            renderRange(pdDocument, range.getFrom(), range.getTo(), metadata, parseContext,
+                    results);
+        }
+    }
+
+    /**
+     * Renders pages {@code start} to {@code endInclusive} (1-based) and adds one
+     * {@link RenderResult} per page. A page that fails with an IOException is recorded
+     * with status EXCEPTION and does not stop the remaining pages.
+     */
+    private void renderRange(PDDocument pdDocument, int start, int endInclusive, Metadata metadata,
+                             ParseContext parseContext, PageBasedRenderResults results) {
+        PDFRenderer renderer = new PDFRenderer(pdDocument);
+        RenderingTracker tracker = parseContext.get(RenderingTracker.class);
+        if (tracker == null) {
+            tracker = new RenderingTracker();
+            parseContext.set(RenderingTracker.class, tracker);
+        }
+        for (int i = start; i <= endInclusive; i++) {
+            int id = tracker.getNextId();
+            Metadata m = new Metadata();
+            m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                    TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
+            try {
+                m.set(Rendering.PAGE_NUMBER, i);
+                Path imagePath = renderPage(renderer, id, i, m);
+                results.add(new RenderResult(RenderResult.STATUS.SUCCESS, id, imagePath, m));
+            } catch (IOException e) {
+                EmbeddedDocumentUtil.recordException(e, m);
+                results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m));
+            }
+        }
+    }
+
+    /**
+     * Renders one page into a new temporary file and records the timings in
+     * {@code metadata}.
+     *
+     * @param renderer   the PDFBox renderer for the document
+     * @param id         rendering id, used in the temp file name
+     * @param pageNumber 1-based page number; converted to PDFBox's 0-based index
+     * @param metadata   receives the rendering and image-writing times
+     * @return the path of the image that was written
+     * @throws IOException if rendering or writing fails, including when no image could be
+     *                     produced for the configured format; the temp file is deleted
+     *                     before the exception is thrown
+     */
+    private Path renderPage(PDFRenderer renderer, int id, int pageNumber, Metadata metadata)
+            throws IOException {
+
+        Path tmpFile = Files.createTempFile("tika-pdfbox-rendering-",
+                "-" + id + "-" + pageNumber + "." + imageFormatName);
+        try {
+            long start = System.currentTimeMillis();
+            BufferedImage image = renderer.renderImageWithDPI(pageNumber - 1, dpi, imageType);
+            long renderingElapsed = System.currentTimeMillis() - start;
+            metadata.set(PDFBOX_RENDERING_TIME_MS, renderingElapsed);
+            start = System.currentTimeMillis();
+            boolean written;
+            try (OutputStream os = Files.newOutputStream(tmpFile)) {
+                written = ImageIOUtil.writeImage(image, imageFormatName, os, dpi);
+            }
+            if (!written) {
+                // writeImage signals failure through its return value; without this check
+                // an empty file would be reported as a successful rendering
+                throw new IOException(
+                        "Could not write image with format '" + imageFormatName + "'");
+            }
+            long elapsedWrite = System.currentTimeMillis() - start;
+            metadata.set(PDFBOX_IMAGE_WRITING_TIME_MS, elapsedWrite);
+            metadata.set(Rendering.RENDERED_MS, renderingElapsed + elapsedWrite);
+        } catch (SecurityException e) {
+            //throw SecurityExceptions immediately
+            deleteAfterFailure(tmpFile, e);
+            throw e;
+        } catch (IOException | RuntimeException e) {
+            deleteAfterFailure(tmpFile, e);
+            throw new IOExceptionWithCause(e);
+        }
+        return tmpFile;
+    }
+
+    /**
+     * Best-effort removal of a temp file whose rendering failed, so that failed pages do
+     * not leave files behind. A failure to delete is attached to {@code failure} as a
+     * suppressed exception rather than replacing it.
+     */
+    private static void deleteAfterFailure(Path tmpFile, Exception failure) {
+        try {
+            Files.deleteIfExists(tmpFile);
+        } catch (IOException e) {
+            failure.addSuppressed(e);
+        }
+    }
+
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        //check file format names
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+
+    }
+
+    /**
+     * @param dpi resolution passed to PDFBox for rendering and for writing the image
+     */
+    public void setDPI(int dpi) {
+        this.dpi = dpi;
+    }
+
+    /**
+     * @param imageType PDFBox image type (color model) used when rendering a page
+     */
+    public void setImageType(ImageType imageType) {
+        this.imageType = imageType;
+    }
+
+    /**
+     * @param imageFormatName format name handed to the image writer; it is also used as
+     *                        the temp file extension
+     */
+    public void setImageFormatName(String imageFormatName) {
+        this.imageFormatName = imageFormatName;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
new file mode 100644
index 000000000..51ea0ae96
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.RenderingState;
+
+/**
+ * {@link RenderingState} for a PDF. It holds the {@link TikaInputStream} supplied at
+ * construction and the {@link RenderResults} assigned later through
+ * {@link #setRenderResults(RenderResults)}.
+ */
+public class PDFRenderingState extends RenderingState {
+
+    /** Render results; {@code null} until {@link #setRenderResults} is called. */
+    private RenderResults renderResults;
+
+    /** The stream handed to the constructor. */
+    private TikaInputStream tis;
+
+    /**
+     * @param tis the stream to keep with this state
+     */
+    public PDFRenderingState(TikaInputStream tis) {
+        this.tis = tis;
+    }
+
+    /**
+     * @return the render results, or {@code null} if none have been set
+     */
+    public RenderResults getRenderResults() {
+        return this.renderResults;
+    }
+
+    /**
+     * @param renderResults the results to store, replacing any earlier value
+     */
+    public void setRenderResults(RenderResults renderResults) {
+        this.renderResults = renderResults;
+    }
+
+    /**
+     * @return the stream passed to the constructor
+     */
+    public TikaInputStream getTikaInputStream() {
+        return this.tis;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
new file mode 100644
index 000000000..ed61e2a02
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Rendering;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Checks that, with the PDFBoxRenderer configured and the image strategy set to
+ * RENDERED_PAGES, a rendered page reaches the embedded document extractor and shows up
+ * in the recursive metadata list.
+ */
+public class PDFRenderingTest extends TikaTest {
+
+
+    @Test
+    public void testBasic() throws Exception {
+        ParseContext parseContext = configureParseContext();
+        TikaConfig config = getConfig("tika-rendering-config.xml");
+        Parser p = new AutoDetectParser(config);
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p, parseContext);
+        Map<Integer, byte[]> embedded =
+                ((RenderCaptureExtractor)parseContext.get(EmbeddedDocumentExtractor.class))
+                        .getEmbedded();
+
+        assertEquals(1, embedded.size());
+        assertTrue(embedded.containsKey(0));
+        //what else can we do to test this? File type == tiff? Run OCR?
+        assertTrue(embedded.get(0).length > 1000);
+
+        //container document plus the single rendering
+        assertEquals(2, metadataList.size());
+        Metadata tiffMetadata = metadataList.get(1);
+        assertEquals("RENDERING", tiffMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertEquals(1, tiffMetadata.getInt(Rendering.PAGE_NUMBER));
+    }
+
+    /**
+     * Loads a TikaConfig from a classpath resource resolved relative to this class.
+     */
+    private TikaConfig getConfig(String path) throws TikaException, IOException, SAXException {
+        try (InputStream is = PDFRenderingTest.class.getResourceAsStream(path)) {
+            return new TikaConfig(is);
+        }
+    }
+
+    /**
+     * Builds a ParseContext with the capturing extractor and a PDFParserConfig whose
+     * image strategy is RENDERED_PAGES.
+     */
+    private ParseContext configureParseContext() {
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(EmbeddedDocumentExtractor.class, new RenderCaptureExtractor(parseContext));
+        PDFParserConfig config = new PDFParserConfig();
+        config.setImageStrategy(PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES);
+        parseContext.set(PDFParserConfig.class, config);
+        return parseContext;
+    }
+
+
+    /**
+     * Extractor that stores the bytes of every embedded stream it is given, keyed by
+     * arrival order (starting at 0), and then delegates to the normal parsing extractor.
+     * Static because it never touches the enclosing test instance.
+     */
+    private static class RenderCaptureExtractor extends ParsingEmbeddedDocumentExtractor {
+        private int count = 0;
+        private final Map<Integer, byte[]> embedded = new HashMap<>();
+
+        public RenderCaptureExtractor(ParseContext context) {
+            super(context);
+        }
+
+        @Override
+        public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata,
+                                  boolean outputHtml) throws SAXException, IOException {
+            TikaInputStream tstream = TikaInputStream.get(stream);
+            byte[] bytes = Files.readAllBytes(tstream.getPath());
+            embedded.put(count++, bytes);
+            //re-open the file so the delegate gets a fresh stream to parse
+            try (InputStream is = Files.newInputStream(tstream.getPath())) {
+                super.parseEmbedded(is, handler, metadata, outputHtml);
+            }
+        }
+
+        public Map<Integer, byte[]> getEmbedded() {
+            return embedded;
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
new file mode 100644
index 000000000..5b1351662
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <renderers>
+ <renderer class="org.apache.tika.renderer.pdf.PDFBoxRenderer"/>
+ </renderers>
+</properties>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
index 111825101..00cccbdc9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
@@ -26,14 +26,20 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
public class TSDParserTest extends TikaTest {
@Test
public void testBrokenPdf() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ parseContext.set(PDFParserConfig.class, config);
//make sure that embedded file appears in list
//and make sure embedded exception is recorded
- List<Metadata> list = getRecursiveMetadata("testTSD_broken_pdf.tsd");
+ List<Metadata> list = getRecursiveMetadata("testTSD_broken_pdf.tsd", parseContext);
assertEquals(2, list.size());
assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
assertNotNull(list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 8c8d4d068..bac5fc6a9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -46,6 +46,7 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.xml.XMLProfiler;
@@ -62,6 +63,8 @@ public class PDFParserTest extends TikaTest {
public static Level PDFBOX_LOG_LEVEL = Level.INFO;
private static Boolean hasTesseract = null;
+ private static Boolean hasMuPDF = null;
+
public static boolean canRunOCR() throws TikaConfigException {
if (hasTesseract != null) {
return hasTesseract;
@@ -70,6 +73,14 @@ public class PDFParserTest extends TikaTest {
return hasTesseract;
}
+ /**
+  * Returns the cached outcome of {@code ExternalParser.check} for {@code mutool -v},
+  * running the check only on the first call.
+  */
+ public static boolean hasMuPDF() throws TikaConfigException {
+     if (hasMuPDF == null) {
+         hasMuPDF = ExternalParser.check(new String[]{"mutool", "-v"});
+     }
+     return hasMuPDF;
+ }
+
@BeforeAll
public static void setup() {
//remember default logging level, but turn off for PDFParserTest
@@ -440,4 +451,20 @@ public class PDFParserTest extends TikaTest {
}
}
+ @Test
+ public void testMuPDFInOCR() throws Exception {
+     //TODO -- need to add "rendered by" to confirm that mutool was actually called
+     //and that the PDFParser did not back off to PDFBox
+     assumeTrue(canRunOCR(), "can run OCR");
+     assumeTrue(hasMuPDF(), "has mupdf");
+     try (InputStream configStream = getResourceAsStream(
+             "/configs/tika-rendering-mupdf-config.xml")) {
+         assertNotNull(configStream);
+         Parser parser = new AutoDetectParser(new TikaConfig(configStream));
+         String extracted =
+                 getText(getResourceAsStream("/test-documents/testOCR.pdf"), parser);
+         assertContains("Happy", extracted.trim());
+     }
+ }
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml
new file mode 100644
index 000000000..1034d05eb
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <renderers>
+ <renderer class="org.apache.tika.renderer.pdf.MuPDFRenderer"/>
+ </renderers>
+</properties>