You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/26 21:51:03 UTC
[tika] 01/01: TIKA-3571 -- WIP, checkpoint commit. Do not merge.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3571
in repository https://gitbox.apache.org/repos/asf/tika.git
commit c3261d6d0b272f452b1343e2491da7b75adb64db
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 26 17:50:44 2022 -0400
TIKA-3571 -- WIP, checkpoint commit. Do not merge.
---
.../java/org/apache/tika/config/TikaConfig.java | 130 +++++++++++++++++--
.../java/org/apache/tika/metadata/Rendering.java | 27 ++++
.../apache/tika/metadata/TikaCoreProperties.java | 5 +-
.../java/org/apache/tika/parser/DefaultParser.java | 34 ++++-
.../org/apache/tika/parser/RenderingParser.java | 24 ++++
.../apache/tika/renderer/CompositeRenderer.java | 99 +++++++++++++++
.../org/apache/tika/renderer/RenderResult.java | 52 ++++++++
.../org/apache/tika/renderer/RenderResults.java | 53 ++++++++
.../java/org/apache/tika/renderer/Renderer.java | 49 +++++++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 34 +++--
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 8 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 11 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 32 ++++-
.../apache/tika/parser/pdf/PDFParserConfig.java | 11 ++
.../apache/tika/renderer/pdf/PDFBoxRenderer.java | 141 +++++++++++++++++++++
15 files changed, 668 insertions(+), 42 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 1606262f6..d16f6f171 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -74,7 +74,10 @@ import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.RenderingParser;
import org.apache.tika.parser.multiple.AbstractMultipleParser;
+import org.apache.tika.renderer.CompositeRenderer;
+import org.apache.tika.renderer.Renderer;
import org.apache.tika.utils.AnnotationUtils;
import org.apache.tika.utils.XMLReaderUtils;
@@ -95,6 +98,7 @@ public class TikaConfig {
private final MimeTypes mimeTypes;
private final ExecutorService executorService;
private final EncodingDetector encodingDetector;
+ private final Renderer renderer;
private final MetadataFilter metadataFilter;
private final AutoDetectParserConfig autoDetectParserConfig;
@@ -155,12 +159,14 @@ public class TikaConfig {
TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader();
EncodingDetectorXmlLoader encodingDetectorXmlLoader = new EncodingDetectorXmlLoader();
+ RendererXmlLoader rendererXmlLoader = new RendererXmlLoader();
updateXMLReaderUtils(element);
this.mimeTypes = typesFromDomElement(element);
this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
this.encodingDetector = encodingDetectorXmlLoader.loadOverall(element, mimeTypes, loader);
+ this.renderer = rendererXmlLoader.loadOverall(element, mimeTypes, loader);
- ParserXmlLoader parserLoader = new ParserXmlLoader(encodingDetector);
+ ParserXmlLoader parserLoader = new ParserXmlLoader(encodingDetector, renderer);
this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
this.translator = translatorLoader.loadOverall(element, mimeTypes, loader);
this.executorService = executorLoader.loadOverall(element, mimeTypes, loader);
@@ -187,7 +193,8 @@ public class TikaConfig {
this.mimeTypes = getDefaultMimeTypes(loader);
this.detector = getDefaultDetector(mimeTypes, serviceLoader);
this.encodingDetector = getDefaultEncodingDetector(serviceLoader);
- this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector);
+ this.renderer = getDefaultRenderer(serviceLoader);
+ this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector, renderer);
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = new NoOpFilter();
@@ -223,7 +230,8 @@ public class TikaConfig {
this.serviceLoader = new ServiceLoader();
this.mimeTypes = getDefaultMimeTypes(getContextClassLoader());
this.encodingDetector = getDefaultEncodingDetector(serviceLoader);
- this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector);
+ this.renderer = getDefaultRenderer(serviceLoader);
+ this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector, renderer);
this.detector = getDefaultDetector(mimeTypes, serviceLoader);
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
@@ -237,6 +245,7 @@ public class TikaConfig {
serviceLoader = serviceLoaderFromDomElement(element, tmpServiceLoader.getLoader());
DetectorXmlLoader detectorLoader = new DetectorXmlLoader();
EncodingDetectorXmlLoader encodingDetectorLoader = new EncodingDetectorXmlLoader();
+ RendererXmlLoader rendererLoader = new RendererXmlLoader();
TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader();
@@ -244,8 +253,9 @@ public class TikaConfig {
this.encodingDetector =
encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader);
+ this.renderer = rendererLoader.loadOverall(element, mimeTypes, serviceLoader);
- ParserXmlLoader parserLoader = new ParserXmlLoader(encodingDetector);
+ ParserXmlLoader parserLoader = new ParserXmlLoader(encodingDetector, renderer);
this.parser = parserLoader.loadOverall(element, mimeTypes, serviceLoader);
this.detector = detectorLoader.loadOverall(element, mimeTypes, serviceLoader);
this.translator = translatorLoader.loadOverall(element, mimeTypes, serviceLoader);
@@ -273,9 +283,12 @@ public class TikaConfig {
return new DefaultEncodingDetector(loader);
}
+ /**
+ * Creates the default renderer: a {@link CompositeRenderer} constructed from the
+ * given service loader.
+ *
+ * @param loader service loader handed to the {@link CompositeRenderer} constructor
+ * @return a new {@link CompositeRenderer}
+ */
+ protected static CompositeRenderer getDefaultRenderer(ServiceLoader loader) {
+ return new CompositeRenderer(loader);
+ }
private static CompositeParser getDefaultParser(MimeTypes types, ServiceLoader loader,
- EncodingDetector encodingDetector) {
- return new DefaultParser(types.getMediaTypeRegistry(), loader, encodingDetector);
+ EncodingDetector encodingDetector, Renderer renderer) {
+ return new DefaultParser(types.getMediaTypeRegistry(), loader, encodingDetector, renderer);
}
private static Translator getDefaultTranslator(ServiceLoader loader) {
@@ -811,9 +824,11 @@ public class TikaConfig {
private static class ParserXmlLoader extends XmlLoader<CompositeParser, Parser> {
private final EncodingDetector encodingDetector;
+ private final Renderer renderer;
- private ParserXmlLoader(EncodingDetector encodingDetector) {
+ private ParserXmlLoader(EncodingDetector encodingDetector, Renderer renderer) {
this.encodingDetector = encodingDetector;
+ this.renderer = renderer;
}
boolean supportsComposite() {
@@ -860,7 +875,7 @@ public class TikaConfig {
@Override
CompositeParser createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
- return getDefaultParser(mimeTypes, loader, encodingDetector);
+ return getDefaultParser(mimeTypes, loader, encodingDetector, renderer);
}
@Override
@@ -948,12 +963,18 @@ public class TikaConfig {
Parser newInstance(Class<? extends Parser> loadedClass)
throws IllegalAccessException, InstantiationException, NoSuchMethodException,
InvocationTargetException {
+ Parser parser = null;
if (AbstractEncodingDetectorParser.class.isAssignableFrom(loadedClass)) {
Constructor ctor = loadedClass.getConstructor(EncodingDetector.class);
- return (Parser) ctor.newInstance(encodingDetector);
+ parser = (Parser) ctor.newInstance(encodingDetector);
} else {
- return loadedClass.newInstance();
+ parser = loadedClass.newInstance();
+ }
+
+ if (parser instanceof RenderingParser) {
+ ((RenderingParser)parser).setRenderer(renderer);
}
+ return parser;
}
@Override
@@ -1306,7 +1327,7 @@ public class TikaConfig {
c = encodingDetectorClass.getConstructor(List.class);
encodingDetector = c.newInstance(childEncodingDetectors);
} catch (NoSuchMethodException me) {
- LOG.debug("couldn't find constructor for EncodingDetecto(List) for {}",
+ LOG.debug("couldn't find constructor for EncodingDetector(List) for {}",
encodingDetectorClass);
}
}
@@ -1320,4 +1341,91 @@ public class TikaConfig {
}
}
+    /**
+     * Loads the {@link Renderer} configuration from the {@code renderers} parent
+     * element and its {@code renderer} children.
+     */
+    private static class RendererXmlLoader
+            extends XmlLoader<Renderer, Renderer> {
+
+        boolean supportsComposite() {
+            return true;
+        }
+
+        String getParentTagName() {
+            return "renderers";
+        }
+
+        String getLoaderTagName() {
+            return "renderer";
+        }
+
+        @Override
+        Class<? extends Renderer> getLoaderClass() {
+            return Renderer.class;
+        }
+
+        @Override
+        boolean isComposite(Renderer loaded) {
+            return loaded instanceof CompositeRenderer;
+        }
+
+        @Override
+        boolean isComposite(Class<? extends Renderer> loadedClass) {
+            return CompositeRenderer.class.isAssignableFrom(loadedClass);
+        }
+
+        @Override
+        Renderer preLoadOne(Class<? extends Renderer> loadedClass, String classname,
+                            MimeTypes mimeTypes) throws TikaException {
+            // No renderer classes are barred from the config:
+            // continue with normal loading
+            return null;
+        }
+
+        @Override
+        Renderer createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
+            return getDefaultRenderer(loader);
+        }
+
+        @Override
+        Renderer createComposite(List<Renderer> renderers,
+                                 MimeTypes mimeTypes, ServiceLoader loader) {
+            return new CompositeRenderer(renderers);
+        }
+
+        /**
+         * Reflectively instantiates a composite renderer of the requested class. A
+         * {@code (ServiceLoader, Collection)} constructor is tried first, then a
+         * {@code (List)} constructor.
+         *
+         * @return the new composite, or {@code null} if the class declares neither
+         *         of the two constructors
+         */
+        @Override
+        Renderer createComposite(Class<? extends Renderer> rendererClass,
+                                 List<Renderer> childRenderers,
+                                 Set<Class<? extends Renderer>> excludeRenderers,
+                                 Map<String, Param> params, MimeTypes mimeTypes,
+                                 ServiceLoader loader)
+                throws InvocationTargetException, IllegalAccessException, InstantiationException {
+            Renderer renderer = null;
+            Constructor<? extends Renderer> c;
+
+            // Try the possible default and composite renderer constructors
+            try {
+                c = rendererClass.getConstructor(ServiceLoader.class, Collection.class);
+                renderer = c.newInstance(loader, excludeRenderers);
+            } catch (NoSuchMethodException me) {
+                // log the class that lacks the constructor; the instance is still null here
+                LOG.debug("couldn't find constructor for service loader + collection for {}",
+                        rendererClass);
+            }
+            if (renderer == null) {
+                try {
+                    c = rendererClass.getConstructor(List.class);
+                    renderer = c.newInstance(childRenderers);
+                } catch (NoSuchMethodException me) {
+                    LOG.debug("couldn't find constructor for Renderer(List) for {}",
+                            rendererClass);
+                }
+            }
+            return renderer;
+        }
+
+        @Override
+        Renderer decorate(Renderer created, Element element) {
+            return created; // No decoration of Renderers
+        }
+    }
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
new file mode 100644
index 000000000..1ff521aa7
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.tika.metadata;
+
+/**
+ * Metadata property definitions for rendered content. Every property name
+ * declared here starts with {@link #RENDERING_PREFIX}.
+ */
+public interface Rendering {
+ // prefix shared by every property name below
+ String RENDERING_PREFIX = "rendering:";
+
+ // integer property "rendering:page_number"
+ Property PAGE_NUMBER = Property.externalInteger(RENDERING_PREFIX + "page_number");
+ // text-bag property "rendering:Rendered-By"
+ Property RENDERED_BY = Property.externalTextBag(RENDERING_PREFIX + "Rendered-By");
+ // integer property "rendering:rendering-time-ms" -- presumably elapsed rendering time
+ // in milliseconds; confirm against the renderer that sets it
+ // NOTE(review): the three key suffixes use three different naming styles
+ // (snake_case, Title-Case, kebab-case) -- consider unifying before release
+ Property RENDERED_MS = Property.externalInteger(RENDERING_PREFIX + "rendering-time-ms");
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index ba138c54f..21581a482 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -277,7 +277,7 @@ public interface TikaCoreProperties {
Property EMBEDDED_RESOURCE_TYPE = Property.internalClosedChoise(EMBEDDED_RESOURCE_TYPE_KEY,
EmbeddedResourceType.ATTACHMENT.toString(), EmbeddedResourceType.INLINE.toString(),
EmbeddedResourceType.METADATA.toString(), EmbeddedResourceType.MACRO.toString(),
- EmbeddedResourceType.THUMBNAIL.toString());
+ EmbeddedResourceType.THUMBNAIL.toString(), EmbeddedResourceType.RENDERING.toString());
Property HAS_SIGNATURE = Property.internalBoolean("hasSignature");
@@ -302,6 +302,7 @@ public interface TikaCoreProperties {
MACRO, //any code that is intended to be run by the application
METADATA, //e.g. xmp, xfa
FONT,//embedded font files
- THUMBNAIL//TODO: set this in parsers that handle thumbnails
+ THUMBNAIL, //TODO: set this in parsers that handle thumbnails
+ RENDERING //if a file has been rendered
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
index 2abeeed52..336adee93 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
@@ -27,6 +27,8 @@ import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.renderer.CompositeRenderer;
+import org.apache.tika.renderer.Renderer;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
@@ -46,25 +48,27 @@ public class DefaultParser extends CompositeParser {
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
Collection<Class<? extends Parser>> excludeParsers,
- EncodingDetector encodingDetector) {
- super(registry, getDefaultParsers(loader, encodingDetector, excludeParsers));
+ EncodingDetector encodingDetector, Renderer renderer) {
+ super(registry, getDefaultParsers(loader, encodingDetector, renderer, excludeParsers));
this.loader = loader;
}
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
Collection<Class<? extends Parser>> excludeParsers) {
super(registry,
- getDefaultParsers(loader, new DefaultEncodingDetector(loader), excludeParsers));
+ getDefaultParsers(loader, new DefaultEncodingDetector(loader),
+ new CompositeRenderer(loader), excludeParsers));
this.loader = loader;
}
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
- EncodingDetector encodingDetector) {
- this(registry, loader, Collections.EMPTY_SET, encodingDetector);
+ EncodingDetector encodingDetector, Renderer renderer) {
+ this(registry, loader, Collections.EMPTY_SET, encodingDetector, renderer);
}
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) {
- this(registry, loader, Collections.EMPTY_SET, new DefaultEncodingDetector(loader));
+ this(registry, loader, Collections.EMPTY_SET, new DefaultEncodingDetector(loader),
+ new CompositeRenderer(loader));
}
public DefaultParser(MediaTypeRegistry registry, ClassLoader loader) {
@@ -94,6 +98,7 @@ public class DefaultParser extends CompositeParser {
*/
private static List<Parser> getDefaultParsers(ServiceLoader loader,
EncodingDetector encodingDetector,
+ Renderer renderer,
Collection<Class<? extends Parser>>
excludeParsers) {
List<Parser> parsers =
@@ -104,6 +109,11 @@ public class DefaultParser extends CompositeParser {
setEncodingDetector(p, encodingDetector);
}
}
+ if (renderer != null) {
+ for (Parser p : parsers) {
+ setRenderer(p, renderer);
+ }
+ }
ServiceLoaderUtils.sortLoadedClasses(parsers);
return parsers;
}
@@ -122,6 +132,18 @@ public class DefaultParser extends CompositeParser {
}
}
+    /**
+     * Hands the renderer to the given parser if it is a {@link RenderingParser};
+     * otherwise descends into the component parsers of a {@link CompositeParser}
+     * or into the parser wrapped by a {@link ParserDecorator}.
+     *
+     * @param p        parser to configure
+     * @param renderer renderer to inject
+     */
+    private static void setRenderer(Parser p, Renderer renderer) {
+        if (p instanceof RenderingParser) {
+            ((RenderingParser) p).setRenderer(renderer);
+            return;
+        }
+        if (p instanceof CompositeParser) {
+            for (Parser component : ((CompositeParser) p).getAllComponentParsers()) {
+                setRenderer(component, renderer);
+            }
+            return;
+        }
+        if (p instanceof ParserDecorator) {
+            Parser wrapped = ((ParserDecorator) p).getWrappedParser();
+            setRenderer(wrapped, renderer);
+        }
+    }
+
@Override
public Map<MediaType, Parser> getParsers(ParseContext context) {
Map<MediaType, Parser> map = super.getParsers(context);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java
new file mode 100644
index 000000000..0daae6be1
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.renderer.Renderer;
+
+/**
+ * Implemented by parsers that accept a {@link Renderer}. TikaConfig and
+ * DefaultParser call {@link #setRenderer(Renderer)} on every parser they load
+ * that implements this interface.
+ */
+public interface RenderingParser {
+
+    /**
+     * Supplies the renderer this parser should use.
+     *
+     * @param renderer renderer to use
+     */
+    void setRenderer(Renderer renderer);
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
new file mode 100644
index 000000000..ff5a52061
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.ServiceLoaderUtils;
+
+/**
+ * A {@link Renderer} that delegates to one of several renderers, chosen by the
+ * media type recorded in the metadata passed to
+ * {@link #render(InputStream, Metadata, ParseContext)}.
+ */
+public class CompositeRenderer implements Renderer, Initializable {
+
+    private Map<MediaType, Renderer> rendererMap;
+    private List<Renderer> renderers;
+
+    /**
+     * Builds a composite over the renderers the service loader provides.
+     *
+     * @param serviceLoader loader used to look up {@link Renderer} service providers
+     */
+    public CompositeRenderer(ServiceLoader serviceLoader) {
+        this(getDefaultRenderers(serviceLoader));
+    }
+
+    /**
+     * Builds a composite over the given renderers.
+     *
+     * @param renderers delegates; when two support the same media type, the later
+     *                  one in the list wins
+     */
+    public CompositeRenderer(List<Renderer> renderers) {
+        this.renderers = renderers;
+        // Index eagerly: instances created directly (for example via the
+        // ServiceLoader constructor) must work without initialize() being called.
+        this.rendererMap = buildRendererMap(renderers);
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return rendererMap.keySet();
+    }
+
+    /**
+     * Looks up the delegate for the media type stored under
+     * {@link TikaCoreProperties#TYPE} in the metadata and lets it render.
+     *
+     * @throws TikaException if the metadata carries no type, the type cannot be
+     *                       parsed, or no delegate supports it
+     */
+    @Override
+    public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext)
+            throws IOException, TikaException {
+        String mediaTypeString = metadata.get(TikaCoreProperties.TYPE);
+        if (mediaTypeString == null) {
+            throw new TikaException("need to specify file type in metadata");
+        }
+        MediaType mt = MediaType.parse(mediaTypeString);
+        if (mt == null) {
+            throw new TikaException("can't parse mediaType: " + mediaTypeString);
+        }
+        Renderer renderer = rendererMap.get(mt);
+        if (renderer == null) {
+            throw new TikaException("I regret I can't find a renderer for " + mt);
+        }
+        return renderer.render(is, metadata, parseContext);
+    }
+
+    /**
+     * Rebuilds the media type index from the renderer list. The index is already
+     * built by the constructor, so calling this is optional.
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        rendererMap = buildRendererMap(renderers);
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+
+    }
+
+    /** Maps every media type a renderer supports (with an empty context) to that renderer. */
+    private static Map<MediaType, Renderer> buildRendererMap(List<Renderer> renderers) {
+        Map<MediaType, Renderer> tmp = new ConcurrentHashMap<>();
+        ParseContext empty = new ParseContext();
+        for (Renderer renderer : renderers) {
+            for (MediaType mt : renderer.getSupportedTypes(empty)) {
+                tmp.put(mt, renderer);
+            }
+        }
+        return Collections.unmodifiableMap(tmp);
+    }
+
+    private static List<Renderer> getDefaultRenderers(ServiceLoader loader) {
+        List<Renderer> staticRenderers =
+                loader.loadStaticServiceProviders(Renderer.class);
+
+        ServiceLoaderUtils.sortLoadedClasses(staticRenderers);
+        return staticRenderers;
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
new file mode 100644
index 000000000..9ed61e342
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.nio.file.Path;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Immutable holder for one rendering outcome: a status, the path of the
+ * rendered file and the metadata that goes with it.
+ */
+public class RenderResult {
+
+    /** Outcome of a rendering attempt. */
+    public enum STATUS {
+        SUCCESS, EXCEPTION, TIMEOUT
+    }
+
+    private final Path path;
+    private final Metadata metadata;
+    private final STATUS status;
+
+    /**
+     * @param status   outcome of the rendering
+     * @param path     location of the rendered file
+     * @param metadata metadata describing the rendering
+     */
+    public RenderResult(STATUS status, Path path, Metadata metadata) {
+        this.path = path;
+        this.metadata = metadata;
+        this.status = status;
+    }
+
+    /** @return the outcome passed to the constructor */
+    public STATUS getStatus() {
+        return status;
+    }
+
+    /** @return the path passed to the constructor */
+    public Path getPath() {
+        return path;
+    }
+
+    /** @return the metadata passed to the constructor */
+    public Metadata getMetadata() {
+        return metadata;
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
new file mode 100644
index 000000000..12d60d3da
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.io.TemporaryResources;
+
+/**
+ * Collects {@link RenderResult}s and removes their files when closed. Each
+ * added result's file is registered with the {@link TemporaryResources} passed
+ * to the constructor; {@link #close()} closes those resources.
+ */
+public class RenderResults implements Closeable {
+
+    private final List<RenderResult> results = new ArrayList<>();
+
+    private final TemporaryResources tmp;
+
+    /**
+     * @param tmp temporary resources that will own the result files
+     */
+    public RenderResults(TemporaryResources tmp) {
+        this.tmp = tmp;
+    }
+
+    /**
+     * Adds a result and, if it has a path, schedules that file for deletion on
+     * {@link #close()}.
+     *
+     * @param result result to record
+     */
+    public void add(RenderResult result) {
+        // A result may carry no file (e.g. a failed rendering); nothing to clean up then.
+        if (result.getPath() != null) {
+            tmp.addResource(new Closeable() {
+                @Override
+                public void close() throws IOException {
+                    // tolerate a file that has already been removed
+                    Files.deleteIfExists(result.getPath());
+                }
+            });
+        }
+        results.add(result);
+    }
+
+    /**
+     * @return the live list of results added so far
+     */
+    public List<RenderResult> getResults() {
+        return results;
+    }
+
+    /**
+     * Closes the underlying temporary resources, which deletes the registered files.
+     */
+    @Override
+    public void close() throws IOException {
+        tmp.close();
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
new file mode 100644
index 000000000..5f7cb536b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Interface for a renderer. The initial use case is rendering PDF pages, but the
+ * interface is meant to be flexible enough to also cover portions of PDF pages
+ * and other document types.
+ */
+public interface Renderer extends Serializable {
+
+
+ /**
+ * Returns the set of media types supported by this renderer when used
+ * with the given parse context.
+ *
+ * @param context parse context
+ * @return immutable set of media types
+ * @since Apache Tika 2.5.0
+ */
+ Set<MediaType> getSupportedTypes(ParseContext context);
+
+ /**
+ * Renders the content of the given stream.
+ *
+ * @param is stream to render
+ * @param metadata metadata for the stream; {@link CompositeRenderer} reads the media
+ * type from it to pick a delegate
+ * @param parseContext parse context
+ * @return the render results; {@link RenderResults} is {@link java.io.Closeable}
+ * @throws IOException declared by the interface; presumably for failures reading the
+ * stream or writing output -- confirm against implementations
+ * @throws TikaException declared by the interface; {@link CompositeRenderer} throws it
+ * when it cannot find a delegate for the media type
+ */
+ RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext) throws IOException,
+ TikaException;
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 93dfbd119..02449d53d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -109,6 +109,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.renderer.RenderResults;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -140,6 +141,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
final EmbeddedDocumentExtractor embeddedDocumentExtractor;
final PDFParserConfig config;
final Parser ocrParser;
+
+ final RenderResults renderResults;
/**
* Format used for signature dates
* TODO Make this thread-safe
@@ -157,11 +160,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int totalCharsPerPage = 0;
AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context,
- Metadata metadata, PDFParserConfig config) throws IOException {
+ Metadata metadata, RenderResults renderResults, PDFParserConfig config) throws IOException {
this.pdDocument = pdDocument;
this.xhtml = new XHTMLContentHandler(handler, metadata);
this.context = context;
this.metadata = metadata;
+ this.renderResults = renderResults;
this.config = config;
embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (config.getOcrStrategy() == NO_OCR) {
@@ -482,6 +486,23 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ Path tmpFile = renderPage(tmp);
+
+ try (InputStream is = TikaInputStream.get(tmpFile)) {
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+ ocrImageMediaType.toString());
+ ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ metadata, context);
+ }
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ } catch (SAXException e) {
+ throw new IOException("error writing OCR content from PDF", e);
+ }
+ }
+
+ private Path renderPage(TemporaryResources tmpResources) {
PDFRenderer renderer =
config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT ?
new NoTextPDFRenderer(pdDocument) : new PDFRenderer(pdDocument);
@@ -509,17 +530,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
ExceptionUtils.getStackTrace(e));
return;
}
- try (InputStream is = TikaInputStream.get(tmpFile)) {
- metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
- ocrImageMediaType.toString());
- ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
- metadata, context);
- }
- } catch (IOException e) {
- handleCatchableIOE(e);
- } catch (SAXException e) {
- throw new IOException("error writing OCR content from PDF", e);
- }
}
@Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
index 7493253bb..1930a1ae9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -29,6 +29,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.RenderResults;
/**
@@ -38,7 +39,7 @@ import org.apache.tika.parser.ParseContext;
class OCR2XHTML extends AbstractPDF2XHTML {
private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context,
- Metadata metadata, PDFParserConfig config) throws IOException {
- super(document, handler, context, metadata, config);
+ Metadata metadata, RenderResults renderResults, PDFParserConfig config) throws IOException {
+ //forward renderResults to the base class instead of silently dropping it
+ super(document, handler, context, metadata, renderResults, config);
}
@@ -53,11 +54,12 @@ class OCR2XHTML extends AbstractPDF2XHTML {
* @throws TikaException if there was an exception outside of per page processing
*/
public static void process(PDDocument document, ContentHandler handler, ParseContext context,
- Metadata metadata, PDFParserConfig config)
+ Metadata metadata, RenderResults renderResults,
+ PDFParserConfig config)
throws SAXException, TikaException {
OCR2XHTML ocr2XHTML = null;
try {
- ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config);
+ ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, renderResults, config);
ocr2XHTML.writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 93d1b7e81..2b4f1368a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -40,6 +40,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.RenderResults;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
@@ -64,8 +65,8 @@ class PDF2XHTML extends AbstractPDF2XHTML {
private AtomicInteger inlineImageCounter = new AtomicInteger(0);
PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
- PDFParserConfig config) throws IOException {
- super(document, handler, context, metadata, config);
+ RenderResults renderResults, PDFParserConfig config) throws IOException {
+ super(document, handler, context, metadata, renderResults, config);
}
/**
@@ -79,7 +80,8 @@ class PDF2XHTML extends AbstractPDF2XHTML {
* @throws TikaException if there was an exception outside of per page processing
*/
public static void process(PDDocument document, ContentHandler handler, ParseContext context,
- Metadata metadata, PDFParserConfig config)
+ Metadata metadata, RenderResults renderResults,
+ PDFParserConfig config)
throws SAXException, TikaException {
PDF2XHTML pdf2XHTML = null;
try {
@@ -90,7 +92,8 @@ class PDF2XHTML extends AbstractPDF2XHTML {
pdf2XHTML =
new AngleDetectingPDF2XHTML(document, handler, context, metadata, config);
} else {
- pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
+ pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, renderResults,
+ config);
}
config.configure(pdf2XHTML);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 3835179b1..79b79f14f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -64,6 +64,9 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RenderingParser;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
import org.apache.tika.sax.XHTMLContentHandler;
/**
@@ -96,7 +99,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
* If your PDFs contain marked content or tags, consider
* {@link PDFParserConfig#setExtractMarkedContent(boolean)}
*/
-public class PDFParser extends AbstractParser implements Initializable {
+public class PDFParser extends AbstractParser implements RenderingParser, Initializable {
/**
* Metadata key for giving the document password to the parser.
@@ -149,12 +152,17 @@ public class PDFParser extends AbstractParser implements Initializable {
pdfDocument = getPDDocument(new CloseShieldInputStream(stream), password,
memoryUsageSetting, metadata, context);
}
+ tstream.setOpenContainer(pdfDocument);
metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
extractMetadata(pdfDocument, metadata, context);
AccessChecker checker = localConfig.getAccessChecker();
checker.check(metadata);
+ RenderResults renderResults = null;
+ //getRenderer() may return null when no renderer has been set; skip rendering then
+ Renderer renderer = localConfig.getRenderer();
+ if (renderer != null && renderer.getSupportedTypes(context).contains(MEDIA_TYPE)) {
+ renderResults = renderPDF(tstream, context, localConfig);
+ }
if (handler != null) {
boolean hasXFA = hasXFA(pdfDocument);
metadata.set(PDF.HAS_XFA, Boolean.toString(hasXFA));
@@ -166,12 +174,15 @@ public class PDFParser extends AbstractParser implements Initializable {
handleXFAOnly(pdfDocument, handler, metadata, context);
} else if (localConfig.getOcrStrategy()
.equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
- OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ OCR2XHTML.process(pdfDocument, handler, context, metadata, renderResults,
+ localConfig);
} else if (hasMarkedContent && localConfig.isExtractMarkedContent()) {
PDFMarkedContent2XHTML
- .process(pdfDocument, handler, context, metadata, localConfig);
+ .process(pdfDocument, handler, context, metadata, renderResults,
+ localConfig);
} else {
- PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ PDF2XHTML.process(pdfDocument, handler, context, metadata, renderResults,
+ localConfig);
}
}
} catch (InvalidPasswordException e) {
@@ -184,6 +195,14 @@ public class PDFParser extends AbstractParser implements Initializable {
}
}
+ /**
+ * Hands the PDF stream to the renderer held by the given config.
+ *
+ * @param tstream stream for the PDF being parsed
+ * @param parseContext parse context passed through to the renderer
+ * @param localConfig config that supplies the renderer
+ * @return whatever the renderer returns
+ * @throws IOException if the renderer throws it
+ * @throws TikaException if the renderer throws it
+ */
+ private RenderResults renderPDF(TikaInputStream tstream, ParseContext parseContext,
+ PDFParserConfig localConfig) throws IOException, TikaException {
+ //the renderer gets a fresh metadata object that carries only the PDF media type
+ Metadata renderMetadata = new Metadata();
+ renderMetadata.set(TikaCoreProperties.TYPE, MEDIA_TYPE.toString());
+ Renderer pdfRenderer = localConfig.getRenderer();
+ return pdfRenderer.render(tstream, renderMetadata, parseContext);
+ }
+
protected PDDocument getPDDocument(InputStream inputStream, String password,
MemoryUsageSetting memoryUsageSetting, Metadata metadata,
@@ -622,6 +641,11 @@ public class PDFParser extends AbstractParser implements Initializable {
//no-op
}
+ /**
+ * Stores the renderer on this parser's default config.
+ *
+ * @param renderer renderer to hand to the default config
+ */
+ @Override
+ public void setRenderer(Renderer renderer) {
+ defaultConfig.setRenderer(renderer);
+ }
+
/**
* Copied from AcroformDefaultFixup minus generation of appearances and handling of orphan
* widgets, which we don't need.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index af0e9617b..3b80ac061 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -29,6 +29,7 @@ import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.renderer.Renderer;
/**
* Config for PDFParser.
@@ -130,6 +131,8 @@ public class PDFParserConfig implements Serializable {
private boolean detectAngles = false;
+ private Renderer renderer;
+
/**
* @return whether or not to extract only inline image metadata and not render the images
*/
@@ -857,6 +860,14 @@ public class PDFParserConfig implements Serializable {
return getMaxMainMemoryBytes() == config.getMaxMainMemoryBytes();
}
+ /**
+ * Stores the renderer that {@link #getRenderer()} returns.
+ * NOTE(review): this class is Serializable -- confirm that Renderer implementations
+ * serialize cleanly, or mark the field transient.
+ *
+ * @param renderer renderer to store; nothing here rejects null
+ */
+ public void setRenderer(Renderer renderer) {
+ this.renderer = renderer;
+ }
+
+ /**
+ * @return the stored renderer; the field is declared without an initializer, so
+ * this may be null if no renderer has been set
+ */
+ public Renderer getRenderer() {
+ return renderer;
+ }
+
@Override
public int hashCode() {
int result = (isEnableAutoSpace() ? 1 : 0);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
new file mode 100644
index 000000000..4f7a1d149
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import java.awt.image.BufferedImage;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.Rendering;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
+
+/**
+ * {@link Renderer} that uses PDFBox to render each page of a PDF into its own image file.
+ */
+public class PDFBoxRenderer implements Renderer {
+
+    /**
+     * This is the amount of time it takes for PDFBox to render the page
+     */
+    public static final Property PDFBOX_RENDERING_TIME_MS =
+            Property.externalReal(Rendering.RENDERING_PREFIX + "pdfbox-rendering-ms");
+
+    /**
+     * This is the amount of time it takes for PDFBox/java to write the image after
+     * it has been rendered into a BufferedImage. Some formats take much longer
+     * to encode than others.
+     */
+    public static final Property PDFBOX_IMAGE_WRITING_TIME_MS =
+            Property.externalReal(Rendering.RENDERING_PREFIX + "pdfbox-image-writing-ms");
+
+    //constant shared by all instances; never modified after class initialization
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("pdf"));
+
+    //resolution passed to PDFBox for rendering and to the image writer
+    private int dpi = 300;
+    private ImageType imageType = ImageType.GRAY;
+    //format name for the image writer; doubles as the temp file extension
+    private String imageFormatName = "tiff";
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Renders every page of the PDF and returns one {@link RenderResult} per page.
+     * <p>
+     * If the stream carries an already opened {@link PDDocument} as its open container, that
+     * document is used and left open for its owner to close. Otherwise the document is
+     * loaded from the stream and closed again before this method returns.
+     * <p>
+     * A page that fails with an {@link IOException} does not abort the run: the exception is
+     * recorded in that page's metadata and the page is reported with
+     * {@link RenderResult.STATUS#EXCEPTION}.
+     *
+     * @param is stream for the PDF
+     * @param metadata not used by this implementation
+     * @param parseContext not used by this implementation
+     * @return the per-page results, in page order
+     * @throws IOException if the document cannot be loaded or closed
+     * @throws TikaException not thrown directly by the code in this method
+     */
+    @Override
+    public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext)
+            throws IOException, TikaException {
+        TikaInputStream tis = TikaInputStream.get(is);
+        Object openContainer = tis.getOpenContainer();
+        PDDocument pdDocument;
+        boolean mustClose = false;
+        if (openContainer instanceof PDDocument) {
+            //reuse the caller's document; the caller stays responsible for closing it
+            pdDocument = (PDDocument) openContainer;
+        } else {
+            //no usable container (none at all, or one of another type): load our own copy
+            pdDocument = PDDocument.load(is);
+            mustClose = true;
+        }
+        RenderResults results = new RenderResults(new TemporaryResources());
+        try {
+            PDFRenderer renderer = new PDFRenderer(pdDocument);
+            int numPages = pdDocument.getNumberOfPages();
+            for (int i = 0; i < numPages; i++) {
+                Metadata m = new Metadata();
+                m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                        TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
+                try {
+                    //page numbers in the metadata are 1-based
+                    m.set(Rendering.PAGE_NUMBER, i + 1);
+                    Path imagePath = renderPage(renderer, i, m);
+                    results.add(new RenderResult(RenderResult.STATUS.SUCCESS, imagePath, m));
+                } catch (IOException e) {
+                    EmbeddedDocumentUtil.recordException(e, m);
+                    results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, null, m));
+                }
+            }
+        } finally {
+            if (mustClose) {
+                pdDocument.close();
+            }
+        }
+        return results;
+    }
+
+    /**
+     * Renders a single page into a newly created temp file and records the timings in the
+     * given metadata. If rendering or writing fails for any reason, the temp file is deleted
+     * before the failure propagates.
+     *
+     * @param renderer PDFBox renderer for the document
+     * @param pageIndex zero-based index of the page to render
+     * @param metadata receives the rendering, image-writing and total elapsed milliseconds
+     * @return path of the temp file that holds the rendered image
+     * @throws IOException if the temp file cannot be created, or wrapping any
+     *                     {@link IOException} or {@link RuntimeException} other than
+     *                     {@link SecurityException} raised while rendering or writing
+     */
+    private Path renderPage(PDFRenderer renderer, int pageIndex, Metadata metadata)
+            throws IOException {
+
+        Path tmpFile = Files.createTempFile("tika-pdfbox-rendering-",
+                "-" + (pageIndex + 1) + "." + imageFormatName);
+        boolean rendered = false;
+        try {
+            long start = System.currentTimeMillis();
+            BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, imageType);
+            long renderingElapsed = System.currentTimeMillis() - start;
+            metadata.set(PDFBOX_RENDERING_TIME_MS, renderingElapsed);
+            start = System.currentTimeMillis();
+            try (OutputStream os = Files.newOutputStream(tmpFile)) {
+                ImageIOUtil.writeImage(image, imageFormatName, os, dpi);
+            }
+            long elapsedWrite = System.currentTimeMillis() - start;
+            metadata.set(PDFBOX_IMAGE_WRITING_TIME_MS, elapsedWrite);
+            metadata.set(Rendering.RENDERED_MS, renderingElapsed + elapsedWrite);
+            rendered = true;
+        } catch (SecurityException e) {
+            //throw SecurityExceptions immediately
+            throw e;
+        } catch (IOException | RuntimeException e) {
+            throw new IOExceptionWithCause(e);
+        } finally {
+            //covers Errors as well as the exceptions above: never leave a partial file behind
+            if (!rendered) {
+                deleteQuietly(tmpFile);
+            }
+        }
+        return tmpFile;
+    }
+
+    //best-effort removal of a temp file whose rendering did not complete
+    private static void deleteQuietly(Path path) {
+        try {
+            Files.deleteIfExists(path);
+        } catch (IOException | RuntimeException ignored) {
+            //cleanup must not mask the failure that is already propagating
+        }
+    }
+
+}