You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/09/30 19:05:15 UTC
svn commit: r1706086 - in /tika/trunk: CHANGES.txt
tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
Author: tallison
Date: Wed Sep 30 17:05:14 2015
New Revision: 1706086
URL: http://svn.apache.org/viewvc?rev=1706086&view=rev
Log:
TIKA-1742 prevent infinite recursion while processing inline images in PDFs by limiting extraction to unique images per page, following Tilman Hausherr's solution on PDFBox
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1706086&r1=1706085&r2=1706086&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Sep 30 17:05:14 2015
@@ -1,4 +1,9 @@
Release 1.11 - Current Development
+
+ * Prevent infinite recursion when processing inline images
+ in PDF files by limiting extraction of duplicate images
+ within the same page (TIKA-1742).
+
* Upgrade to POI 3.13-final (via Andreas Beeker) (TIKA-1707).
* Upgraded tika-batch to use Path throughout (TIKA-1747 and
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1706086&r1=1706085&r2=1706086&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Wed Sep 30 17:05:14 2015
@@ -23,14 +23,17 @@ import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
@@ -98,12 +101,16 @@ class PDF2XHTML extends PDFTextStripper
private final PDFParserConfig config;
/**
* This keeps track of the pdf object ids for inline
- * images that have been processed. If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()
+ * images that have been processed.
+ * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()}
* is true, this will be checked before extracting an embedded image.
* The integer keeps track of the inlineImageCounter for that image.
* This integer is used to identify images in the markup.
+ *
+ * This is used across the document. To avoid infinite recursion
+ * (TIKA-1742), we're limiting the export to one copy of each image per page.
*/
- private Map<String, Integer> processedInlineImages = new HashMap<String, Integer>();
+ private Map<String, Integer> processedInlineImages = new HashMap<>();
private int inlineImageCounter = 0;
private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig config)
@@ -227,7 +234,7 @@ class PDF2XHTML extends PDFTextStripper
try {
writeParagraphEnd();
- extractImages(page.getResources());
+ extractImages(page.getResources(), new HashSet<COSBase>());
EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
for (PDAnnotation annotation : page.getAnnotations()) {
@@ -302,7 +309,7 @@ class PDF2XHTML extends PDFTextStripper
page.clear();
}
- private void extractImages(PDResources resources) throws SAXException {
+ private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException {
if (resources == null || config.getExtractInlineImages() == false) {
return;
}
@@ -315,8 +322,18 @@ class PDF2XHTML extends PDFTextStripper
for (Map.Entry<String, PDXObject> entry : xObjects.entrySet()) {
PDXObject object = entry.getValue();
+ if (object == null) {
+ continue;
+ }
+ COSBase cosObject = object.getCOSObject();
+ if (seenThisPage.contains(cosObject)) {
+ //avoid infinite recursion TIKA-1742
+ continue;
+ }
+ seenThisPage.add(cosObject);
+
if (object instanceof PDXObjectForm) {
- extractImages(((PDXObjectForm) object).getResources());
+ extractImages(((PDXObjectForm) object).getResources(), seenThisPage);
} else if (object instanceof PDXObjectImage) {
PDXObjectImage image = (PDXObjectImage) object;
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1706086&r1=1706085&r2=1706086&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Wed Sep 30 17:05:14 2015
@@ -236,6 +236,10 @@ public class PDFParserConfig implements
* <p/>
* For this parameter to have any effect, {@link #extractInlineImages} must be
* set to <code>true</code>.
+ * <p>
+ * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting
+ * of this parameter, the extractor will only pull out one copy of each image per
+ * page. This parameter tries to capture uniqueness across the entire document.
*
* @param extractUniqueInlineImagesOnly
*/