You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/09/30 19:05:15 UTC

svn commit: r1706086 - in /tika/trunk: CHANGES.txt tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java

Author: tallison
Date: Wed Sep 30 17:05:14 2015
New Revision: 1706086

URL: http://svn.apache.org/viewvc?rev=1706086&view=rev
Log:
TIKA-1742 prevent infinite recursion while processing inline images in PDFs by limiting extraction to unique images per page...following Tilman Hausherr's solution on PDFBox

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1706086&r1=1706085&r2=1706086&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Sep 30 17:05:14 2015
@@ -1,4 +1,9 @@
 Release 1.11 - Current Development
+  
+  * Prevent infinite recursion when processing inline images
+    in PDF files by limiting extraction of duplicate images
+    within the same page (TIKA-1742).
+
   * Upgrade to POI 3.13-final (via Andreas Beeker) (TIKA-1707).
 
   * Upgraded tika-batch to use Path throughout (TIKA-1747 and

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1706086&r1=1706085&r2=1706086&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Wed Sep 30 17:05:14 2015
@@ -23,14 +23,17 @@ import java.io.Writer;
 import java.text.SimpleDateFormat;
 import java.util.Calendar;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeMap;
 
 import org.apache.commons.io.IOExceptionWithCause;
 import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
 import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
@@ -98,12 +101,16 @@ class PDF2XHTML extends PDFTextStripper
     private final PDFParserConfig config;
     /**
      * This keeps track of the pdf object ids for inline
-     * images that have been processed.  If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()
+     * images that have been processed.
+     * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()}
      * is true, this will be checked before extracting an embedded image.
      * The integer keeps track of the inlineImageCounter for that image.
      * This integer is used to identify images in the markup.
+     *
+     * This is used across the document.  To avoid infinite recursion
+     * (TIKA-1742), we're limiting the export to one copy of each image per page.
      */
-    private Map<String, Integer> processedInlineImages = new HashMap<String, Integer>();
+    private Map<String, Integer> processedInlineImages = new HashMap<>();
     private int inlineImageCounter = 0;
     private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
                       PDFParserConfig config)
@@ -227,7 +234,7 @@ class PDF2XHTML extends PDFTextStripper
         try {
             writeParagraphEnd();
 
-            extractImages(page.getResources());
+            extractImages(page.getResources(), new HashSet<COSBase>());
 
             EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
             for (PDAnnotation annotation : page.getAnnotations()) {
@@ -302,7 +309,7 @@ class PDF2XHTML extends PDFTextStripper
         page.clear();
     }
 
-    private void extractImages(PDResources resources) throws SAXException {
+    private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException {
         if (resources == null || config.getExtractInlineImages() == false) {
             return;
         }
@@ -315,8 +322,18 @@ class PDF2XHTML extends PDFTextStripper
         for (Map.Entry<String, PDXObject> entry : xObjects.entrySet()) {
 
             PDXObject object = entry.getValue();
+            if (object == null) {
+                continue;
+            }
+            COSBase cosObject = object.getCOSObject();
+            if (seenThisPage.contains(cosObject)) {
+                //avoid infinite recursion TIKA-1742
+                continue;
+            }
+            seenThisPage.add(cosObject);
+
             if (object instanceof PDXObjectForm) {
-                extractImages(((PDXObjectForm) object).getResources());
+                extractImages(((PDXObjectForm) object).getResources(), seenThisPage);
             } else if (object instanceof PDXObjectImage) {
 
                 PDXObjectImage image = (PDXObjectImage) object;

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1706086&r1=1706085&r2=1706086&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Wed Sep 30 17:05:14 2015
@@ -236,6 +236,10 @@ public class PDFParserConfig implements
      * <p/>
      * For this parameter to have any effect, {@link #extractInlineImages} must be
      * set to <code>true</code>.
+     * <p>
+     * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting
+     * of this parameter, the extractor will only pull out one copy of each image per
+     * page.  This parameter tries to capture uniqueness across the entire document.
      *
      * @param extractUniqueInlineImagesOnly
      */