You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/10/01 16:35:46 UTC
svn commit: r1628715 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Author: tallison
Date: Wed Oct 1 14:35:46 2014
New Revision: 1628715
URL: http://svn.apache.org/r1628715
Log:
TIKA-1427, small clean up to ensure that inline image number tracks with extracted file
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1628715&r1=1628714&r2=1628715&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Wed Oct 1 14:35:46 2014
@@ -22,12 +22,11 @@ import java.io.IOException;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
-import java.util.HashSet;
+import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
-import java.util.Set;
import java.util.TreeMap;
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -103,8 +102,10 @@ class PDF2XHTML extends PDFTextStripper
* This keeps track of the pdf object ids for inline
* images that have been processed. If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()}
* is true, this will be checked before extracting an embedded image.
+ * The integer keeps track of the inlineImageCounter for that image.
+ * This integer is used to identify images in the markup.
*/
- private Set<String> processedInlineImages = new HashSet<String>();
+ private Map<String, Integer> processedInlineImages = new HashMap<String, Integer>();
private int inlineImageCounter = 0;
@@ -339,7 +340,12 @@ class PDF2XHTML extends PDFTextStripper
metadata.set(Metadata.CONTENT_TYPE, "image/png");
extension = ".png";
}
- String fileName = "image"+inlineImageCounter+++extension;
+
+ Integer imageNumber = processedInlineImages.get(entry.getKey());
+ if (imageNumber == null) {
+ imageNumber = inlineImageCounter++;
+ }
+ String fileName = "image"+imageNumber+extension;
metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
// Output the img tag
@@ -353,10 +359,10 @@ class PDF2XHTML extends PDFTextStripper
//If so, have we already processed this one?
if (config.getExtractUniqueInlineImagesOnly() == true) {
String cosObjectId = entry.getKey();
- if (processedInlineImages.contains(cosObjectId)){
+ if (processedInlineImages.containsKey(cosObjectId)){
continue;
}
- processedInlineImages.add(cosObjectId);
+ processedInlineImages.put(cosObjectId, imageNumber);
}
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,