You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/02 00:43:30 UTC
tika git commit: TIKA-1992 -- check for duplicate images via COSStream
not object name.
Repository: tika
Updated Branches:
refs/heads/master 6ad18f44c -> 40f8ec95b
TIKA-1992 -- check for duplicate images via COSStream not object name.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/40f8ec95
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/40f8ec95
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/40f8ec95
Branch: refs/heads/master
Commit: 40f8ec95b1c1c2e1713d26e0780b3bd2f97c9bb1
Parents: 6ad18f4
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 1 20:43:23 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 1 20:43:23 2016 -0400
----------------------------------------------------------------------
.../src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/40f8ec95/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index fec6a79..207917b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -42,6 +42,7 @@ import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
@@ -126,7 +127,7 @@ class PDF2XHTML extends PDFTextStripper {
* This is used across the document. To avoid infinite recursion
* TIKA-1742, we're limiting the export to one image per page.
*/
- private Map<String, Integer> processedInlineImages = new HashMap<>();
+ private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
private int inlineImageCounter = 0;
private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig config)
@@ -411,11 +412,11 @@ class PDF2XHTML extends PDFTextStripper {
//Do we only want to process unique COSObject ids?
//If so, have we already processed this one?
if (config.getExtractUniqueInlineImagesOnly() == true) {
- String cosObjectId = name.getName();
- if (processedInlineImages.containsKey(cosObjectId)) {
+ COSStream cosStream = object.getCOSObject();
+ if (processedInlineImages.containsKey(cosStream)) {
continue;
}
- processedInlineImages.put(cosObjectId, imageNumber);
+ processedInlineImages.put(cosStream, imageNumber);
}
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,