You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/02 01:21:36 UTC
tika git commit: TIKA-1992 -- check for duplicate inline images by
COSStream, not object name.
Repository: tika
Updated Branches:
refs/heads/2.x e05dd5bf4 -> e5a7604bc
TIKA-1992 -- check for duplicate inline images by COSStream, not object name.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e5a7604b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e5a7604b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e5a7604b
Branch: refs/heads/2.x
Commit: e5a7604bcbdd92d2a9dd8c216707f0a2933e8852
Parents: e05dd5b
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 1 21:21:29 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 1 21:21:29 2016 -0400
----------------------------------------------------------------------
tika-app/pom.xml | 8 ++++++++
.../tika/parser/AutoDetectParserTest.java | 2 +-
tika-parent/pom.xml | 9 +++++++++
.../tika-parser-pdf-module/pom.xml | 1 -
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 20 ++++++++++----------
5 files changed, 28 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/e5a7604b/tika-app/pom.xml
----------------------------------------------------------------------
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 9177afb..c51bcb1 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -116,6 +116,14 @@
<type>test-jar</type>
<scope>test</scope>
</dependency>
+ <!-- Copied from PDFBox:
+ For legal reasons (incompatible license), jai-imageio-core is to be used
+ only in the tests and may not be distributed. See also LEGAL-195 -->
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-core</artifactId>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/e5a7604b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 4f312a9..2c8384d 100644
--- a/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -393,7 +393,7 @@ public class AutoDetectParserTest extends TikaTest {
List<Metadata> metadataList = getRecursiveJson("testPDF_childAttachments.pdf", context);
//sanity check
- assertEquals(4, metadataList.size());
+ assertEquals(5, metadataList.size());
//inlined jpeg metadata
Metadata jpegMetadata = metadataList.get(1);
http://git-wip-us.apache.org/repos/asf/tika/blob/e5a7604b/tika-parent/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 5f7e76a..da75ecd 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -351,6 +351,15 @@
<version>1</version>
<scope>test</scope>
</dependency>
+ <!-- Copied from PDFBox:
+ For legal reasons (incompatible license), jai-imageio-core is to be used
+ only in the tests and may not be distributed. See also LEGAL-195 -->
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-core</artifactId>
+ <version>1.3.1</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</dependencyManagement>
http://git-wip-us.apache.org/repos/asf/tika/blob/e5a7604b/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml
index e1c6a3e..2156b95 100644
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -105,7 +105,6 @@
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
- <version>1.3.1</version>
<scope>test</scope>
</dependency>
</dependencies>
http://git-wip-us.apache.org/repos/asf/tika/blob/e5a7604b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index bc0bf96..e98bead 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -42,6 +42,7 @@ import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
@@ -126,7 +127,7 @@ class PDF2XHTML extends PDFTextStripper {
* This is used across the document. To avoid infinite recursion
* TIKA-1742, we're limiting the export to one image per page.
*/
- private Map<String, Integer> processedInlineImages = new HashMap<>();
+ private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
private int inlineImageCounter = 0;
private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig config)
@@ -274,7 +275,7 @@ class PDF2XHTML extends PDFTextStripper {
try {
writeParagraphEnd();
try {
- extractImages(page.getResources(), new HashSet<COSBase>());
+ extractImages(page.getResources(), new HashSet<COSStream>());
} catch (IOException e) {
handleCatchableIOE(e);
}
@@ -355,7 +356,7 @@ class PDF2XHTML extends PDFTextStripper {
}
}
- private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
+ private void extractImages(PDResources resources, Set<COSStream> seenThisPage) throws SAXException, IOException {
if (resources == null || config.getExtractInlineImages() == false) {
return;
}
@@ -366,12 +367,12 @@ class PDF2XHTML extends PDFTextStripper {
if (object == null) {
continue;
}
- COSBase cosObject = object.getCOSObject();
- if (seenThisPage.contains(cosObject)) {
+ COSStream cosStream = object.getCOSObject();
+ if (seenThisPage.contains(cosStream)) {
//avoid infinite recursion TIKA-1742
continue;
}
- seenThisPage.add(cosObject);
+ seenThisPage.add(cosStream);
if (object instanceof PDFormXObject) {
extractImages(((PDFormXObject) object).getResources(), seenThisPage);
@@ -394,7 +395,7 @@ class PDF2XHTML extends PDFTextStripper {
//throw new RuntimeException("EXTEN:" + extension);
}
- Integer imageNumber = processedInlineImages.get(name.getName());
+ Integer imageNumber = processedInlineImages.get(object.getCOSObject());
if (imageNumber == null) {
imageNumber = inlineImageCounter++;
}
@@ -411,11 +412,10 @@ class PDF2XHTML extends PDFTextStripper {
//Do we only want to process unique COSObject ids?
//If so, have we already processed this one?
if (config.getExtractUniqueInlineImagesOnly() == true) {
- String cosObjectId = name.getName();
- if (processedInlineImages.containsKey(cosObjectId)) {
+ if (processedInlineImages.containsKey(cosStream)) {
continue;
}
- processedInlineImages.put(cosObjectId, imageNumber);
+ processedInlineImages.put(cosStream, imageNumber);
}
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,