You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/09/30 04:39:27 UTC
svn commit: r1628354 - in /tika/trunk: CHANGES.txt
tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: tallison
Date: Tue Sep 30 02:39:26 2014
New Revision: 1628354
URL: http://svn.apache.org/r1628354
Log:
TIKA-1427: add markup for documents embedded in pdfs
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1628354&r1=1628353&r2=1628354&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Sep 30 02:39:26 2014
@@ -1,4 +1,7 @@
Release 1.7 - Current Development
+ * Add markup for files embedded in PDFs (TIKA-1427).
+
+ * Extract files embedded in annotations in PDFs (TIKA-1433).
* Upgrade to PDFBox 1.8.7 (TIKA-1419).
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1628354&r1=1628353&r2=1628354&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Tue Sep 30 02:39:26 2014
@@ -106,6 +106,7 @@ class PDF2XHTML extends PDFTextStripper
*/
private Set<String> processedInlineImages = new HashSet<String>();
+ private int inlineImageCounter = 0;
/**
* Converts the given PDF document (and related metadata) to a stream
@@ -358,6 +359,14 @@ class PDF2XHTML extends PDFTextStripper
new ByteArrayInputStream(buffer.toByteArray()),
new EmbeddedContentHandler(handler),
metadata, false);
+
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", Integer.toString(inlineImageCounter++));
+ attributes.addAttribute("", "inline_image", "inline_image", "CDATA", "true");
+ handler.startElement("div", attributes);
+ handler.endElement("div");
+
} catch (IOException e) {
// could not extract this image, so just skip it...
}
@@ -536,6 +545,12 @@ class PDF2XHTML extends PDFTextStripper
stream,
new EmbeddedContentHandler(handler),
metadata, false);
+
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", fileName);
+ handler.startElement("div", attributes);
+ handler.endElement("div");
} finally {
IOUtils.closeQuietly(stream);
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1628354&r1=1628353&r2=1628354&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Sep 30 02:39:26 2014
@@ -16,10 +16,21 @@
*/
package org.apache.tika.parser.pdf;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
import org.apache.tika.TikaTest;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -33,19 +44,10 @@ import org.apache.tika.parser.RecursiveP
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.ToXMLContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
@@ -1026,6 +1028,40 @@ public class PDFParserTest extends TikaT
}
+ @Test // TIKA-1427: each embedded document should leave an empty <div class="embedded"> marker in the XHTML output
+ public void testEmbeddedFileMarkup() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(org.apache.tika.parser.Parser.class, parser); // presumably lets embedded documents be handled by the same parser -- confirm
+
+ PDFParserConfig config = new PDFParserConfig();
+ config.setExtractInlineImages(true); // presumably required for the inline-image marker asserted below -- confirm
+ config.setExtractUniqueInlineImagesOnly(false);
+ context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
+
+
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new ToXMLContentHandler();
+ String path = "/test-documents/testPDF_childAttachments.pdf";
+ InputStream stream = null;
+ try {
+ stream = TikaInputStream.get(this.getClass().getResource(path));
+ parser.parse(stream, handler, metadata, context);
+ } finally {
+ IOUtils.closeQuietly(stream); // close the stream whether or not parsing threw
+ }
+
+ String xml = handler.toString(); // the assertions below treat the handler's string form as the XML output
+ // regular attachment: the marker's id is the attachment's file name
+ assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", xml);
+ // inline image: the id is a counter value (0 here) and inline_image="true" is added
+ assertContains("<div class=\"embedded\" id=\"0\" inline_image=\"true\" />", xml);
+
+ // document embedded inside an annotation (TIKA-1433), parsed through the getXML helper
+ xml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
+ assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", xml);
+ }
+
/**
*
* Simple class to count end of document events. If functionality is useful,