You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/10/01 16:11:49 UTC
svn commit: r1628707 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: tallison
Date: Wed Oct 1 14:11:49 2014
New Revision: 1628707
URL: http://svn.apache.org/r1628707
Log:
TIKA-1427 cleanup. Handle inline images with the same markup as the Word parser.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1628707&r1=1628706&r2=1628707&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Wed Oct 1 14:11:49 2014
@@ -324,28 +324,42 @@ class PDF2XHTML extends PDFTextStripper
if (object instanceof PDXObjectForm) {
extractImages(((PDXObjectForm) object).getResources());
} else if (object instanceof PDXObjectImage) {
-
- //Do we only want to process unique COSObject ids?
- //If so, have we already processed this one?
- if (config.getExtractUniqueInlineImagesOnly() == true) {
- String cosObjectId = entry.getKey();
- if (processedInlineImages.contains(cosObjectId)){
- continue;
- }
- processedInlineImages.add(cosObjectId);
- }
PDXObjectImage image = (PDXObjectImage) object;
Metadata metadata = new Metadata();
+ String extension = "";
if (image instanceof PDJpeg) {
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ extension = ".jpg";
} else if (image instanceof PDCcitt) {
metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+ extension = ".tif";
} else if (image instanceof PDPixelMap) {
metadata.set(Metadata.CONTENT_TYPE, "image/png");
+ extension = ".png";
+ }
+ String fileName = "image"+inlineImageCounter+++extension;
+ metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+
+ // Output the img tag
+ AttributesImpl attr = new AttributesImpl();
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
+ attr.addAttribute("", "alt", "alt", "CDATA", fileName);
+ handler.startElement("img", attr);
+ handler.endElement("img");
+
+ //Do we only want to process unique COSObject ids?
+ //If so, have we already processed this one?
+ if (config.getExtractUniqueInlineImagesOnly() == true) {
+ String cosObjectId = entry.getKey();
+ if (processedInlineImages.contains(cosObjectId)){
+ continue;
+ }
+ processedInlineImages.add(cosObjectId);
}
- metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
EmbeddedDocumentExtractor extractor =
@@ -359,14 +373,6 @@ class PDF2XHTML extends PDFTextStripper
new ByteArrayInputStream(buffer.toByteArray()),
new EmbeddedContentHandler(handler),
metadata, false);
-
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", Integer.toString(inlineImageCounter++));
- attributes.addAttribute("", "inline_image", "inline_image", "CDATA", "true");
- handler.startElement("div", attributes);
- handler.endElement("div");
-
} catch (IOException e) {
// could not extract this image, so just skip it...
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1628707&r1=1628706&r2=1628707&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Wed Oct 1 14:11:49 2014
@@ -695,7 +695,7 @@ public class PDFParserTest extends TikaT
assertEquals(5, metadatas.size());
assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
- assertNull(metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
+ assertEquals("image0.jpg", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
assertEquals("Press Quality(1).joboptions", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
assertEquals("Unit10.doc", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY));
assertEquals(MediaType.image("jpeg").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE));
@@ -1055,7 +1055,7 @@ public class PDFParserTest extends TikaT
//regular attachment
assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", xml);
//inline image
- assertContains("<div class=\"embedded\" id=\"0\" inline_image=\"true\" />", xml);
+ assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", xml);
//doc embedded inside an annotation
xml = getXML("testPDFFileEmbInAnnotation.pdf").xml;