You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/10/01 16:11:49 UTC

svn commit: r1628707 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pdf/PDF2XHTML.java test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Author: tallison
Date: Wed Oct  1 14:11:49 2014
New Revision: 1628707

URL: http://svn.apache.org/r1628707
Log:
TIKA-1427 cleanup. Handle inline images with the same markup as the Word parser

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1628707&r1=1628706&r2=1628707&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Wed Oct  1 14:11:49 2014
@@ -324,28 +324,42 @@ class PDF2XHTML extends PDFTextStripper 
             if (object instanceof PDXObjectForm) {
                 extractImages(((PDXObjectForm) object).getResources());
             } else if (object instanceof PDXObjectImage) {
-                
-                //Do we only want to process unique COSObject ids?
-                //If so, have we already processed this one?
-                if (config.getExtractUniqueInlineImagesOnly() == true) {
-                    String cosObjectId = entry.getKey();
-                    if (processedInlineImages.contains(cosObjectId)){
-                        continue;
-                    }
-                    processedInlineImages.add(cosObjectId);
-                }
 
                 PDXObjectImage image = (PDXObjectImage) object;
 
                 Metadata metadata = new Metadata();
+                String extension = "";
                 if (image instanceof PDJpeg) {
                     metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+                    extension = ".jpg";
                 } else if (image instanceof PDCcitt) {
                     metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+                    extension = ".tif";
                 } else if (image instanceof PDPixelMap) {
                     metadata.set(Metadata.CONTENT_TYPE, "image/png");
+                    extension = ".png";
+                }
+                String fileName = "image"+inlineImageCounter+++extension;
+                metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+
+                // Output the img tag
+                AttributesImpl attr = new AttributesImpl();
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
+                attr.addAttribute("", "alt", "alt", "CDATA", fileName);
+                handler.startElement("img", attr);
+                handler.endElement("img");
+
+                //Do we only want to process unique COSObject ids?
+                //If so, have we already processed this one?
+                if (config.getExtractUniqueInlineImagesOnly() == true) {
+                    String cosObjectId = entry.getKey();
+                    if (processedInlineImages.contains(cosObjectId)){
+                        continue;
+                    }
+                    processedInlineImages.add(cosObjectId);
                 }
-                metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, 
+
+                metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                         TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
 
                 EmbeddedDocumentExtractor extractor =
@@ -359,14 +373,6 @@ class PDF2XHTML extends PDFTextStripper 
                                 new ByteArrayInputStream(buffer.toByteArray()),
                                 new EmbeddedContentHandler(handler),
                                 metadata, false);
-                        
-                        AttributesImpl attributes = new AttributesImpl();
-                        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-                        attributes.addAttribute("", "id", "id", "CDATA", Integer.toString(inlineImageCounter++));
-                        attributes.addAttribute("", "inline_image", "inline_image", "CDATA", "true");
-                        handler.startElement("div", attributes);
-                        handler.endElement("div");
-
                     } catch (IOException e) {
                         // could not extract this image, so just skip it...
                     }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1628707&r1=1628706&r2=1628707&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Wed Oct  1 14:11:49 2014
@@ -695,7 +695,7 @@ public class PDFParserTest extends TikaT
 
         assertEquals(5, metadatas.size());
         assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
-        assertNull(metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("image0.jpg", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
         assertEquals("Press Quality(1).joboptions", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
         assertEquals("Unit10.doc", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY));
         assertEquals(MediaType.image("jpeg").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE));
@@ -1055,7 +1055,7 @@ public class PDFParserTest extends TikaT
         //regular attachment
         assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", xml);
         //inline image
-        assertContains("<div class=\"embedded\" id=\"0\" inline_image=\"true\" />", xml);
+        assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", xml);
 
         //doc embedded inside an annotation
         xml = getXML("testPDFFileEmbInAnnotation.pdf").xml;