You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by th...@apache.org on 2014/09/09 15:01:21 UTC

svn commit: r1623819 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ooxml/ test/java/org/apache/tika/parser/microsoft/ooxml/ test/java/org/apache/tika/parser/rtf/

Author: thaichat04
Date: Tue Sep  9 13:01:21 2014
New Revision: 1623819

URL: http://svn.apache.org/r1623819
Log:
TIKA-1413 - Remove embedded thumbnail from body

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1623819&r1=1623818&r2=1623819&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Tue Sep  9 13:01:21 2014
@@ -136,12 +136,11 @@ public abstract class AbstractOOXMLExtra
     private void handleThumbnail( ContentHandler handler ) {
         try {
             OPCPackage opcPackage = extractor.getPackage();
-            int thumbIndex = 0;
             for (PackageRelationship rel : opcPackage.getRelationshipsByType( PackageRelationshipTypes.THUMBNAIL )) {
                 PackagePart tPart = opcPackage.getPart(rel);
                 InputStream tStream = tPart.getInputStream();
                 Metadata thumbnailMetadata = new Metadata();                
-                String thumbName = "thumbnail_"  + thumbIndex + "." + tPart.getPartName().getExtension();
+                String thumbName = tPart.getPartName().getName();
                 thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName);
                 
                 AttributesImpl attributes = new AttributesImpl();
@@ -155,11 +154,10 @@ public abstract class AbstractOOXMLExtra
                 thumbnailMetadata.set(TikaCoreProperties.TITLE, tPart.getPartName().getName());
                 
                 if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
-                    embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, true);
+                    embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false);
                 }
                 
                 tStream.close();
-                thumbIndex ++;
             }
          } catch (Exception ex) {
              

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1623819&r1=1623818&r2=1623819&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Tue Sep  9 13:01:21 2014
@@ -1056,47 +1056,37 @@ public class OOXMLParserTest extends Tik
      
     }
     
-    //TIKA-1223
     @Test
     public void testDOCXThumbnail() throws Exception {
         String xml = getXML("testDOCX_Thumbnail.docx").xml;
         int a = xml.indexOf("This file contains a thumbnail");
-        int b = xml.indexOf("<div class=\"embedded\" id=\"thumbnail_0.emf\" />");
-        int c = xml.indexOf( "<div class=\"package-entry\"><h1>thumbnail_0.emf</h1></div>" );
+        int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
         
         assertTrue(a != -1);
         assertTrue(b != -1);
-        assertTrue(c != -1);
         assertTrue(a < b);
-        assertTrue(b < c);
     }
     
     @Test
     public void testXLSXThumbnail() throws Exception {
         String xml = getXML("testXLSX_Thumbnail.xlsx").xml;
         int a = xml.indexOf("This file contains an embedded thumbnail by default");
-        int b = xml.indexOf("<div class=\"embedded\" id=\"thumbnail_0.wmf\" />");
-        int c = xml.indexOf( "<div class=\"package-entry\"><h1>thumbnail_0.wmf</h1></div>" );
+        int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.wmf\" />");
         
         assertTrue(a != -1);
         assertTrue(b != -1);
-        assertTrue(c != -1);
         assertTrue(a < b);
-        assertTrue(b < c);
     }
     
     @Test
     public void testPPTXThumbnail() throws Exception {
         String xml = getXML("testPPTX_Thumbnail.pptx").xml;
         int a = xml.indexOf("<body><p>This file contains an embedded thumbnail</p>");
-        int b = xml.indexOf("<div class=\"embedded\" id=\"thumbnail_0.jpeg\" />");
-        int c = xml.indexOf( "<div class=\"package-entry\"><h1>thumbnail_0.jpeg</h1></div>" );
+        int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.jpeg\" />");
         
         assertTrue(a != -1);
         assertTrue(b != -1);
-        assertTrue(c != -1);
         assertTrue(a < b);
-        assertTrue(b < c);
     }
 
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1623819&r1=1623818&r2=1623819&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Tue Sep  9 13:01:21 2014
@@ -442,7 +442,7 @@ public class RTFParserTest extends TikaT
         trueNames.add("file_3.pdf");
         trueNames.add("file_4.ppt");
         trueNames.add("file_5.pptx");
-        trueNames.add("thumbnail_0.jpeg");
+        trueNames.add("thumbnail.jpeg");
         trueNames.add("file_6.doc");
         trueNames.add("file_7.doc");
         trueNames.add("file_8.docx");