You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by th...@apache.org on 2014/09/09 15:01:21 UTC
svn commit: r1623819 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/ooxml/
test/java/org/apache/tika/parser/microsoft/ooxml/
test/java/org/apache/tika/parser/rtf/
Author: thaichat04
Date: Tue Sep 9 13:01:21 2014
New Revision: 1623819
URL: http://svn.apache.org/r1623819
Log:
TIKA-1413 - Remove embedded thumbnail from body
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1623819&r1=1623818&r2=1623819&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Tue Sep 9 13:01:21 2014
@@ -136,12 +136,11 @@ public abstract class AbstractOOXMLExtra
private void handleThumbnail( ContentHandler handler ) {
try {
OPCPackage opcPackage = extractor.getPackage();
- int thumbIndex = 0;
for (PackageRelationship rel : opcPackage.getRelationshipsByType( PackageRelationshipTypes.THUMBNAIL )) {
PackagePart tPart = opcPackage.getPart(rel);
InputStream tStream = tPart.getInputStream();
Metadata thumbnailMetadata = new Metadata();
- String thumbName = "thumbnail_" + thumbIndex + "." + tPart.getPartName().getExtension();
+ String thumbName = tPart.getPartName().getName();
thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName);
AttributesImpl attributes = new AttributesImpl();
@@ -155,11 +154,10 @@ public abstract class AbstractOOXMLExtra
thumbnailMetadata.set(TikaCoreProperties.TITLE, tPart.getPartName().getName());
if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
- embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, true);
+ embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false);
}
tStream.close();
- thumbIndex ++;
}
} catch (Exception ex) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1623819&r1=1623818&r2=1623819&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Tue Sep 9 13:01:21 2014
@@ -1056,47 +1056,37 @@ public class OOXMLParserTest extends Tik
}
- //TIKA-1223
@Test
public void testDOCXThumbnail() throws Exception {
String xml = getXML("testDOCX_Thumbnail.docx").xml;
int a = xml.indexOf("This file contains a thumbnail");
- int b = xml.indexOf("<div class=\"embedded\" id=\"thumbnail_0.emf\" />");
- int c = xml.indexOf( "<div class=\"package-entry\"><h1>thumbnail_0.emf</h1></div>" );
+ int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");
assertTrue(a != -1);
assertTrue(b != -1);
- assertTrue(c != -1);
assertTrue(a < b);
- assertTrue(b < c);
}
@Test
public void testXLSXThumbnail() throws Exception {
String xml = getXML("testXLSX_Thumbnail.xlsx").xml;
int a = xml.indexOf("This file contains an embedded thumbnail by default");
- int b = xml.indexOf("<div class=\"embedded\" id=\"thumbnail_0.wmf\" />");
- int c = xml.indexOf( "<div class=\"package-entry\"><h1>thumbnail_0.wmf</h1></div>" );
+ int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.wmf\" />");
assertTrue(a != -1);
assertTrue(b != -1);
- assertTrue(c != -1);
assertTrue(a < b);
- assertTrue(b < c);
}
@Test
public void testPPTXThumbnail() throws Exception {
String xml = getXML("testPPTX_Thumbnail.pptx").xml;
int a = xml.indexOf("<body><p>This file contains an embedded thumbnail</p>");
- int b = xml.indexOf("<div class=\"embedded\" id=\"thumbnail_0.jpeg\" />");
- int c = xml.indexOf( "<div class=\"package-entry\"><h1>thumbnail_0.jpeg</h1></div>" );
+ int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.jpeg\" />");
assertTrue(a != -1);
assertTrue(b != -1);
- assertTrue(c != -1);
assertTrue(a < b);
- assertTrue(b < c);
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1623819&r1=1623818&r2=1623819&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Tue Sep 9 13:01:21 2014
@@ -442,7 +442,7 @@ public class RTFParserTest extends TikaT
trueNames.add("file_3.pdf");
trueNames.add("file_4.ppt");
trueNames.add("file_5.pptx");
- trueNames.add("thumbnail_0.jpeg");
+ trueNames.add("thumbnail.jpeg");
trueNames.add("file_6.doc");
trueNames.add("file_7.doc");
trueNames.add("file_8.docx");