You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/11/18 17:10:24 UTC
svn commit: r1410914 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/p...
Author: mikemccand
Date: Sun Nov 18 16:10:23 2012
New Revision: 1410914
URL: http://svn.apache.org/viewvc?rev=1410914&view=rev
Log:
TIKA-1025: leave placeholder where embedded docs appear in .ppt extraction
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Nov 18 16:10:23 2012
@@ -39,6 +39,11 @@ Release 1.3 - Current Development
key, and TikaCLI prepends the rId (if present) onto the filename
it extracts (TIKA-997).
+ * MS PowerPoint (.ppt): When a PowerPoint (.ppt) document contains
+ embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
+ XHTML so you can see where in the main text the embedded document
+ occurred (TIKA-1025).
+
* MHTML: fixed Null charset name exception when a mime part has an
unrecognized charset (TIKA-1011).
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Sun Nov 18 16:10:23 2012
@@ -16,6 +16,9 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.IOException;
+import java.util.HashSet;
+
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.*;
import org.apache.poi.hslf.usermodel.ObjectData;
@@ -28,9 +31,7 @@ import org.apache.tika.io.TikaInputStrea
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
-
-import java.io.IOException;
-import java.util.HashSet;
+import org.xml.sax.helpers.AttributesImpl;
public class HSLFExtractor extends AbstractPOIFSExtractor {
public HSLFExtractor(ParseContext context) {
@@ -221,27 +222,39 @@ public class HSLFExtractor extends Abstr
for( Shape shape : shapes ) {
if( shape instanceof OLEShape ) {
OLEShape oleShape = (OLEShape)shape;
-
+ ObjectData data = null;
try {
- ObjectData data = oleShape.getObjectData();
-
- if(data != null) {
- TikaInputStream stream =
- TikaInputStream.get(data.getData());
- try {
- String mediaType = null;
- if ("Excel.Chart.8".equals(oleShape.getProgID())) {
- mediaType = "application/vnd.ms-excel";
- }
- handleEmbeddedResource(
- stream, Integer.toString(oleShape.getObjectID()), null,
- mediaType, xhtml, false);
- } finally {
- stream.close();
+ data = oleShape.getObjectData();
+ } catch( NullPointerException e ) {
+ /* getObjectData sometimes throws an NPE. */
+ }
+
+ if (data != null) {
+ String objID = Integer.toString(oleShape.getObjectID());
+
+ // Embedded Object: add a <div
+ // class="embedded" id="X"/> so consumer can see where
+ // in the main text each embedded document
+ // occurred:
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", objID);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
+ TikaInputStream stream =
+ TikaInputStream.get(data.getData());
+ try {
+ String mediaType = null;
+ if ("Excel.Chart.8".equals(oleShape.getProgID())) {
+ mediaType = "application/vnd.ms-excel";
}
+ handleEmbeddedResource(
+ stream, objID, objID,
+ mediaType, xhtml, false);
+ } finally {
+ stream.close();
}
- } catch( NullPointerException e ) {
- /* getObjectData throws NPE some times. */
}
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Sun Nov 18 16:10:23 2012
@@ -18,11 +18,21 @@ package org.apache.tika;
import java.io.File;
import java.io.InputStream;
+import java.io.StringWriter;
import java.net.URISyntaxException;
import java.net.URL;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
import junit.framework.TestCase;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
/**
* Parent class of Tika tests
*/
@@ -64,4 +74,41 @@ public abstract class TikaTest extends T
public void assertContains(String needle, String haystack) {
assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
}
+
+ protected static class XMLResult {
+ public final String xml;
+ public final Metadata metadata;
+
+ public XMLResult(String xml, Metadata metadata) {
+ this.xml = xml;
+ this.metadata = metadata;
+ }
+ }
+
+ protected XMLResult getXML(String filePath) throws Exception {
+ InputStream input = null;
+ Metadata metadata = new Metadata();
+ Parser parser = new AutoDetectParser();
+
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.setResult(new StreamResult(sw));
+
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+
+ // Parse the requested test document from /test-documents/
+ input = getResourceAsStream("/test-documents/" + filePath);
+ try {
+ parser.parse(input, handler, metadata, context);
+ return new XMLResult(sw.toString(), metadata);
+ } finally {
+ input.close();
+ }
+ }
+
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Sun Nov 18 16:10:23 2012
@@ -220,4 +220,11 @@ public class PowerPointParserTest extend
assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
+
+ // TIKA-1025
+ public void testEmbeddedPlacedholder() throws Exception {
+ XMLResult result = getXML("testPPT_embedded2.ppt");
+ assertContains("<div class=\"embedded\" id=\"1\"/>", result.xml);
+ assertContains("<div class=\"embedded\" id=\"14\"/>", result.xml);
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sun Nov 18 16:10:23 2012
@@ -17,14 +17,8 @@
package org.apache.tika.parser.microsoft;
import java.io.InputStream;
-import java.io.StringWriter;
import java.util.Locale;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -32,7 +26,6 @@ import org.apache.tika.metadata.OfficeOp
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
@@ -72,38 +65,6 @@ public class WordParserTest extends Tika
}
}
- private static class XMLResult {
- public final String xml;
- public final Metadata metadata;
-
- public XMLResult(String xml, Metadata metadata) {
- this.xml = xml;
- this.metadata = metadata;
- }
- }
-
- private XMLResult getXML(String filePath) throws Exception {
- InputStream input = null;
- Metadata metadata = new Metadata();
-
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
- handler.setResult(new StreamResult(sw));
-
- // Try with a document containing various tables and formattings
- input = OOXMLParserTest.class.getResourceAsStream(filePath);
- try {
- new OfficeParser().parse(input, handler, metadata, new ParseContext());
- return new XMLResult(sw.toString(), metadata);
- } finally {
- input.close();
- }
- }
-
/**
* Test that the word converter is able to generate the
* correct HTML for the document
@@ -112,7 +73,7 @@ public class WordParserTest extends Tika
// Try with a document containing various tables and
// formattings
- XMLResult result = getXML("/test-documents/testWORD.doc");
+ XMLResult result = getXML("testWORD.doc");
String xml = result.xml;
Metadata metadata = result.metadata;
@@ -142,7 +103,7 @@ public class WordParserTest extends Tika
assertTrue(xml.contains("<p class=\"signature\">This one"));
// Try with a document that contains images
- xml = getXML("/test-documents/testWORD_3imgs.doc").xml;
+ xml = getXML("testWORD_3imgs.doc").xml;
// Images 1-3
assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
@@ -154,7 +115,7 @@ public class WordParserTest extends Tika
// TIKA-692: test document containing multiple
// character runs within a bold tag:
- xml = getXML("/test-documents/testWORD_bold_character_runs.doc").xml;
+ xml = getXML("testWORD_bold_character_runs.doc").xml;
// Make sure bold text arrived as single
// contiguous string even though Word parser
@@ -163,7 +124,7 @@ public class WordParserTest extends Tika
// TIKA-692: test document containing multiple
// character runs within a bold tag:
- xml = getXML("/test-documents/testWORD_bold_character_runs2.doc").xml;
+ xml = getXML("testWORD_bold_character_runs2.doc").xml;
// Make sure bold text arrived as single
// contiguous string even though Word parser
@@ -172,7 +133,7 @@ public class WordParserTest extends Tika
}
public void testEmbeddedNames() throws Exception {
- String result = getXML("/test-documents/testWORD_embedded_pdf.doc").xml;
+ String result = getXML("testWORD_embedded_pdf.doc").xml;
// Make sure the embedded div comes out after "Here
// is the pdf file" and before "Bye Bye":
@@ -189,14 +150,14 @@ public class WordParserTest extends Tika
// TIKA-982
public void testEmbeddedRTF() throws Exception {
- String result = getXML("/test-documents/testWORD_embedded_rtf.doc").xml;
+ String result = getXML("testWORD_embedded_rtf.doc").xml;
assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1404039792\"/>") != -1);
assertTrue(result.indexOf("_1404039792.rtf") != -1);
}
// TIKA-1019
public void testDocumentLink() throws Exception {
- String result = getXML("/test-documents/testDocumentLink.doc").xml;
+ String result = getXML("testDocumentLink.doc").xml;
assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1327495610\"/>") != -1);
assertTrue(result.indexOf("_1327495610.unknown") != -1);
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Sun Nov 18 16:10:23 2012
@@ -308,36 +308,6 @@ public class OOXMLParserTest extends Tik
}
}
- private static class XMLResult {
- public final String xml;
- public final Metadata metadata;
-
- public XMLResult(String xml, Metadata metadata) {
- this.xml = xml;
- this.metadata = metadata;
- }
- }
-
- private XMLResult getXML(String name) throws Exception {
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
-
- // Try with a document containing various tables and formattings
- InputStream input = getTestDocument(name);
- try {
- Metadata metadata = new Metadata();
- parser.parse(input, handler, metadata, new ParseContext());
- return new XMLResult(sw.toString(), metadata);
- } finally {
- input.close();
- }
- }
-
/**
* Test that the word converter is able to generate the
* correct HTML for the document
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Sun Nov 18 16:10:23 2012
@@ -451,37 +451,4 @@ public class PDFParserTest extends TikaT
// Column text is now interleaved:
assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
}
-
- private static class XMLResult {
- public final String xml;
- public final Metadata metadata;
-
- public XMLResult(String xml, Metadata metadata) {
- this.xml = xml;
- this.metadata = metadata;
- }
- }
-
- private XMLResult getXML(String filename) throws Exception {
- Metadata metadata = new Metadata();
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
- handler.setResult(new StreamResult(sw));
-
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
- // Try with a document containing various tables and formattings
- InputStream input = getResourceAsStream("/test-documents/" + filename);
- try {
- parser.parse(input, handler, metadata, context);
- return new XMLResult(sw.toString(), metadata);
- } finally {
- input.close();
- }
- }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Sun Nov 18 16:10:23 2012
@@ -343,37 +343,6 @@ public class RTFParserTest extends TikaT
return new Result(content, metadata);
}
- private static class XMLResult {
- public final String xml;
- public final Metadata metadata;
-
- public XMLResult(String xml, Metadata metadata) {
- this.xml = xml;
- this.metadata = metadata;
- }
- }
-
- private XMLResult getXML(String filename) throws Exception {
- Metadata metadata = new Metadata();
-
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
- handler.setResult(new StreamResult(sw));
-
- // Try with a document containing various tables and formattings
- InputStream input = getResourceAsStream("/test-documents/" + filename);
- try {
- tika.getParser().parse(input, handler, metadata, new ParseContext());
- return new XMLResult(sw.toString(), metadata);
- } finally {
- input.close();
- }
- }
-
private String getText(String filename) throws Exception {
return getResult(filename).text;
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt?rev=1410914&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream