You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/10/28 20:03:26 UTC
svn commit: r1190478 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/odf/
tika-parsers/src/main/java/org/apache/tika/parser/pkg/
tika-parsers/src/test/java/org/apache/tika/parser/odf/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Fri Oct 28 18:03:26 2011
New Revision: 1190478
URL: http://svn.apache.org/viewvc?rev=1190478&view=rev
Log:
TIKA-736: extract header/footer text for OpenOffice docs
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Oct 28 18:03:26 2011
@@ -45,6 +45,9 @@ The most notable changes in Tika 1.0 ove
* Java: Tika no longer ships retrotranslated Java 1.4 binaries along
with the normal ones that work with Java 5 and higher. (TIKA-744)
+ * OpenOffice documents: header/footer text is now extracted for text,
+ presentation and spreadsheet documents (TIKA-736)
+
Tika 1.0 relies on the following set of major dependencies (generated using
mvn dependency:tree from tika-parsers):
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java Fri Oct 28 18:03:26 2011
@@ -46,6 +46,7 @@ import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
/**
@@ -75,6 +76,8 @@ public class OpenDocumentContentParser e
protected static final char[] TAB = new char[] { '\t' };
+ private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
/**
* Mappings between ODF tag names and XHTML tag names
* (including attributes). All other tag names/attributes are ignored
@@ -174,9 +177,17 @@ public class OpenDocumentContentParser e
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- final XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler,metadata);
- DefaultHandler dh = new ElementMappingContentHandler(xhtml, MAPPINGS) {
+ parseInternal(stream,
+ new XHTMLContentHandler(handler,metadata),
+ metadata, context);
+ }
+
+ void parseInternal(
+ InputStream stream, final ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ DefaultHandler dh = new ElementMappingContentHandler(handler, MAPPINGS) {
private final BitSet textNodeStack = new BitSet();
@@ -231,7 +242,7 @@ public class OpenDocumentContentParser e
* Check if a node is a text node
*/
private boolean isTextNode(String namespaceURI, String localName) {
- if (TEXT_NS.equals(namespaceURI)) {
+ if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
return true;
}
if (SVG_NS.equals(namespaceURI)) {
@@ -263,10 +274,10 @@ public class OpenDocumentContentParser e
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h, that are directly passed
- // to xhtml handler
+ // to incoming handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- xhtml.startElement(headingStack.push(
- getXHTMLHeaderTagName(atts)));
+ final String el = headingStack.push(getXHTMLHeaderTagName(atts));
+ handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
} else {
super.startElement(
namespaceURI, localName, qName, atts);
@@ -281,9 +292,10 @@ public class OpenDocumentContentParser e
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h, that are directly passed
- // to xhtml handler
+ // to incoming handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- xhtml.endElement(headingStack.pop());
+ final String el = headingStack.pop();
+ handler.endElement(XHTMLContentHandler.XHTML, el, el);
} else {
super.endElement(namespaceURI,localName,qName);
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java Fri Oct 28 18:03:26 2011
@@ -25,6 +25,8 @@ import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
+//import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+//import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
@@ -33,6 +35,7 @@ import org.apache.tika.parser.AbstractPa
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -106,12 +109,37 @@ public class OpenDocumentParser extends
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
-
+
+ // TODO: reuse the already opened ZipFile, if
+ // present
+
+ /*
+ ZipFile zipFile;
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = ((TikaInputStream) stream).getOpenContainer();
+ if (container instanceof ZipFile) {
+ zipFile = (ZipFile) container;
+ } else if (tis.hasFile()) {
+ zipFile = new ZipFile(tis.getFile());
+ }
+ }
+ */
+
+ // TODO: if the incoming InputStream is a TikaInputStream
+ // with a file associated, we should open a ZipFile so we
+ // can visit the metadata and mimetype entries first; today
+ // we lose all the metadata if meta.xml is hit after
+ // content.xml in the stream. With a ZipFile we could still
+ // read content.xml only once.
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
EndDocumentShieldingContentHandler handler =
- new EndDocumentShieldingContentHandler(baseHandler);
-
+ new EndDocumentShieldingContentHandler(xhtml);
+
// Process the file in turn
ZipInputStream zip = new ZipInputStream(stream);
ZipEntry entry = zip.getNextEntry();
@@ -122,7 +150,19 @@ public class OpenDocumentParser extends
} else if (entry.getName().equals("meta.xml")) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
- content.parse(zip, handler, metadata, context);
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ } else if (entry.getName().endsWith("styles.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
}
entry = zip.getNextEntry();
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Fri Oct 28 18:03:26 2011
@@ -82,6 +82,10 @@ public class ZipContainerDetector implem
return MediaType.application("java-archive");
}
} finally {
+ // TODO: shouldn't we record the open
+ // container so that it can be reused
+ // later?
+ // tis.setOpenContainer(zip);
zip.close();
}
} catch (IOException ignore) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Fri Oct 28 18:03:26 2011
@@ -18,16 +18,16 @@ package org.apache.tika.parser.odf;
import java.io.InputStream;
-import junit.framework.TestCase;
-
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.opendocument.OpenOfficeParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
-public class ODFParserTest extends TestCase {
+public class ODFParserTest extends TikaTest {
/**
* For now, allow us to run some tests against both
* the old and the new parser
@@ -207,4 +207,51 @@ public class ODFParserTest extends TestC
input.close();
}
}
-}
\ No newline at end of file
+
+ public void testODPMasterFooter() throws Exception {
+ InputStream input = ODFParserTest.class.getResourceAsStream(
+ "/test-documents/testMasterFooter.odp");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new AutoDetectParser().parse(input, handler, metadata);
+
+ String content = handler.toString();
+ assertContains("Master footer is here", content);
+ } finally {
+ input.close();
+ }
+ }
+
+ public void testODTFooter() throws Exception {
+ InputStream input = ODFParserTest.class.getResourceAsStream(
+ "/test-documents/testFooter.odt");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new AutoDetectParser().parse(input, handler, metadata);
+
+ String content = handler.toString();
+ assertContains("Here is some text...", content);
+ assertContains("Here is some text on page 2", content);
+ assertContains("Here is footer text", content);
+ } finally {
+ input.close();
+ }
+ }
+
+ public void testODSFooter() throws Exception {
+ InputStream input = ODFParserTest.class.getResourceAsStream(
+ "/test-documents/testFooter.ods");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new AutoDetectParser().parse(input, handler, metadata);
+
+ String content = handler.toString();
+ assertContains("Here is a footer in the center area", content);
+ } finally {
+ input.close();
+ }
+ }
+}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods?rev=1190478&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt?rev=1190478&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp?rev=1190478&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream