You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/10/28 20:03:26 UTC

svn commit: r1190478 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/odf/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/java/org/apache/tika/parser/odf/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Fri Oct 28 18:03:26 2011
New Revision: 1190478

URL: http://svn.apache.org/viewvc?rev=1190478&view=rev
Log:
TIKA-736: extract header/footer text for OpenOffice docs

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Oct 28 18:03:26 2011
@@ -45,6 +45,9 @@ The most notable changes in Tika 1.0 ove
  * Java: Tika no longer ships retrotranslated Java 1.4 binaries along
    with the normal ones that work with Java 5 and higher. (TIKA-744)
 
+ * OpenOffice documents: header/footer text is now extracted for text,
+   presentation and spreadsheet documents (TIKA-736)
+
 Tika 1.0 relies on the following set of major dependencies (generated using
 mvn dependency:tree from tika-parsers):
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java Fri Oct 28 18:03:26 2011
@@ -46,6 +46,7 @@ import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.helpers.AttributesImpl;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
@@ -75,6 +76,8 @@ public class OpenDocumentContentParser e
 
     protected static final char[] TAB = new char[] { '\t' };
 
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
     /**
      * Mappings between ODF tag names and XHTML tag names
      * (including attributes). All other tag names/attributes are ignored
@@ -174,9 +177,17 @@ public class OpenDocumentContentParser e
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        final XHTMLContentHandler xhtml =
-            new XHTMLContentHandler(handler,metadata);
-        DefaultHandler dh = new ElementMappingContentHandler(xhtml, MAPPINGS) {
+        parseInternal(stream,
+                      new XHTMLContentHandler(handler,metadata),
+                      metadata, context);
+    }
+
+    void parseInternal(
+            InputStream stream, final ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        DefaultHandler dh = new ElementMappingContentHandler(handler, MAPPINGS) {
 
             private final BitSet textNodeStack = new BitSet();
 
@@ -231,7 +242,7 @@ public class OpenDocumentContentParser e
              * Check if a node is a text node
              */
             private boolean isTextNode(String namespaceURI, String localName) {
-                if (TEXT_NS.equals(namespaceURI)) {
+                if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
                     return true;
                 }
                 if (SVG_NS.equals(namespaceURI)) {
@@ -263,10 +274,10 @@ public class OpenDocumentContentParser e
                 // call next handler if no filtering
                 if (completelyFiltered == 0) {
                     // special handling of text:h, that are directly passed
-                    // to xhtml handler
+                    // to incoming handler
                     if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
-                        xhtml.startElement(headingStack.push(
-                                getXHTMLHeaderTagName(atts)));
+                        final String el = headingStack.push(getXHTMLHeaderTagName(atts));
+                        handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
                     } else {
                         super.startElement(
                                 namespaceURI, localName, qName, atts);
@@ -281,9 +292,10 @@ public class OpenDocumentContentParser e
                 // call next handler if no filtering
                 if (completelyFiltered == 0) {
                     // special handling of text:h, that are directly passed
-                    // to xhtml handler
+                    // to incoming handler
                     if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
-                        xhtml.endElement(headingStack.pop());
+                        final String el = headingStack.pop();
+                        handler.endElement(XHTMLContentHandler.XHTML, el, el);
                     } else {
                         super.endElement(namespaceURI,localName,qName);
                     }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java Fri Oct 28 18:03:26 2011
@@ -25,6 +25,8 @@ import java.util.Set;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
+//import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+//import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
@@ -33,6 +35,7 @@ import org.apache.tika.parser.AbstractPa
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
@@ -106,12 +109,37 @@ public class OpenDocumentParser extends 
             InputStream stream, ContentHandler baseHandler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-       
+
+        // TODO: reuse the already opened ZipFile, if
+        // present
+
+        /*
+        ZipFile zipFile;
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = ((TikaInputStream) stream).getOpenContainer();
+            if (container instanceof ZipFile) {
+                zipFile = (ZipFile) container;
+            } else if (tis.hasFile()) {
+                zipFile = new ZipFile(tis.getFile());                
+            }
+        }
+        */
+
+        // TODO: if the incoming InputStream is a
+        // TikaInputStream with a file associated, we should
+        // open a ZipFile so we can visit metadata and mimetype
+        // first; today we lose all the metadata if meta.xml is
+        // hit after content.xml in the stream.  Then we can
+        // still read content.xml just once.
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
         // As we don't know which of the metadata or the content
         //  we'll hit first, catch the endDocument call initially
         EndDocumentShieldingContentHandler handler = 
-          new EndDocumentShieldingContentHandler(baseHandler);
-       
+          new EndDocumentShieldingContentHandler(xhtml);
+
         // Process the file in turn
         ZipInputStream zip = new ZipInputStream(stream);
         ZipEntry entry = zip.getNextEntry();
@@ -122,7 +150,19 @@ public class OpenDocumentParser extends 
             } else if (entry.getName().equals("meta.xml")) {
                 meta.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith("content.xml")) {
-                content.parse(zip, handler, metadata, context);
+                if (content instanceof OpenDocumentContentParser) {
+                    ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+                } else {
+                    // Foreign content parser was set:
+                    content.parse(zip, handler, metadata, context);
+                }
+            } else if (entry.getName().endsWith("styles.xml")) {
+                if (content instanceof OpenDocumentContentParser) {
+                    ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+                } else {
+                    // Foreign content parser was set:
+                    content.parse(zip, handler, metadata, context);
+                }
             }
             entry = zip.getNextEntry();
         }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java Fri Oct 28 18:03:26 2011
@@ -82,6 +82,10 @@ public class ZipContainerDetector implem
                         return MediaType.application("java-archive");
                     }
                 } finally {
+                    // TODO: shouldn't we record the open
+                    // container so it can be later
+                    // reused...?
+                    // tis.setOpenContainer(zip);
                     zip.close();
                 }
             } catch (IOException ignore) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1190478&r1=1190477&r2=1190478&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Fri Oct 28 18:03:26 2011
@@ -18,16 +18,16 @@ package org.apache.tika.parser.odf;
 
 import java.io.InputStream;
 
-import junit.framework.TestCase;
-
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.opendocument.OpenOfficeParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
-public class ODFParserTest extends TestCase {
+public class ODFParserTest extends TikaTest {
     /**
      * For now, allow us to run some tests against both
      *  the old and the new parser
@@ -207,4 +207,51 @@ public class ODFParserTest extends TestC
           input.close();
       }
    }
-}
\ No newline at end of file
+
+    public void testODPMasterFooter() throws Exception {
+        InputStream input = ODFParserTest.class.getResourceAsStream(
+            "/test-documents/testMasterFooter.odp");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new AutoDetectParser().parse(input, handler, metadata);
+  
+            String content = handler.toString();
+            assertContains("Master footer is here", content);
+        } finally {
+            input.close();
+        }
+    }  
+
+    public void testODTFooter() throws Exception {
+        InputStream input = ODFParserTest.class.getResourceAsStream(
+            "/test-documents/testFooter.odt");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new AutoDetectParser().parse(input, handler, metadata);
+  
+            String content = handler.toString();
+            assertContains("Here is some text...", content);
+            assertContains("Here is some text on page 2", content);
+            assertContains("Here is footer text", content);
+        } finally {
+            input.close();
+        }
+    }  
+
+    public void testODSFooter() throws Exception {
+        InputStream input = ODFParserTest.class.getResourceAsStream(
+            "/test-documents/testFooter.ods");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new AutoDetectParser().parse(input, handler, metadata);
+  
+            String content = handler.toString();
+            assertContains("Here is a footer in the center area", content);
+        } finally {
+            input.close();
+        }
+    }  
+}

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods?rev=1190478&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.ods
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt?rev=1190478&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFooter.odt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp?rev=1190478&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testMasterFooter.odp
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream