You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/04 19:26:48 UTC

svn commit: r692168 - in /incubator/tika/trunk/src/main/java/org/apache/tika: parser/zip/ZipParser.java sax/BodyContentHandler.java sax/xpath/MatchingContentHandler.java

Author: jukka
Date: Thu Sep  4 10:26:47 2008
New Revision: 692168

URL: http://svn.apache.org/viewvc?rev=692168&view=rev
Log:
TIKA-149: Parser for zip files 

Include the structured XHTML <body/> content of the zip entries in the output document.

I needed to modify BodyContentHandler and MatchingContentHandler to make this work. I believe the modifications improve both classes and should cause no backwards-compatibility issues.

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
    incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java Thu Sep  4 10:26:47 2008
@@ -87,9 +87,10 @@
         try {
             Metadata metadata = new Metadata();
             metadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
-            ContentHandler content = new BodyContentHandler();
-            getParser().parse(new CloseShieldInputStream(stream), content, metadata);
-            xhtml.element("p", content.toString());
+            getParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new BodyContentHandler(xhtml),
+                    metadata);
         } catch (TikaException e) {
             // Could not parse the entry, just skip the content
         }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java Thu Sep  4 10:26:47 2008
@@ -25,8 +25,9 @@
 import org.xml.sax.ContentHandler;
 
 /**
- * Content handler decorator that only passes the XHTML &lt;body/&gt;
- * tag and everything inside it to the underlying handler.
+ * Content handler decorator that only passes everything inside
+ * the XHTML &lt;body/&gt; tag to the underlying handler. Note that
+ * the &lt;body/&gt; tag itself is <em>not</em> passed on.
  */
 public class BodyContentHandler extends ContentHandlerDecorator {
 
@@ -40,7 +41,7 @@
      * The XPath matcher used to select the XHTML body contents.
      */
     private static final Matcher MATCHER =
-        PARSER.parse("/xhtml:html/xhtml:body//node()");
+        PARSER.parse("/xhtml:html/xhtml:body/*//node()");
 
     /**
      * Creates a content handler that passes all XHTML body events to the

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java Thu Sep  4 10:26:47 2008
@@ -26,7 +26,9 @@
 
 /**
  * Content handler decorator that only passes the elements, attributes,
- * and text nodes that match the given XPath expression.
+ * and text nodes that match the given XPath expression. Note especially
+ * that {@link #startDocument()} and {@link #endDocument()} events are not
+ * passed to the decorated handler.
  */
 public class MatchingContentHandler extends ContentHandlerDecorator {
 
@@ -100,4 +102,16 @@
         }
     }
 
+    /**
+     * Ignored.
+     */
+    public void startDocument() {
+    }
+
+    /**
+     * Ignored.
+     */
+    public void endDocument() {
+    }
+
 }