You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/04 19:26:48 UTC
svn commit: r692168 - in /incubator/tika/trunk/src/main/java/org/apache/tika:
parser/zip/ZipParser.java sax/BodyContentHandler.java
sax/xpath/MatchingContentHandler.java
Author: jukka
Date: Thu Sep 4 10:26:47 2008
New Revision: 692168
URL: http://svn.apache.org/viewvc?rev=692168&view=rev
Log:
TIKA-149: Parser for zip files
Include the structured XHTML <body/> content of the zip entries in the output document.
BodyContentHandler and MatchingContentHandler had to be modified to make this work. I believe the modifications improve both classes and should cause no backwards compatibility issues.
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java Thu Sep 4 10:26:47 2008
@@ -87,9 +87,10 @@
try {
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
- ContentHandler content = new BodyContentHandler();
- getParser().parse(new CloseShieldInputStream(stream), content, metadata);
- xhtml.element("p", content.toString());
+ getParser().parse(
+ new CloseShieldInputStream(stream),
+ new BodyContentHandler(xhtml),
+ metadata);
} catch (TikaException e) {
// Could not parse the entry, just skip the content
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java Thu Sep 4 10:26:47 2008
@@ -25,8 +25,9 @@
import org.xml.sax.ContentHandler;
/**
- * Content handler decorator that only passes the XHTML <body/>
- * tag and everything inside it to the underlying handler.
+ * Content handler decorator that only passes everything inside
+ * the XHTML <body/> tag to the underlying handler. Note that
+ * the <body/> tag itself is <em>not</em> passed on.
*/
public class BodyContentHandler extends ContentHandlerDecorator {
@@ -40,7 +41,7 @@
* The XPath matcher used to select the XHTML body contents.
*/
private static final Matcher MATCHER =
- PARSER.parse("/xhtml:html/xhtml:body//node()");
+ PARSER.parse("/xhtml:html/xhtml:body/*//node()");
/**
* Creates a content handler that passes all XHTML body events to the
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java?rev=692168&r1=692167&r2=692168&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java Thu Sep 4 10:26:47 2008
@@ -26,7 +26,9 @@
/**
* Content handler decorator that only passes the elements, attributes,
- * and text nodes that match the given XPath expression.
+ * and text nodes that match the given XPath expression. Note especially
+ * that {@link #startDocument()} and {@link #endDocument()} events are not
+ * passed to the decorated handler.
*/
public class MatchingContentHandler extends ContentHandlerDecorator {
@@ -100,4 +102,16 @@
}
}
+ /**
+ * Ignored.
+ */
+ public void startDocument() {
+ }
+
+ /**
+ * Ignored.
+ */
+ public void endDocument() {
+ }
+
}