You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/04/10 12:52:12 UTC

svn commit: r646748 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/cli/ src/main/java/org/apache/tika/gui/ src/main/java/org/apache/tika/parser/ src/main/java/org/apache/tika/parser/html/ src/main/java/org/apache/tika/sax/ src/main/java/o...

Author: jukka
Date: Thu Apr 10 03:52:06 2008
New Revision: 646748

URL: http://svn.apache.org/viewvc?rev=646748&view=rev
Log:
TIKA-113: Metadata (such as title) should not be part of content
    - Added BodyContentHandler that only processes XHTML body events
    - Added utility constructors for WriteOutContentHandler and BodyContentHandler
    - Updated test cases and related code to use BodyContentHandler where appropriate
    - Removed AppendableAdaptor class as it's not used anymore

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
Removed:
    incubator/tika/trunk/src/main/java/org/apache/tika/sax/AppendableAdaptor.java
    incubator/tika/trunk/src/test/java/org/apache/tika/sax/AppendableAdaptorTest.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java
    incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java
    incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Thu Apr 10 03:52:06 2008
@@ -43,6 +43,8 @@
 
 18. TIKA-138 - Ignore HTML style and script content (Jukka Zitting)
 
+19. TIKA-113 - Metadata (such as title) should not be part of content
+               (Jukka Zitting)
 
 Release 0.1-incubating - 12/27/2007
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java Thu Apr 10 03:52:06 2008
@@ -18,11 +18,8 @@
 
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.IOException;
 import java.io.InputStream;
-import java.io.OutputStreamWriter;
 import java.io.PrintStream;
-import java.io.Writer;
 import java.net.URL;
 import java.util.Arrays;
 
@@ -41,12 +38,8 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
@@ -176,17 +169,7 @@
     }
 
     private ContentHandler getTextContentHandler() {
-        final Writer writer = new OutputStreamWriter(System.out);
-        XPathParser parser =
-            new XPathParser("xhtml", XHTMLContentHandler.XHTML);
-        return new MatchingContentHandler(
-                new WriteOutContentHandler(writer),
-                parser.parse("/xhtml:html/xhtml:body//text()")) {
-            public void endDocument() throws SAXException {
-                super.endDocument();
-                try { writer.flush(); } catch (IOException e) {}
-            }
-        };
+        return new BodyContentHandler(System.out);
     }
 
     private ContentHandler getMetadataContentHandler() {

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Apr 10 03:52:06 2008
@@ -43,12 +43,10 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -256,11 +254,7 @@
     }
 
     private ContentHandler getTextContentHandler(Writer writer) {
-        XPathParser parser =
-            new XPathParser("xhtml", XHTMLContentHandler.XHTML);
-        return new MatchingContentHandler(
-                new WriteOutContentHandler(writer),
-                parser.parse("/xhtml:html/xhtml:body//text()"));
+        return new BodyContentHandler(writer);
     }
 
     private ContentHandler getXmlContentHandler(Writer writer)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java Thu Apr 10 03:52:06 2008
@@ -18,12 +18,11 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.StringWriter;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.utils.RegexUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -53,12 +52,10 @@
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
-        StringWriter writer = new StringWriter();
-        handler = new TeeContentHandler(
-                handler, new WriteOutContentHandler(writer));
-        super.parse(stream, handler, metadata);
+        ContentHandler body = new BodyContentHandler();
+        super.parse(stream, new TeeContentHandler(handler, body), metadata);
 
-        String content = writer.toString();
+        String content = body.toString();
         metadata.set("fulltext", content);
 
         int length = Math.min(content.length(), 500);

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Apr 10 03:52:06 2008
@@ -18,7 +18,6 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.StringWriter;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
@@ -109,11 +108,10 @@
     }
 
     private ContentHandler getTitleHandler(final Metadata metadata) {
-        final StringWriter writer = new StringWriter();
-        return new WriteOutContentHandler(writer) {
+        return new WriteOutContentHandler() {
             @Override
             public void endElement(String u, String l, String n) {
-                metadata.set(Metadata.TITLE, writer.toString());
+                metadata.set(Metadata.TITLE, toString());
             }
         };
     }

Added: incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=646748&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java Thu Apr 10 03:52:06 2008
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.OutputStream;
+import java.io.Writer;
+
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Content handler decorator that only passes the XHTML {@code <body/>}
+ * tag and everything inside it to the underlying handler.
+ */
+public class BodyContentHandler extends ContentHandlerDecorator {
+
+    /**
+     * XHTML XPath parser.
+     */
+    private static final XPathParser PARSER =
+        new XPathParser("xhtml", XHTMLContentHandler.XHTML);
+
+    /**
+     * The XPath matcher used to select the XHTML body contents.
+     */
+    private static final Matcher MATCHER =
+        PARSER.parse("/xhtml:html/xhtml:body//node()");
+
+    /**
+     * Creates a content handler that passes all XHTML body events to the
+     * given underlying content handler.
+     *
+     * @param handler content handler
+     */
+    public BodyContentHandler(ContentHandler handler) {
+        super(new MatchingContentHandler(handler, MATCHER));
+    }
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * the given writer.
+     *
+     * @param writer writer
+     */
+    public BodyContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * the given output stream using the default encoding.
+     *
+     * @param stream output stream
+     */
+    public BodyContentHandler(OutputStream stream) {
+        this(new WriteOutContentHandler(stream));
+    }
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * an internal string buffer. The contents of the buffer can be retrieved
+     * using the {@link #toString()} method.
+     */
+    public BodyContentHandler() {
+        this(new WriteOutContentHandler());
+    }
+
+}

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java Thu Apr 10 03:52:06 2008
@@ -94,4 +94,8 @@
         handler.skippedEntity(name);
     }
 
+    public String toString() {
+        return handler.toString();
+    }
+
 }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java Thu Apr 10 03:52:06 2008
@@ -17,6 +17,9 @@
 package org.apache.tika.sax;
 
 import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
 import java.io.Writer;
 
 import org.xml.sax.SAXException;
@@ -33,10 +36,39 @@
      */
     private final Writer writer;
 
+    /**
+     * Creates a content handler that writes character events to
+     * the given writer.
+     *
+     * @param writer writer
+     */
     public WriteOutContentHandler(Writer writer) {
         this.writer = writer;
     }
 
+    /**
+     * Creates a content handler that writes character events to
+     * the given output stream using the default encoding.
+     *
+     * @param stream output stream
+     */
+    public WriteOutContentHandler(OutputStream stream) {
+        this(new OutputStreamWriter(stream));
+    }
+
+    /**
+     * Creates a content handler that writes character events
+     * to an internal string buffer. Use the {@link #toString()}
+     * method to access the collected character content.
+     */
+    public WriteOutContentHandler() {
+        this(new StringWriter());
+    }
+
+    /**
+     * Writes the given characters to the given character stream.
+     */
+    @Override
     public void characters(char[] ch, int start, int length)
             throws SAXException {
         try {
@@ -44,6 +76,18 @@
         } catch (IOException e) {
             throw new SAXException("Error writing out character content", e);
         }
+    }
+
+    /**
+     * Returns the contents of the internal string buffer where
+     * all the received characters have been collected. Only works
+     * when this object was constructed using the empty default
+     * constructor or by passing a {@link StringWriter} to the
+     * other constructor.
+     */
+    @Override
+    public String toString() {
+        return writer.toString();
     }
 
 }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java Thu Apr 10 03:52:06 2008
@@ -23,7 +23,6 @@
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.StringWriter;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
@@ -33,7 +32,8 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMimeKeys;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
@@ -169,10 +169,9 @@
             throws TikaException, IOException {
         try {
             Parser parser = config.getParser(mimeType);
-            StringWriter writer = new StringWriter();
-            parser.parse(
-                    stream, new WriteOutContentHandler(writer), new Metadata());
-            return writer.toString();
+            ContentHandler handler = new BodyContentHandler();
+            parser.parse(stream, handler, new Metadata());
+            return handler.toString();
         } catch (SAXException e) {
             throw new TikaException("Unexpected SAX error", e);
         }

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Thu Apr 10 03:52:06 2008
@@ -18,12 +18,11 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.StringWriter;
 
 import org.apache.commons.lang.builder.ReflectionToStringBuilder;
 import org.apache.commons.lang.builder.ToStringStyle;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
 import junit.framework.TestCase;
@@ -63,15 +62,14 @@
             Metadata metadata = new Metadata();
             metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
             metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
-            StringWriter writer = new StringWriter();
-            ContentHandler handler = new WriteOutContentHandler(writer);
+            ContentHandler handler = new BodyContentHandler();
             new AutoDetectParser().parse(input, handler, metadata);
 
             assertEquals("Bad content type: " + tp,
                     tp.realType, metadata.get(Metadata.CONTENT_TYPE));
 
             assertTrue("Expected content not found: " + tp,
-                    writer.toString().contains(tp.expectedContentFragment));
+                    handler.toString().contains(tp.expectedContentFragment));
         } finally {
             input.close();
         }

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Thu Apr 10 03:52:06 2008
@@ -26,9 +26,12 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
 import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 public class HtmlParserTest extends TestCase {
 
@@ -40,32 +43,32 @@
     }
 
     public void testParseAscii() throws Exception {
-        StringWriter writer = new StringWriter();
         final StringWriter href = new StringWriter();
-
+        
+        ContentHandler body = new BodyContentHandler();
+        ContentHandler link = new DefaultHandler() {
+            @Override
+            public void startElement(
+                    String u, String l, String n, Attributes a)
+                    throws SAXException {
+                if ("a".equals(l)) {
+                    href.append(a.getValue("href"));
+                }
+            }
+        };
         Metadata metadata = new Metadata();
-        parser.parse(
-                getStream("test-documents/testHTML.html"),
-                new WriteOutContentHandler(writer) {
-                    @Override
-                    public void startElement(
-                            String uri, String local, String name,
-                            Attributes attributes) {
-                        if ("a".equals(local)) {
-                            href.append(attributes.getValue("href"));
-                        }
-                    }
-                },
-                metadata);
+        InputStream stream = getStream("test-documents/testHTML.html");
+        try {
+            parser.parse(stream, new TeeContentHandler(body, link), metadata);
+        } finally {
+            stream.close();
+        }
 
         assertEquals(
                 "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
         assertEquals("http://www.apache.org/", href.toString());
 
-        String content = writer.toString();
-        assertTrue(
-                "Did not contain expected text: Title : Test Indexation Html",
-                content.contains("Title : Test Indexation Html"));
+        String content = body.toString();
         assertTrue(
                 "Did not contain expected text:" + "Test Indexation Html",
                 content.contains("Test Indexation Html"));
@@ -76,13 +79,13 @@
     }
 
     public void XtestParseUTF8() throws IOException, SAXException, TikaException {
-
-        StringWriter writer = new StringWriter();
+        ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
 
-        parser.parse(getStream("test-documents/testHTML_utf8.html"),
-                new WriteOutContentHandler(writer), metadata);
-        String content = writer.toString();
+        parser.parse(
+                getStream("test-documents/testHTML_utf8.html"),
+                handler, metadata);
+        String content = handler.toString();
 
         assertTrue("Did not contain expected text:"
                 + "Title : Tilte with UTF-8 chars öäå", content
@@ -100,8 +103,9 @@
     public void testParseEmpty() throws Exception {
         Metadata metadata = new Metadata();
         StringWriter writer = new StringWriter();
-        parser.parse(new ByteArrayInputStream(new byte[0]),
-                new WriteOutContentHandler(writer), metadata);
+        parser.parse(
+                new ByteArrayInputStream(new byte[0]),
+                new BodyContentHandler(writer), metadata);
         String content = writer.toString();
         assertEquals("", content);
     }

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,14 +17,13 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.InputStream;
-import java.io.StringWriter;
+
+import junit.framework.TestCase;
 
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
-import junit.framework.TestCase;
-
 public class ExcelParserTest extends TestCase {
 
     public void testExcelParser() throws Exception {
@@ -32,8 +31,7 @@
                 "/test-documents/testEXCEL.xls");
         try {
             Metadata metadata = new Metadata();
-            StringWriter writer = new StringWriter();
-            ContentHandler handler = new WriteOutContentHandler(writer);
+            ContentHandler handler = new BodyContentHandler();
             new OfficeParser().parse(input, handler, metadata);
 
             assertEquals(
@@ -41,7 +39,7 @@
                     metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
             assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            String content = writer.toString();
+            String content = handler.toString();
             assertTrue(content.contains("Sample Excel Worksheet"));
             assertTrue(content.contains("Numbers and their Squares"));
             assertTrue(content.contains("9"));

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,10 +17,9 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.InputStream;
-import java.io.StringWriter;
 
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
 import junit.framework.TestCase;
@@ -32,8 +31,7 @@
                 "/test-documents/testPPT.ppt");
         try {
             Metadata metadata = new Metadata();
-            StringWriter writer = new StringWriter();
-            ContentHandler handler = new WriteOutContentHandler(writer);
+            ContentHandler handler = new BodyContentHandler();
             new OfficeParser().parse(input, handler, metadata);
 
             assertEquals(
@@ -41,7 +39,7 @@
                     metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
             assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            String content = writer.toString();
+            String content = handler.toString();
             assertTrue(content.contains("Sample Powerpoint Slide"));
             assertTrue(content.contains("Powerpoint X for Mac"));
         } finally {

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,10 +17,9 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.InputStream;
-import java.io.StringWriter;
 
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
 import junit.framework.TestCase;
@@ -31,9 +30,8 @@
         InputStream input = WordParserTest.class.getResourceAsStream(
                 "/test-documents/testWORD.doc");
         try {
+            ContentHandler handler = new BodyContentHandler();
             Metadata metadata = new Metadata();
-            StringWriter writer = new StringWriter();
-            ContentHandler handler = new WriteOutContentHandler(writer);
             new OfficeParser().parse(input, handler, metadata);
 
             assertEquals(
@@ -41,8 +39,7 @@
                     metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
             assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            String content = writer.toString();
-            assertTrue(content.contains("Sample Word Document"));
+            assertTrue(handler.toString().contains("Sample Word Document"));
         } finally {
             input.close();
         }

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,12 +17,11 @@
 package org.apache.tika.parser.opendocument;
 
 import java.io.InputStream;
-import java.io.StringWriter;
 
 import junit.framework.TestCase;
 
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
 public class OpenOfficeParserTest extends TestCase {
@@ -32,8 +31,7 @@
                 "/test-documents/testOpenOffice2.odt");
         try {
             Metadata metadata = new Metadata();
-            StringWriter writer = new StringWriter();
-            ContentHandler handler = new WriteOutContentHandler(writer);
+            ContentHandler handler = new BodyContentHandler();
             new OpenOfficeParser().parse(input, handler, metadata);
 
             assertEquals(
@@ -52,7 +50,7 @@
             assertEquals("14", metadata.get("nbWord"));
             assertEquals("78", metadata.get("nbCharacter"));
 
-            String content = writer.toString();
+            String content = handler.toString();
             assertTrue(content.contains(
                     "This is a sample Open Office document,"
                     + " written in NeoOffice 2.2.1 for the Mac."));

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Thu Apr 10 03:52:06 2008
@@ -21,7 +21,9 @@
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
 
 import junit.framework.TestCase;
 
@@ -59,30 +61,24 @@
     public void testUTF8Text() throws Exception {
         String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
 
+        ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
-        StringWriter writer = new StringWriter();
         parser.parse(
                 new ByteArrayInputStream(text.getBytes("UTF-8")),
-                new WriteOutContentHandler(writer),
-                metadata);
-        String content = writer.toString();
-
+                handler, metadata);
         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
 
-        assertTrue(content.contains(text));
+        assertTrue(handler.toString().contains(text));
     }
 
     public void testEmptyText() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
-        StringWriter writer = new StringWriter();
         parser.parse(
-                new ByteArrayInputStream(new byte[0]),
-                new WriteOutContentHandler(writer),
-                metadata);
-        String content = writer.toString();
+                new ByteArrayInputStream(new byte[0]), handler, metadata);
         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("", content);
+        assertEquals("", handler.toString());
     }
 
 }

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,13 +17,13 @@
 package org.apache.tika.parser.xml;
 
 import java.io.InputStream;
-import java.io.StringWriter;
 
 import junit.framework.TestCase;
 
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
 
 public class DcXMLParserTest extends TestCase {
 
@@ -32,8 +32,7 @@
                 "/test-documents/testXML.xml");
         try {
             Metadata metadata = new Metadata();
-            StringWriter writer = new StringWriter();
-            ContentHandler handler = new WriteOutContentHandler(writer);
+            ContentHandler handler = new BodyContentHandler();
             new DcXMLParser().parse(input, handler, metadata);
 
             assertEquals(
@@ -55,7 +54,7 @@
             assertEquals("Fr", metadata.get(Metadata.LANGUAGE));
             assertTrue(metadata.get(Metadata.RIGHTS).contains("testing chars"));
 
-            String content = writer.toString();
+            String content = handler.toString();
             assertTrue(content.contains("Tika test document"));
         } finally {
             input.close();
@@ -66,9 +65,7 @@
         InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml");
         try {
             Metadata metadata = new Metadata();
-            StringWriter writer = new StringWriter();
-            ContentHandler handler = new WriteOutContentHandler(writer);
-            new DcXMLParser().parse(input, handler, metadata);
+            new DcXMLParser().parse(input, new DefaultHandler(), metadata);
             
             final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
             assertEquals(expected,metadata.get(Metadata.RIGHTS));