You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/04/10 12:52:12 UTC
svn commit: r646748 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/cli/ src/main/java/org/apache/tika/gui/
src/main/java/org/apache/tika/parser/
src/main/java/org/apache/tika/parser/html/
src/main/java/org/apache/tika/sax/ src/main/java/o...
Author: jukka
Date: Thu Apr 10 03:52:06 2008
New Revision: 646748
URL: http://svn.apache.org/viewvc?rev=646748&view=rev
Log:
TIKA-113: Metadata (such as title) should not be part of content
- Added BodyContentHandler that only processes XHTML body events
- Added utility constructors for WriteOutContentHandler and BodyContentHandler
- Updated test cases and related code to use BodyContentHandler where appropriate
- Removed AppendableAdaptor class as it's not used anymore
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/sax/AppendableAdaptor.java
incubator/tika/trunk/src/test/java/org/apache/tika/sax/AppendableAdaptorTest.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java
incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java
incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Thu Apr 10 03:52:06 2008
@@ -43,6 +43,8 @@
18. TIKA-138 - Ignore HTML style and script content (Jukka Zitting)
+19. TIKA-113 - Metadata (such as title) should not be part of content
+ (Jukka Zitting)
Release 0.1-incubating - 12/27/2007
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/cli/TikaCLI.java Thu Apr 10 03:52:06 2008
@@ -18,11 +18,8 @@
import java.io.File;
import java.io.FileInputStream;
-import java.io.IOException;
import java.io.InputStream;
-import java.io.OutputStreamWriter;
import java.io.PrintStream;
-import java.io.Writer;
import java.net.URL;
import java.util.Arrays;
@@ -41,12 +38,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
+import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
@@ -176,17 +169,7 @@
}
private ContentHandler getTextContentHandler() {
- final Writer writer = new OutputStreamWriter(System.out);
- XPathParser parser =
- new XPathParser("xhtml", XHTMLContentHandler.XHTML);
- return new MatchingContentHandler(
- new WriteOutContentHandler(writer),
- parser.parse("/xhtml:html/xhtml:body//text()")) {
- public void endDocument() throws SAXException {
- super.endDocument();
- try { writer.flush(); } catch (IOException e) {}
- }
- };
+ return new BodyContentHandler(System.out);
}
private ContentHandler getMetadataContentHandler() {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Apr 10 03:52:06 2008
@@ -43,12 +43,10 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -256,11 +254,7 @@
}
private ContentHandler getTextContentHandler(Writer writer) {
- XPathParser parser =
- new XPathParser("xhtml", XHTMLContentHandler.XHTML);
- return new MatchingContentHandler(
- new WriteOutContentHandler(writer),
- parser.parse("/xhtml:html/xhtml:body//text()"));
+ return new BodyContentHandler(writer);
}
private ContentHandler getXmlContentHandler(Writer writer)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java Thu Apr 10 03:52:06 2008
@@ -18,12 +18,11 @@
import java.io.IOException;
import java.io.InputStream;
-import java.io.StringWriter;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.utils.RegexUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -53,12 +52,10 @@
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
- StringWriter writer = new StringWriter();
- handler = new TeeContentHandler(
- handler, new WriteOutContentHandler(writer));
- super.parse(stream, handler, metadata);
+ ContentHandler body = new BodyContentHandler();
+ super.parse(stream, new TeeContentHandler(handler, body), metadata);
- String content = writer.toString();
+ String content = body.toString();
metadata.set("fulltext", content);
int length = Math.min(content.length(), 500);
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Apr 10 03:52:06 2008
@@ -18,7 +18,6 @@
import java.io.IOException;
import java.io.InputStream;
-import java.io.StringWriter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@@ -109,11 +108,10 @@
}
private ContentHandler getTitleHandler(final Metadata metadata) {
- final StringWriter writer = new StringWriter();
- return new WriteOutContentHandler(writer) {
+ return new WriteOutContentHandler() {
@Override
public void endElement(String u, String l, String n) {
- metadata.set(Metadata.TITLE, writer.toString());
+ metadata.set(Metadata.TITLE, toString());
}
};
}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=646748&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java Thu Apr 10 03:52:06 2008
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.OutputStream;
+import java.io.Writer;
+
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Content handler decorator that only passes the XHTML &lt;body/&gt;
+ * tag and everything inside it to the underlying handler.
+ */
+public class BodyContentHandler extends ContentHandlerDecorator {
+
+ /**
+ * XHTML XPath parser.
+ */
+ private static final XPathParser PARSER =
+ new XPathParser("xhtml", XHTMLContentHandler.XHTML);
+
+ /**
+ * The XPath matcher used to select the XHTML body contents.
+ */
+ private static final Matcher MATCHER =
+ PARSER.parse("/xhtml:html/xhtml:body//node()");
+
+ /**
+ * Creates a content handler that passes all XHTML body events to the
+ * given underlying content handler.
+ *
+ * @param handler content handler
+ */
+ public BodyContentHandler(ContentHandler handler) {
+ super(new MatchingContentHandler(handler, MATCHER));
+ }
+
+ /**
+ * Creates a content handler that writes XHTML body character events to
+ * the given writer.
+ *
+ * @param writer writer
+ */
+ public BodyContentHandler(Writer writer) {
+ this(new WriteOutContentHandler(writer));
+ }
+
+ /**
+ * Creates a content handler that writes XHTML body character events to
+ * the given output stream using the default encoding.
+ *
+ * @param stream output stream
+ */
+ public BodyContentHandler(OutputStream stream) {
+ this(new WriteOutContentHandler(stream));
+ }
+
+ /**
+ * Creates a content handler that writes XHTML body character events to
+ * an internal string buffer. The contents of the buffer can be retrieved
+ * using the {@link #toString()} method.
+ */
+ public BodyContentHandler() {
+ this(new WriteOutContentHandler());
+ }
+
+}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java Thu Apr 10 03:52:06 2008
@@ -94,4 +94,8 @@
handler.skippedEntity(name);
}
+ public String toString() {
+ return handler.toString();
+ }
+
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java Thu Apr 10 03:52:06 2008
@@ -17,6 +17,9 @@
package org.apache.tika.sax;
import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
import java.io.Writer;
import org.xml.sax.SAXException;
@@ -33,10 +36,39 @@
*/
private final Writer writer;
+ /**
+ * Creates a content handler that writes character events to
+ * the given writer.
+ *
+ * @param writer writer
+ */
public WriteOutContentHandler(Writer writer) {
this.writer = writer;
}
+ /**
+ * Creates a content handler that writes character events to
+ * the given output stream using the default encoding.
+ *
+ * @param stream output stream
+ */
+ public WriteOutContentHandler(OutputStream stream) {
+ this(new OutputStreamWriter(stream));
+ }
+
+ /**
+ * Creates a content handler that writes character events
+ * to an internal string buffer. Use the {@link #toString()}
+ * method to access the collected character content.
+ */
+ public WriteOutContentHandler() {
+ this(new StringWriter());
+ }
+
+ /**
+ * Writes the given characters to the given character stream.
+ */
+ @Override
public void characters(char[] ch, int start, int length)
throws SAXException {
try {
@@ -44,6 +76,18 @@
} catch (IOException e) {
throw new SAXException("Error writing out character content", e);
}
+ }
+
+ /**
+ * Returns the contents of the internal string buffer where
+ * all the received characters have been collected. Only works
+ * when this object was constructed using the empty default
+ * constructor or by passing a {@link StringWriter} to the
+ * other constructor.
+ */
+ @Override
+ public String toString() {
+ return writer.toString();
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java Thu Apr 10 03:52:06 2008
@@ -23,7 +23,6 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
-import java.io.StringWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
@@ -33,7 +32,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMimeKeys;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
@@ -169,10 +169,9 @@
throws TikaException, IOException {
try {
Parser parser = config.getParser(mimeType);
- StringWriter writer = new StringWriter();
- parser.parse(
- stream, new WriteOutContentHandler(writer), new Metadata());
- return writer.toString();
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(stream, handler, new Metadata());
+ return handler.toString();
} catch (SAXException e) {
throw new TikaException("Unexpected SAX error", e);
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Thu Apr 10 03:52:06 2008
@@ -18,12 +18,11 @@
import java.io.IOException;
import java.io.InputStream;
-import java.io.StringWriter;
import org.apache.commons.lang.builder.ReflectionToStringBuilder;
import org.apache.commons.lang.builder.ToStringStyle;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import junit.framework.TestCase;
@@ -63,15 +62,14 @@
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
- StringWriter writer = new StringWriter();
- ContentHandler handler = new WriteOutContentHandler(writer);
+ ContentHandler handler = new BodyContentHandler();
new AutoDetectParser().parse(input, handler, metadata);
assertEquals("Bad content type: " + tp,
tp.realType, metadata.get(Metadata.CONTENT_TYPE));
assertTrue("Expected content not found: " + tp,
- writer.toString().contains(tp.expectedContentFragment));
+ handler.toString().contains(tp.expectedContentFragment));
} finally {
input.close();
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Thu Apr 10 03:52:06 2008
@@ -26,9 +26,12 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
public class HtmlParserTest extends TestCase {
@@ -40,32 +43,32 @@
}
public void testParseAscii() throws Exception {
- StringWriter writer = new StringWriter();
final StringWriter href = new StringWriter();
-
+
+ ContentHandler body = new BodyContentHandler();
+ ContentHandler link = new DefaultHandler() {
+ @Override
+ public void startElement(
+ String u, String l, String n, Attributes a)
+ throws SAXException {
+ if ("a".equals(l)) {
+ href.append(a.getValue("href"));
+ }
+ }
+ };
Metadata metadata = new Metadata();
- parser.parse(
- getStream("test-documents/testHTML.html"),
- new WriteOutContentHandler(writer) {
- @Override
- public void startElement(
- String uri, String local, String name,
- Attributes attributes) {
- if ("a".equals(local)) {
- href.append(attributes.getValue("href"));
- }
- }
- },
- metadata);
+ InputStream stream = getStream("test-documents/testHTML.html");
+ try {
+ parser.parse(stream, new TeeContentHandler(body, link), metadata);
+ } finally {
+ stream.close();
+ }
assertEquals(
"Title : Test Indexation Html", metadata.get(Metadata.TITLE));
assertEquals("http://www.apache.org/", href.toString());
- String content = writer.toString();
- assertTrue(
- "Did not contain expected text: Title : Test Indexation Html",
- content.contains("Title : Test Indexation Html"));
+ String content = body.toString();
assertTrue(
"Did not contain expected text:" + "Test Indexation Html",
content.contains("Test Indexation Html"));
@@ -76,13 +79,13 @@
}
public void XtestParseUTF8() throws IOException, SAXException, TikaException {
-
- StringWriter writer = new StringWriter();
+ ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- parser.parse(getStream("test-documents/testHTML_utf8.html"),
- new WriteOutContentHandler(writer), metadata);
- String content = writer.toString();
+ parser.parse(
+ getStream("test-documents/testHTML_utf8.html"),
+ handler, metadata);
+ String content = handler.toString();
assertTrue("Did not contain expected text:"
+ "Title : Tilte with UTF-8 chars öäå", content
@@ -100,8 +103,9 @@
public void testParseEmpty() throws Exception {
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
- parser.parse(new ByteArrayInputStream(new byte[0]),
- new WriteOutContentHandler(writer), metadata);
+ parser.parse(
+ new ByteArrayInputStream(new byte[0]),
+ new BodyContentHandler(writer), metadata);
String content = writer.toString();
assertEquals("", content);
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,14 +17,13 @@
package org.apache.tika.parser.microsoft;
import java.io.InputStream;
-import java.io.StringWriter;
+
+import junit.framework.TestCase;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
-import junit.framework.TestCase;
-
public class ExcelParserTest extends TestCase {
public void testExcelParser() throws Exception {
@@ -32,8 +31,7 @@
"/test-documents/testEXCEL.xls");
try {
Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- ContentHandler handler = new WriteOutContentHandler(writer);
+ ContentHandler handler = new BodyContentHandler();
new OfficeParser().parse(input, handler, metadata);
assertEquals(
@@ -41,7 +39,7 @@
metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- String content = writer.toString();
+ String content = handler.toString();
assertTrue(content.contains("Sample Excel Worksheet"));
assertTrue(content.contains("Numbers and their Squares"));
assertTrue(content.contains("9"));
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,10 +17,9 @@
package org.apache.tika.parser.microsoft;
import java.io.InputStream;
-import java.io.StringWriter;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import junit.framework.TestCase;
@@ -32,8 +31,7 @@
"/test-documents/testPPT.ppt");
try {
Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- ContentHandler handler = new WriteOutContentHandler(writer);
+ ContentHandler handler = new BodyContentHandler();
new OfficeParser().parse(input, handler, metadata);
assertEquals(
@@ -41,7 +39,7 @@
metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- String content = writer.toString();
+ String content = handler.toString();
assertTrue(content.contains("Sample Powerpoint Slide"));
assertTrue(content.contains("Powerpoint X for Mac"));
} finally {
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,10 +17,9 @@
package org.apache.tika.parser.microsoft;
import java.io.InputStream;
-import java.io.StringWriter;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import junit.framework.TestCase;
@@ -31,9 +30,8 @@
InputStream input = WordParserTest.class.getResourceAsStream(
"/test-documents/testWORD.doc");
try {
+ ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- ContentHandler handler = new WriteOutContentHandler(writer);
new OfficeParser().parse(input, handler, metadata);
assertEquals(
@@ -41,8 +39,7 @@
metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- String content = writer.toString();
- assertTrue(content.contains("Sample Word Document"));
+ assertTrue(handler.toString().contains("Sample Word Document"));
} finally {
input.close();
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,12 +17,11 @@
package org.apache.tika.parser.opendocument;
import java.io.InputStream;
-import java.io.StringWriter;
import junit.framework.TestCase;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
public class OpenOfficeParserTest extends TestCase {
@@ -32,8 +31,7 @@
"/test-documents/testOpenOffice2.odt");
try {
Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- ContentHandler handler = new WriteOutContentHandler(writer);
+ ContentHandler handler = new BodyContentHandler();
new OpenOfficeParser().parse(input, handler, metadata);
assertEquals(
@@ -52,7 +50,7 @@
assertEquals("14", metadata.get("nbWord"));
assertEquals("78", metadata.get("nbCharacter"));
- String content = writer.toString();
+ String content = handler.toString();
assertTrue(content.contains(
"This is a sample Open Office document,"
+ " written in NeoOffice 2.2.1 for the Mac."));
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Thu Apr 10 03:52:06 2008
@@ -21,7 +21,9 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
import junit.framework.TestCase;
@@ -59,30 +61,24 @@
public void testUTF8Text() throws Exception {
String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+ ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
parser.parse(
new ByteArrayInputStream(text.getBytes("UTF-8")),
- new WriteOutContentHandler(writer),
- metadata);
- String content = writer.toString();
-
+ handler, metadata);
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
- assertTrue(content.contains(text));
+ assertTrue(handler.toString().contains(text));
}
public void testEmptyText() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
parser.parse(
- new ByteArrayInputStream(new byte[0]),
- new WriteOutContentHandler(writer),
- metadata);
- String content = writer.toString();
+ new ByteArrayInputStream(new byte[0]), handler, metadata);
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("", content);
+ assertEquals("", handler.toString());
}
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=646748&r1=646747&r2=646748&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Thu Apr 10 03:52:06 2008
@@ -17,13 +17,13 @@
package org.apache.tika.parser.xml;
import java.io.InputStream;
-import java.io.StringWriter;
import junit.framework.TestCase;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
public class DcXMLParserTest extends TestCase {
@@ -32,8 +32,7 @@
"/test-documents/testXML.xml");
try {
Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- ContentHandler handler = new WriteOutContentHandler(writer);
+ ContentHandler handler = new BodyContentHandler();
new DcXMLParser().parse(input, handler, metadata);
assertEquals(
@@ -55,7 +54,7 @@
assertEquals("Fr", metadata.get(Metadata.LANGUAGE));
assertTrue(metadata.get(Metadata.RIGHTS).contains("testing chars"));
- String content = writer.toString();
+ String content = handler.toString();
assertTrue(content.contains("Tika test document"));
} finally {
input.close();
@@ -66,9 +65,7 @@
InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml");
try {
Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- ContentHandler handler = new WriteOutContentHandler(writer);
- new DcXMLParser().parse(input, handler, metadata);
+ new DcXMLParser().parse(input, new DefaultHandler(), metadata);
final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
assertEquals(expected,metadata.get(Metadata.RIGHTS));