You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/10/27 18:13:27 UTC
svn commit: r1189827 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/config/
tika-core/src/main/java/org/apache/tika/detect/
tika-core/src/main/java/org/apache/tika/io/
tika-core/src/main/java/org/apache/tika/metadata/ tika-core/src/main/java...
Author: jukka
Date: Thu Oct 27 16:13:26 2011
New Revision: 1189827
URL: http://svn.apache.org/viewvc?rev=1189827&view=rev
Log:
TIKA-703: Drop deprecated methods/classes/interfaces
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/io/
tika/trunk/tika-core/src/test/resources/org/apache/tika/io/test.txt
- copied, changed from r1189765, tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
Removed:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryFiles.java
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ExternalParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ParseUtils.java
tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/Parser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Thu Oct 27 16:13:26 2011
@@ -86,28 +86,10 @@ public class TikaConfig {
this(getBuilder().parse(stream));
}
- /**
- * @deprecated This method will be removed in Apache Tika 1.0
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
- */
- public TikaConfig(InputStream stream, Parser delegate)
- throws TikaException, IOException, SAXException {
- this(stream);
- }
-
public TikaConfig(Document document) throws TikaException, IOException {
this(document.getDocumentElement());
}
- /**
- * @deprecated This method will be removed in Apache Tika 1.0
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
- */
- public TikaConfig(Document document, Parser delegate)
- throws TikaException, IOException {
- this(document);
- }
-
public TikaConfig(Element element) throws TikaException, IOException {
this(element, ServiceLoader.getContextClassLoader());
}
@@ -165,8 +147,7 @@ public class TikaConfig {
this.mimeTypes = MimeTypes.getDefaultMimeTypes();
this.parser = new DefaultParser(
mimeTypes.getMediaTypeRegistry(), loader);
- this.detector = new DefaultDetector(
- MimeTypes.getDefaultMimeTypes(), loader);
+ this.detector = new DefaultDetector(mimeTypes, loader);
} else {
InputStream stream;
File file = new File(config);
@@ -198,15 +179,6 @@ public class TikaConfig {
}
}
- /**
- * @deprecated This method will be removed in Apache Tika 1.0
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
- */
- public TikaConfig(Element element, Parser delegate)
- throws TikaException, IOException {
- this(element);
- }
-
private static String getText(Node node) {
if (node.getNodeType() == Node.TEXT_NODE) {
return node.getNodeValue();
@@ -223,13 +195,6 @@ public class TikaConfig {
}
/**
- * @deprecated Use the {@link #getParser()} method instead
- */
- public Parser getParser(MediaType mimeType) {
- return parser.getParsers().get(mimeType);
- }
-
- /**
* Returns the configured parser instance.
*
* @return configured parser
@@ -239,13 +204,6 @@ public class TikaConfig {
}
/**
- * @deprecated Use the {@link #getParser()} method instead
- */
- public Map<MediaType, Parser> getParsers() {
- return parser.getParsers();
- }
-
- /**
* Returns the configured detector instance.
*
* @return configured detector
@@ -281,15 +239,6 @@ public class TikaConfig {
}
}
- /**
- * @deprecated This method will be removed in Apache Tika 1.0
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
- */
- public static TikaConfig getDefaultConfig(Parser delegate)
- throws TikaException {
- return getDefaultConfig();
- }
-
private static DocumentBuilder getBuilder() throws TikaException {
try {
return DocumentBuilderFactory.newInstance().newDocumentBuilder();
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Thu Oct 27 16:13:26 2011
@@ -119,13 +119,6 @@ public class TikaInputStream extends Tag
}
/**
- * @deprecated Use the {@link #get(InputStream, TemporaryResources)} instead
- */
- public static TikaInputStream get(InputStream stream, TemporaryFiles tmp) {
- return get(stream, (TemporaryResources) tmp);
- }
-
- /**
* Casts or wraps the given stream to a TikaInputStream instance.
* This method can be used to access the functionality of this class
* even when given just a normal input stream instance.
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Thu Oct 27 16:13:26 2011
@@ -18,13 +18,11 @@ package org.apache.tika.mime;
// JDK imports
import java.io.ByteArrayInputStream;
-import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
-import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
@@ -34,7 +32,6 @@ import java.util.TreeSet;
import javax.xml.namespace.QName;
-import org.apache.tika.Tika;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.TextDetector;
import org.apache.tika.detect.XmlRootExtractor;
@@ -120,40 +117,13 @@ public final class MimeTypes implements
}
/**
- * Find the Mime Content Type of a file.
- *
- * @deprecated Use the {@link Tika#detect(File)} method
- * @param file
- * to analyze.
- * @return the Mime Content Type of the specified file, or <code>null</code>
- * if none is found.
- */
- public MimeType getMimeType(File file) {
- return getMimeType(file.getName());
- }
-
- /**
- * Find the Mime Content Type of a document from its URL.
- *
- * @deprecated Use the {@link Tika#detect(URL)} method
- * @param url
- * of the document to analyze.
- * @return the Mime Content Type of the specified document URL, or
- * <code>null</code> if none is found.
- */
- public MimeType getMimeType(URL url) {
- return getMimeType(url.getPath());
- }
-
- /**
* Find the Mime Content Type of a document from its name.
* Returns application/octet-stream if no better match is found.
*
- * @deprecated Use the {@link Tika#detect(String)} method
* @param name of the document to analyze.
* @return the Mime Content Type of the specified document name
*/
- public MimeType getMimeType(String name) {
+ private MimeType getMimeType(String name) {
MimeType type = patterns.matches(name);
if (type != null) {
return type;
@@ -174,11 +144,10 @@ public final class MimeTypes implements
* The given byte array is expected to be at least {@link #getMinLength()}
* long, or shorter only if the document stream itself is shorter.
*
- * @deprecated Use the {@link Tika#detect(byte[])} method
* @param data first few bytes of a document stream
* @return matching MIME type
*/
- public MimeType getMimeType(byte[] data) {
+ private MimeType getMimeType(byte[] data) {
if (data == null) {
throw new IllegalArgumentException("Data is missing");
} else if (data.length == 0) {
@@ -232,19 +201,6 @@ public final class MimeTypes implements
}
/**
- * Returns the MIME type that best matches the first few bytes of the
- * given document stream.
- *
- * @deprecated Use the {@link Tika#detect(InputStream)} method
- * @param stream document stream
- * @return matching MIME type, or <code>null</code> if no match is found
- * @throws IOException if the stream can be read
- */
- public MimeType getMimeType(InputStream stream) throws IOException {
- return getMimeType(readMagicHeader(stream));
- }
-
- /**
* Reads the first {@link #getMinLength()} bytes from the given stream.
* If the stream is shorter, then the entire content of the stream is
* returned.
@@ -280,98 +236,6 @@ public final class MimeTypes implements
}
/**
- * @deprecated Use the {@link Tika#detect(InputStream, Metadata))} method
- */
- public String getType(String typeName, String url, byte[] data) {
- try {
- Metadata metadata = new Metadata();
- if (url != null) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, url);
- }
- if (typeName != null) {
- metadata.set(Metadata.CONTENT_TYPE, typeName);
- }
- return detect(new ByteArrayInputStream(data), metadata).toString();
- } catch (IOException e) {
- throw new IllegalStateException(
- "ByteArrayInputStream throws an IOException!", e);
- }
- }
-
- /**
- * Determines the MIME type of the resource pointed to by the specified URL.
- * Examines the file's header, and if it cannot determine the MIME type
- * from the header, guesses the MIME type from the URL extension
- * (e.g. "pdf).
- *
- * @deprecated Use the {@link Tika#detect(URL)} method
- * @param url URL of the document
- * @return type of the document
- * @throws IOException if the document can not be accessed
- */
- public String getType(URL url) throws IOException {
- InputStream stream = url.openStream();
- try {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, url.toString());
- return detect(stream, metadata).toString();
- } finally {
- stream.close();
- }
- }
-
- /**
- * Find the Mime Content Type of a document from its name and its content.
- * The policy used to guess the Mime Content Type is:
- * <ol>
- * <li>Try to find the type based on the provided data.</li>
- * <li>If a type is found, then return it, otherwise try to find the type
- * based on the file name</li>
- * </ol>
- *
- *
- * @deprecated Use the {@link Tika#detect(byte[], String)} method
- * @param name
- * of the document to analyze.
- * @param data
- * are the first bytes of the document's content.
- * @return the Mime Content Type of the specified document, or
- * <code>null</code> if none is found.
- * @see #getMinLength()
- */
- public MimeType getMimeType(String name, byte[] data) {
- // First, try to get the mime-type from the content
- MimeType dataType = getMimeType(data);
-
- // Then, try to get the mime-type from the document name
- MimeType nameType = getMimeType(name);
-
- // Use the more specific of the two types
- if (registry.isSpecializationOf(
- nameType.getType(), dataType.getType())) {
- return nameType;
- } else {
- return dataType;
- }
- }
-
- /**
- * Returns the MIME type that best matches the given document name and
- * the first few bytes of the given document stream.
- *
- * @deprecated Use the {@link Tika#detect(InputStream,String)} method
- * @see #getMimeType(String, byte[])
- * @param name document name
- * @param stream document stream
- * @return matching MIME type, or <code>null</code> if no match is found
- * @throws IOException if the stream can not be read
- */
- public MimeType getMimeType(String name, InputStream stream)
- throws IOException {
- return getMimeType(name, readMagicHeader(stream));
- }
-
- /**
* Returns the registered media type with the given name (or alias).
* The named media type is automatically registered (and returned) if
* it doesn't already exist.
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java Thu Oct 27 16:13:26 2011
@@ -40,7 +40,12 @@ public abstract class AbstractParser imp
/**
* Calls the
* {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)}
- * method with an empty {@link ParseContext}.
+ * method with an empty {@link ParseContext}. This method exists as a
+ * leftover from Tika 0.x when the three-argument parse() method still
+ * existed in the {@link Parser} interface. No new code should call this
+ * method anymore; it's only here for backwards compatibility.
+ *
+ * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} method instead
*/
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Thu Oct 27 16:13:26 2011
@@ -80,15 +80,6 @@ public class AutoDetectParser extends Co
}
/**
- * @deprecated This method will be removed in Tika 1.0
- */
- public void setConfig(TikaConfig config) {
- setParsers(config.getParsers());
- setDetector(config.getDetector());
- setMediaTypeRegistry(config.getMediaTypeRegistry());
- }
-
- /**
* Returns the type detector used by this parser to auto-detect the type
* of a document.
*
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java Thu Oct 27 16:13:26 2011
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser;
-import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
@@ -25,7 +24,6 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
/**
* Dummy parser that always throws a {@link TikaException} without even
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/Parser.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/Parser.java Thu Oct 27 16:13:26 2011
@@ -67,16 +67,4 @@ public interface Parser extends Serializ
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException;
- /**
- * The parse() method from Tika 0.4 and earlier. Please use the
- * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
- * method instead in new code. Calls to this backwards compatibility
- * method are forwarded to the new parse() method with an empty parse
- * context.
- *
- * @deprecated This method will be removed in Apache Tika 1.0.
- */
- void parse(InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException;
-
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java Thu Oct 27 16:13:26 2011
@@ -101,7 +101,8 @@ public class ParsingReader extends Reade
* @throws IOException if the document can not be parsed
*/
public ParsingReader(InputStream stream) throws IOException {
- this(new AutoDetectParser(), stream, new Metadata());
+ this(new AutoDetectParser(), stream, new Metadata(), new ParseContext());
+ context.set(Parser.class, parser);
}
/**
@@ -113,7 +114,8 @@ public class ParsingReader extends Reade
* @throws IOException if the document can not be parsed
*/
public ParsingReader(InputStream stream, String name) throws IOException {
- this(new AutoDetectParser(), stream, getMetadata(name));
+ this(new AutoDetectParser(), stream, getMetadata(name), new ParseContext());
+ context.set(Parser.class, parser);
}
/**
@@ -203,27 +205,6 @@ public class ParsingReader extends Reade
}
/**
- * @deprecated This method will be removed in Apache Tika 1.0
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
- */
- public ParsingReader(Parser parser, InputStream stream, Metadata metadata)
- throws IOException {
- this(parser, stream, metadata, new ParseContext());
- context.set(Parser.class, parser);
- }
-
- /**
- * @deprecated This method will be removed in Apache Tika 1.0
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
- */
- public ParsingReader(
- Parser parser, InputStream stream, Metadata metadata,
- Executor executor) throws IOException {
- this(parser, stream, metadata, new ParseContext(), executor);
- context.set(Parser.class, parser);
- }
-
- /**
* The background parsing task.
*/
private class ParsingTask implements Runnable {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java Thu Oct 27 16:13:26 2011
@@ -188,13 +188,6 @@ public class SafeContentHandler extends
}
/**
- * @deprecated Use {@link #isInvalid(int)} instead
- */
- protected boolean isInvalid(char ch) {
- return isInvalid((int) ch);
- }
-
- /**
* Outputs the replacement for an invalid character. Subclasses can
* override this method to use a custom replacement.
*
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java Thu Oct 27 16:13:26 2011
@@ -24,6 +24,9 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.net.URL;
+
+import org.apache.tika.metadata.Metadata;
import junit.framework.TestCase;
@@ -103,4 +106,14 @@ public class TikaInputStreamTest extends
return buffer.toString("UTF-8");
}
+ public void testGetMetadata() throws Exception {
+ URL url = TikaInputStreamTest.class.getResource("test.txt");
+ Metadata metadata = new Metadata();
+ TikaInputStream.get(url, metadata).close();
+ assertEquals("test.txt", metadata.get(Metadata.RESOURCE_NAME_KEY));
+ assertEquals(
+ Long.toString(new File(url.toURI()).length()),
+ metadata.get(Metadata.CONTENT_LENGTH));
+ }
+
}
Copied: tika/trunk/tika-core/src/test/resources/org/apache/tika/io/test.txt (from r1189765, tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/io/test.txt?p2=tika/trunk/tika-core/src/test/resources/org/apache/tika/io/test.txt&p1=tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt&r1=1189765&r2=1189827&rev=1189827&view=diff
==============================================================================
(empty)
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java Thu Oct 27 16:13:26 2011
@@ -22,9 +22,8 @@ import java.io.InputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.ParseUtils;
import org.xml.sax.helpers.DefaultHandler;
/**
@@ -34,62 +33,21 @@ public class TestParsers extends TikaTes
private TikaConfig tc;
+ private Tika tika;
+
public void setUp() throws Exception {
tc = TikaConfig.getDefaultConfig();
- }
-
- public void testPDFExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testPDF.pdf");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");
- String s3 = ParseUtils.getStringContent(file, TikaConfig
- .getDefaultConfig());
- assertEquals(s1, s2);
- assertEquals(s1, s3);
- }
-
- public void testTXTExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testTXT.txt");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "text/plain");
- assertEquals(s1, s2);
- }
-
- public void testXMLExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testXML.xml");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/xml");
- assertEquals(s1, s2);
- }
-
- public void testPPTExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testPPT.ppt");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc,
- "application/vnd.ms-powerpoint");
- assertEquals(s1, s2);
- Parser parser =
- tc.getParser(MediaType.parse("application/vnd.ms-powerpoint"));
- Metadata metadata = new Metadata();
- InputStream stream = new FileInputStream(file);
- try {
- parser.parse(stream, new DefaultHandler(), metadata);
- } finally {
- stream.close();
- }
- assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
+ tika = new Tika(tc);
}
public void testWORDxtraction() throws Exception {
File file = getResourceAsFile("/test-documents/testWORD.doc");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/msword");
- assertEquals(s1, s2);
- Parser parser = tc.getParser(MediaType.parse("application/msword"));
+ Parser parser = tika.getParser();
Metadata metadata = new Metadata();
InputStream stream = new FileInputStream(file);
try {
- parser.parse(stream, new DefaultHandler(), metadata);
+ parser.parse(
+ stream, new DefaultHandler(), metadata, new ParseContext());
} finally {
stream.close();
}
@@ -99,75 +57,27 @@ public class TestParsers extends TikaTes
public void testEXCELExtraction() throws Exception {
final String expected = "Numbers and their Squares";
File file = getResourceAsFile("/test-documents/testEXCEL.xls");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc,
- "application/vnd.ms-excel");
- assertEquals(s1, s2);
+ String s1 = tika.parseToString(file);
assertTrue("Text does not contain '" + expected + "'", s1
.contains(expected));
- Parser parser =
- tc.getParser(MediaType.parse("application/vnd.ms-excel"));
+ Parser parser = tika.getParser();
Metadata metadata = new Metadata();
InputStream stream = new FileInputStream(file);
try {
- parser.parse(stream, new DefaultHandler(), metadata);
+ parser.parse(
+ stream, new DefaultHandler(), metadata, new ParseContext());
} finally {
stream.close();
}
assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
}
- public void testOOExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testOpenOffice2.odt");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc,
- "application/vnd.oasis.opendocument.text");
- assertEquals(s1, s2);
- }
-
- public void testOutlookExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/test-outlook.msg");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc,
- "application/vnd.ms-outlook");
- assertEquals(s1, s2);
- }
-
- public void testHTMLExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testHTML.html");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "text/html");
- assertEquals(s1, s2);
-
- Parser parser = tc.getParser(MediaType.parse("text/html"));
- assertNotNull(parser);
- }
-
- public void testZipFileExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/test-documents.zip");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/zip");
- assertEquals(s1, s2);
-
- Parser parser = tc.getParser(MediaType.parse("application/zip"));
- assertNotNull(parser);
- }
-
- public void testMP3Extraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testMP3id3v1.mp3");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "audio/mpeg");
- assertEquals(s1, s2);
-
- Parser parser = tc.getParser(MediaType.parse("audio/mpeg"));
- assertNotNull(parser);
- }
-
public void testOptionalHyphen() throws Exception {
- final String[] extensions = new String[] {"ppt", "pptx", "doc", "docx", "rtf", "pdf"};
- for(String extension : extensions) {
+ String[] extensions =
+ new String[] { "ppt", "pptx", "doc", "docx", "rtf", "pdf"};
+ for (String extension : extensions) {
File file = getResourceAsFile("/test-documents/testOptionalHyphen." + extension);
- String content = ParseUtils.getStringContent(file, tc);
+ String content = tika.parseToString(file);
assertTrue("optional hyphen was not handled for '" + extension + "' file type: " + content,
content.contains("optionalhyphen") ||
content.contains("optional\u00adhyphen") || // soft hyphen
@@ -179,7 +89,7 @@ public class TestParsers extends TikaTes
private void verifyComment(String extension, String fileName) throws Exception {
File file = getResourceAsFile("/test-documents/" + fileName + "." + extension);
- String content = ParseUtils.getStringContent(file, tc);
+ String content = tika.parseToString(file);
assertTrue(extension + ": content=" + content + " did not extract text",
content.contains("Here is some text"));
assertTrue(extension + ": content=" + content + " did not extract comment",
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Thu Oct 27 16:13:26 2011
@@ -27,15 +27,13 @@ import org.apache.poi.poifs.filesystem.N
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypes;
/**
* Junit test class for {@link ContainerAwareDetector}
*/
public class TestContainerAwareDetector extends TestCase {
- private final Detector detector =
- new ContainerAwareDetector(MimeTypes.getDefaultMimeTypes());
+ private final Detector detector = new DefaultDetector();
private void assertDetect(String file, String type) throws Exception {
TikaInputStream stream = TikaInputStream.get(
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java Thu Oct 27 16:13:26 2011
@@ -19,8 +19,10 @@ package org.apache.tika.mime;
import static org.apache.tika.mime.MediaType.OCTET_STREAM;
import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
-import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
import junit.framework.TestCase;
@@ -93,15 +95,9 @@ public class MimeTypesTest extends TestC
assertTrue(html.compareTo(html) == 0);
}
- /** Test getMimeType(byte[]) */
- public void testGetMimeType_byteArray() {
- try {
- types.getMimeType((byte[])null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
-
+ /** Test getMimeType(byte[])
+ * @throws IOException */
+ public void testGetMimeType_byteArray() throws IOException {
// Plain text detection
assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
@@ -111,28 +107,19 @@ public class MimeTypesTest extends TestC
assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
}
- private void assertText(byte[] prefix) {
+ private void assertText(byte[] prefix) throws IOException {
assertMagic("text/plain", prefix);
}
- private void assertNotText(byte[] prefix) {
+ private void assertNotText(byte[] prefix) throws IOException {
assertMagic("application/octet-stream", prefix);
}
- private void assertMagic(String expected, byte[] prefix) {
- MimeType type = types.getMimeType(prefix);
+ private void assertMagic(String expected, byte[] prefix) throws IOException {
+ MediaType type =
+ types.detect(new ByteArrayInputStream(prefix), new Metadata());
assertNotNull(type);
- assertEquals(expected, type.getName());
- }
-
- /** Test getMimeType(InputStream) */
- public void testGetMimeType_InputStream() throws IOException {
- try {
- types.getMimeType((InputStream)null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
+ assertEquals(expected, type.toString());
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu Oct 27 16:13:26 2011
@@ -21,11 +21,11 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
-import java.net.MalformedURLException;
import java.net.URL;
import junit.framework.TestCase;
+import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
@@ -36,35 +36,27 @@ import org.apache.tika.metadata.Metadata
*/
public class TestMimeTypes extends TestCase {
- private MimeTypes repo;
+ private Tika tika;
- private static URL u;
+ private MimeTypes repo;
- static {
- try {
- u = new URL("http://mydomain.com/x.pdf?x=y");
- } catch (MalformedURLException e) {
- fail(e.getMessage());
- }
- }
+ private URL u;
private static final File f = new File("/a/b/c/x.pdf");
- public TestMimeTypes() {
- try {
- repo = TikaConfig.getDefaultConfig().getMimeRepository();
- } catch (Exception e) {
- fail(e.getMessage());
- }
-
+ protected void setUp() throws Exception{
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ repo = config.getMimeRepository();
+ tika = new Tika(config);
+ u = new URL("http://mydomain.com/x.pdf?x=y");
}
public void testCaseSensitivity() {
- MimeType type = repo.getMimeType("test.PDF");
+ String type = tika.detect("test.PDF");
assertNotNull(type);
- assertEquals(repo.getMimeType("test.pdf"), type);
- assertEquals(repo.getMimeType("test.PdF"), type);
- assertEquals(repo.getMimeType("test.pdF"), type);
+ assertEquals(type, tika.detect("test.pdf"));
+ assertEquals(type, tika.detect("test.PdF"));
+ assertEquals(type, tika.detect("test.pdF"));
}
public void testLoadMimeTypes() throws MimeTypeException {
@@ -77,8 +69,8 @@ public class TestMimeTypes extends TestC
*/
public void testGuessMimeTypes() throws Exception {
assertTypeByName("application/pdf", "x.pdf");
- assertEquals("application/pdf", repo.getMimeType(u).getName());
- assertEquals("application/pdf", repo.getMimeType(f).getName());
+ assertEquals("application/pdf", tika.detect(u.toExternalForm()));
+ assertEquals("application/pdf", tika.detect(f.getPath()));
assertTypeByName("text/plain", "x.txt");
assertTypeByName("text/html", "x.htm");
assertTypeByName("text/html", "x.html");
@@ -361,15 +353,13 @@ public class TestMimeTypes extends TestC
String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
this.repo.addPattern(testType, pattern, true);
String testFileName = "rtg_sst_grb_0.5.12345678";
- assertNotNull(this.repo.getMimeType(testFileName));
- assertEquals(this.repo.getMimeType(testFileName).getName(), "foo/bar");
+ assertEquals("foo/bar", tika.detect(testFileName));
MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
this.repo.add(testType2);
assertNotNull(repo.forName("foo/bar2"));
this.repo.addPattern(testType2, pattern, false);
- assertNotNull(this.repo.getMimeType(testFileName));
- assertNotSame("foo/bar2", this.repo.getMimeType(testFileName).getName());
+ assertNotSame("foo/bar2", tika.detect(testFileName));
}
public void testRawDetection() throws Exception {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java Thu Oct 27 16:13:26 2011
@@ -79,8 +79,8 @@ public class ParsingReaderTest extends T
Metadata metadata = new Metadata();
InputStream stream = ParsingReaderTest.class.getResourceAsStream(
"/test-documents/testEXCEL.xls");
- Reader reader =
- new ParsingReader(new AutoDetectParser(), stream, metadata);
+ Reader reader = new ParsingReader(
+ new AutoDetectParser(), stream, metadata, new ParseContext());
try {
// Metadata should already be available
assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java Thu Oct 27 16:13:26 2011
@@ -21,6 +21,7 @@ import java.io.InputStream;
//TIKA imports
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.hdf.HDFParser;
import org.apache.tika.sax.BodyContentHandler;
@@ -54,7 +55,7 @@ public class HDFParserTest extends TestC
InputStream stream = HDFParser.class
.getResourceAsStream("/test-documents/test.he5");
try {
- parser.parse(stream, handler, metadata);
+ parser.parse(stream, handler, metadata, new ParseContext());
} finally {
stream.close();
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java Thu Oct 27 16:13:26 2011
@@ -17,6 +17,8 @@
package org.apache.tika.parser.image;
import junit.framework.TestCase;
+
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.metadata.Metadata;
@@ -34,7 +36,7 @@ public class TiffParserTest extends Test
metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
InputStream stream =
getClass().getResourceAsStream("/test-documents/testTIFF.tif");
- parser.parse(stream, new DefaultHandler(), metadata);
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
"more contributor license agreements. See the NOTICE file " +
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java Thu Oct 27 16:13:26 2011
@@ -16,13 +16,13 @@
*/
package org.apache.tika.parser.microsoft;
-import org.apache.tika.detect.ContainerAwareDetector;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
@@ -35,9 +35,7 @@ public class TNEFParserTest extends Abst
public void testBasics() throws Exception {
TikaInputStream stream = getTestFile(file);
- ContainerAwareDetector detector =
- new ContainerAwareDetector(MimeTypes.getDefaultMimeTypes());
-
+ Detector detector = new DefaultDetector();
try {
assertEquals(
MediaType.application("vnd.ms-tnef"),
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Oct 27 16:13:26 2011
@@ -26,8 +26,6 @@ import javax.xml.transform.sax.Transform
import javax.xml.transform.stream.StreamResult;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.ContainerAwareDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
@@ -38,16 +36,8 @@ import org.apache.tika.sax.BodyContentHa
import org.xml.sax.ContentHandler;
public class OOXMLParserTest extends TikaTest {
- private Parser parser;
-
- @Override
- protected void setUp() throws Exception {
- TikaConfig config = TikaConfig.getDefaultConfig();
- ContainerAwareDetector detector = new ContainerAwareDetector(
- config.getMimeRepository()
- );
- parser = new AutoDetectParser(detector);
- }
+
+ private Parser parser = new AutoDetectParser();
public void testExcel() throws Exception {
InputStream input = OOXMLParserTest.class
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java Thu Oct 27 16:13:26 2011
@@ -21,6 +21,7 @@ import java.io.InputStream;
//TIKA imports
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
@@ -46,7 +47,7 @@ public class NetCDFParserTest extends Te
InputStream stream = NetCDFParser.class
.getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
try {
- parser.parse(stream, handler, metadata);
+ parser.parse(stream, handler, metadata, new ParseContext());
} finally {
stream.close();
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1189827&r1=1189826&r2=1189827&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Thu Oct 27 16:13:26 2011
@@ -26,19 +26,18 @@ import javax.xml.transform.sax.SAXTransf
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
+import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;
-import org.apache.tika.utils.ParseUtils;
/**
* Junit test class for the Tika {@link RTFParser}
*/
public class RTFParserTest extends TikaTest {
- private RTFParser parser;
- private static final TikaConfig defaultConfig = TikaConfig.getDefaultConfig();
+
+ private Tika tika = new Tika();
private static class Result {
public final String text;
@@ -50,16 +49,12 @@ public class RTFParserTest extends TikaT
}
}
- public void setUp() throws Exception {
- parser = new RTFParser();
- }
-
public void testBasicExtraction() throws Exception {
File file = getResourceAsFile("/test-documents/testRTF.rtf");
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
- parser.parse(
+ tika.getParser().parse(
new FileInputStream(file),
new WriteOutContentHandler(writer),
metadata,
@@ -100,10 +95,7 @@ public class RTFParserTest extends TikaT
public void testTableCellSeparation() throws Exception {
File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf");
- String s1 = ParseUtils.getStringContent(file, defaultConfig);
- String s2 = ParseUtils.getStringContent(file, defaultConfig, "application/rtf");
- assertEquals(s1, s2);
- String content = s1;
+ String content = tika.parseToString(file);
content = content.replaceAll("\\s+"," ");
assertTrue(content.contains("a b c d \u00E4 \u00EB \u00F6 \u00FC"));
assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
@@ -118,34 +110,21 @@ public class RTFParserTest extends TikaT
public void testWordPadCzechCharactersExtraction() throws Exception {
File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf");
- String s1 = ParseUtils.getStringContent(file, defaultConfig);
- String s2 = ParseUtils.getStringContent(file, defaultConfig, "application/rtf");
- assertEquals(s1, s2);
+ String s1 = tika.parseToString(file);
assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
}
public void testWord2010CzechCharactersExtraction() throws Exception {
File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf");
- String s1 = ParseUtils.getStringContent(file, defaultConfig);
- String s2 = ParseUtils.getStringContent(file, defaultConfig, "application/rtf");
- assertEquals(s1, s2);
+ String s1 = tika.parseToString(file);
assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
}
- public void testExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTF.rtf");
- String s1 = ParseUtils.getStringContent(file, defaultConfig);
- String s2 = ParseUtils.getStringContent(file, defaultConfig, "application/rtf");
- assertEquals(s1, s2);
- }
-
public void testMS932Extraction() throws Exception {
File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf");
- String s1 = ParseUtils.getStringContent(file, defaultConfig);
- String s2 = ParseUtils.getStringContent(file, defaultConfig, "application/rtf");
- assertEquals(s1, s2);
+ String s1 = tika.parseToString(file);
// Hello in Japanese
assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f"));
@@ -157,17 +136,15 @@ public class RTFParserTest extends TikaT
public void testUmlautSpacesExtraction() throws Exception {
File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf");
- String s1 = ParseUtils.getStringContent(file, defaultConfig);
- String s2 = ParseUtils.getStringContent(file, defaultConfig, "application/rtf");
- assertEquals(s1, s2);
+ String s1 = tika.parseToString(file);
assertTrue(s1.contains("\u00DCbersicht"));
}
public void testGothic() throws Exception {
- String content = getText("testRTFUnicodeGothic.rtf");
- assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+ String content = getText("testRTFUnicodeGothic.rtf");
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
}
-
+
public void testJapaneseText() throws Exception {
Result r = getResult("testRTFJapanese.rtf");
String content = r.text;
@@ -302,7 +279,7 @@ public class RTFParserTest extends TikaT
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
- parser.parse(
+ tika.getParser().parse(
new FileInputStream(file),
new WriteOutContentHandler(writer),
metadata,
@@ -335,7 +312,7 @@ public class RTFParserTest extends TikaT
// Try with a document containing various tables and formattings
InputStream input = getResourceAsStream("/test-documents/" + filename);
try {
- parser.parse(input, handler, metadata, new ParseContext());
+ tika.getParser().parse(input, handler, metadata, new ParseContext());
return new XMLResult(sw.toString(), metadata);
} finally {
input.close();