You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/02/09 17:25:09 UTC
svn commit: r1658449 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/parser/ParserDecorator.java
test/java/org/apache/tika/parser/DummyParser.java
test/java/org/apache/tika/parser/ParserDecoratorTest.java
Author: nick
Date: Mon Feb 9 16:25:09 2015
New Revision: 1658449
URL: http://svn.apache.org/r1658449
Log:
TIKA-1509 Provide a possible "parser with fallback" implementation, with lots of questions!
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1658449&r1=1658448&r2=1658449&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Mon Feb 9 16:25:09 2015
@@ -18,10 +18,12 @@ package org.apache.tika.parser;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.xml.sax.ContentHandler;
@@ -81,6 +83,50 @@ public class ParserDecorator extends Abs
}
};
}
+
+ /**
+ * Decorates the given parsers into a virtual parser, where they'll
+ * be tried in preference order until one works without error.
+ * TODO Is this the right name?
+ * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
+ * TODO Should we reset the Metadata if we try another parser?
+ * TODO Should we reset the ContentHandler if we try another parser?
+ * TODO Should we log/report failures anywhere?
+ * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
+ */
+ public static final Parser withFallbacks(
+ final Collection<? extends Parser> parsers, final Set<MediaType> types) {
+ Parser parser = EmptyParser.INSTANCE;
+ if (!parsers.isEmpty()) parser = parsers.iterator().next();
+
+ return new ParserDecorator(parser) {
+ private static final long serialVersionUID = 1625187131782069683L;
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return types;
+ }
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Must have a TikaInputStream, so we can re-use it if parsing fails
+ TikaInputStream tstream = TikaInputStream.get(stream);
+ tstream.getFile();
+ // Try each parser in turn
+ for (Parser p : parsers) {
+ tstream.mark(-1);
+ try {
+ p.parse(tstream, handler, metadata, context);
+ return;
+ } catch (Exception e) {
+ // TODO How to log / record this failure?
+ }
+ // Prepare for the next parser, if present
+ tstream.reset();
+ }
+ }
+ };
+ }
/**
* The decorated parser instance.
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java?rev=1658449&r1=1658448&r2=1658449&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java Mon Feb 9 16:25:09 2015
@@ -25,6 +25,7 @@ import java.util.Map.Entry;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -54,11 +55,12 @@ public class DummyParser extends Abstrac
metadata.add(m.getKey(), m.getValue());
}
- handler.startDocument();
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
if (xmlText != null) {
- handler.characters(xmlText.toCharArray(), 0, xmlText.length());
+ xhtml.characters(xmlText.toCharArray(), 0, xmlText.length());
}
- handler.endDocument();
+ xhtml.endDocument();
}
}
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java?rev=1658449&r1=1658448&r2=1658449&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java Mon Feb 9 16:25:09 2015
@@ -18,12 +18,16 @@ package org.apache.tika.parser;
import static org.junit.Assert.assertEquals;
+import java.io.ByteArrayInputStream;
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
+import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
public class ParserDecoratorTest {
@@ -71,4 +75,46 @@ public class ParserDecoratorTest {
assertEquals(1, types.size());
assertEquals(types.toString(), true, types.contains(MediaType.OCTET_STREAM));
}
+
+ /**
+ * Testing one proposed implementation for TIKA-1509
+ */
+ @Test
+ public void withFallback() throws Exception {
+ Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+ Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
+ MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
+
+ ParseContext context = new ParseContext();
+ BodyContentHandler handler;
+ Metadata metadata;
+
+ ErrorParser pFail = new ErrorParser();
+ DummyParser pWork = new DummyParser(onlyOct, new HashMap<String,String>(), "Fell back!");
+ EmptyParser pNothing = new EmptyParser();
+
+ // Create a combination which will fail first
+ @SuppressWarnings("deprecation")
+ Parser p = ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText);
+
+ // Will claim to support the types given, not those on the child parsers
+ Set<MediaType> types = p.getSupportedTypes(context);
+ assertEquals(2, types.size());
+ assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN));
+ assertEquals(types.toString(), true, types.contains(MediaType.OCTET_STREAM));
+
+ // Parsing will make it to the second one
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back!", handler.toString());
+
+
+ // With a parser that will work with no output, will get nothing
+ p = ParserDecorator.withFallbacks(Arrays.asList(pNothing, pWork), octAndText);
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("", handler.toString());
+ }
}
Re: svn commit: r1658449 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/parser/ParserDecorator.java
test/java/org/apache/tika/parser/DummyParser.java
test/java/org/apache/tika/parser/ParserDecoratorTest.java
Posted by "Mattmann, Chris A (3980)" <ch...@jpl.nasa.gov>.
You da man Nick
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Chris Mattmann, Ph.D.
Chief Architect
Instrument Software and Science Data Systems Section (398)
NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
Office: 168-519, Mailstop: 168-527
Email: chris.a.mattmann@nasa.gov
WWW: http://sunset.usc.edu/~mattmann/
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adjunct Associate Professor, Computer Science Department
University of Southern California, Los Angeles, CA 90089 USA
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-----Original Message-----
From: "nick@apache.org" <ni...@apache.org>
Reply-To: "dev@tika.apache.org" <de...@tika.apache.org>
Date: Monday, February 9, 2015 at 8:25 AM
To: "commits@tika.apache.org" <co...@tika.apache.org>
Subject: svn commit: r1658449 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/parser/ParserDecorator.java
test/java/org/apache/tika/parser/DummyParser.java
test/java/org/apache/tika/parser/ParserDecoratorTest.java
>Author: nick
>Date: Mon Feb 9 16:25:09 2015
>New Revision: 1658449
>
>URL: http://svn.apache.org/r1658449
>Log:
>TIKA-1509 Provide a possible "parser with fallback" implementation, with
>lots of questions!
>
>Modified:
>
>tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
>
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
>
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java
>
>Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
>URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1658449&r1=1658448&r2=1658449&view=diff
>==============================================================================
>--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java (original)
>+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Mon Feb 9 16:25:09 2015
>@@ -18,10 +18,12 @@ package org.apache.tika.parser;
>
> import java.io.IOException;
> import java.io.InputStream;
>+import java.util.Collection;
> import java.util.HashSet;
> import java.util.Set;
>
> import org.apache.tika.exception.TikaException;
>+import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
> import org.xml.sax.ContentHandler;
>@@ -81,6 +83,50 @@ public class ParserDecorator extends Abs
> }
> };
> }
>+
>+ /**
>+ * Decorates the given parsers into a virtual parser, where they'll
>+ * be tried in preference order until one works without error.
>+ * TODO Is this the right name?
>+ * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
>+ * TODO Should we reset the Metadata if we try another parser?
>+ * TODO Should we reset the ContentHandler if we try another parser?
>+ * TODO Should we log/report failures anywhere?
>+ * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
>+ */
>+ public static final Parser withFallbacks(
>+ final Collection<? extends Parser> parsers, final Set<MediaType> types) {
>+ Parser parser = EmptyParser.INSTANCE;
>+ if (!parsers.isEmpty()) parser = parsers.iterator().next();
>+
>+ return new ParserDecorator(parser) {
>+ private static final long serialVersionUID = 1625187131782069683L;
>+ @Override
>+ public Set<MediaType> getSupportedTypes(ParseContext context) {
>+ return types;
>+ }
>+ @Override
>+ public void parse(InputStream stream, ContentHandler handler,
>+ Metadata metadata, ParseContext context)
>+ throws IOException, SAXException, TikaException {
>+ // Must have a TikaInputStream, so we can re-use it if parsing fails
>+ TikaInputStream tstream = TikaInputStream.get(stream);
>+ tstream.getFile();
>+ // Try each parser in turn
>+ for (Parser p : parsers) {
>+ tstream.mark(-1);
>+ try {
>+ p.parse(tstream, handler, metadata, context);
>+ return;
>+ } catch (Exception e) {
>+ // TODO How to log / record this failure?
>+ }
>+ // Prepare for the next parser, if present
>+ tstream.reset();
>+ }
>+ }
>+ };
>+ }
>
> /**
> * The decorated parser instance.
>
>Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
>URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java?rev=1658449&r1=1658448&r2=1658449&view=diff
>==============================================================================
>--- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java (original)
>+++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java Mon Feb 9 16:25:09 2015
>@@ -25,6 +25,7 @@ import java.util.Map.Entry;
> import org.apache.tika.exception.TikaException;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
>+import org.apache.tika.sax.XHTMLContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
>
>@@ -54,11 +55,12 @@ public class DummyParser extends Abstrac
> metadata.add(m.getKey(), m.getValue());
> }
>
>- handler.startDocument();
>+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
>+ xhtml.startDocument();
> if (xmlText != null) {
>- handler.characters(xmlText.toCharArray(), 0, xmlText.length());
>+ xhtml.characters(xmlText.toCharArray(), 0, xmlText.length());
> }
>- handler.endDocument();
>+ xhtml.endDocument();
> }
>
> }
>
>Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java
>URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java?rev=1658449&r1=1658448&r2=1658449&view=diff
>==============================================================================
>--- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java (original)
>+++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java Mon Feb 9 16:25:09 2015
>@@ -18,12 +18,16 @@ package org.apache.tika.parser;
>
> import static org.junit.Assert.assertEquals;
>
>+import java.io.ByteArrayInputStream;
>+import java.util.Arrays;
> import java.util.Collections;
> import java.util.HashMap;
> import java.util.HashSet;
> import java.util.Set;
>
>+import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
>+import org.apache.tika.sax.BodyContentHandler;
> import org.junit.Test;
>
> public class ParserDecoratorTest {
>@@ -71,4 +75,46 @@ public class ParserDecoratorTest {
> assertEquals(1, types.size());
> assertEquals(types.toString(), true, types.contains(MediaType.OCTET_STREAM));
> }
>+
>+ /**
>+ * Testing one proposed implementation for TIKA-1509
>+ */
>+ @Test
>+ public void withFallback() throws Exception {
>+ Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
>+ Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
>+ MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
>+
>+ ParseContext context = new ParseContext();
>+ BodyContentHandler handler;
>+ Metadata metadata;
>+
>+ ErrorParser pFail = new ErrorParser();
>+ DummyParser pWork = new DummyParser(onlyOct, new HashMap<String,String>(), "Fell back!");
>+ EmptyParser pNothing = new EmptyParser();
>+
>+ // Create a combination which will fail first
>+ @SuppressWarnings("deprecation")
>+ Parser p = ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText);
>+
>+ // Will claim to support the types given, not those on the child parsers
>+ Set<MediaType> types = p.getSupportedTypes(context);
>+ assertEquals(2, types.size());
>+ assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN));
>+ assertEquals(types.toString(), true, types.contains(MediaType.OCTET_STREAM));
>+
>+ // Parsing will make it to the second one
>+ metadata = new Metadata();
>+ handler = new BodyContentHandler();
>+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
>+ assertEquals("Fell back!", handler.toString());
>+
>+
>+ // With a parser that will work with no output, will get nothing
>+ p = ParserDecorator.withFallbacks(Arrays.asList(pNothing, pWork), octAndText);
>+ metadata = new Metadata();
>+ handler = new BodyContentHandler();
>+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
>+ assertEquals("", handler.toString());
>+ }
> }
>
>