You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/02/09 17:25:09 UTC

svn commit: r1658449 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/parser/ParserDecorator.java test/java/org/apache/tika/parser/DummyParser.java test/java/org/apache/tika/parser/ParserDecoratorTest.java

Author: nick
Date: Mon Feb  9 16:25:09 2015
New Revision: 1658449

URL: http://svn.apache.org/r1658449
Log:
TIKA-1509 Provide a possible "parser with fallback" implementation, with lots of questions!

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1658449&r1=1658448&r2=1658449&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java Mon Feb  9 16:25:09 2015
@@ -18,10 +18,12 @@ package org.apache.tika.parser;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Collection;
 import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.xml.sax.ContentHandler;
@@ -81,6 +83,50 @@ public class ParserDecorator extends Abs
             }
         };
     }
+    
+    /**
+     * Decorates the given parsers into a virtual parser, where they'll
+     *  be tried in preference order until one works without error.
+     * TODO Is this the right name?
+     * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
+     * TODO Should we reset the Metadata if we try another parser?
+     * TODO Should we reset the ContentHandler if we try another parser?
+     * TODO Should we log/report failures anywhere?
+     * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
+     */
+    public static final Parser withFallbacks(
+            final Collection<? extends Parser> parsers, final Set<MediaType> types) {
+        Parser parser = EmptyParser.INSTANCE;
+        if (!parsers.isEmpty()) parser = parsers.iterator().next();
+        
+        return new ParserDecorator(parser) {
+            private static final long serialVersionUID = 1625187131782069683L;
+            @Override
+            public Set<MediaType> getSupportedTypes(ParseContext context) {
+                return types;
+            }
+            @Override
+            public void parse(InputStream stream, ContentHandler handler,
+                    Metadata metadata, ParseContext context)
+                    throws IOException, SAXException, TikaException {
+                // Must have a TikaInputStream, so we can re-use it if parsing fails
+                TikaInputStream tstream = TikaInputStream.get(stream);
+                tstream.getFile();
+                // Try each parser in turn
+                for (Parser p : parsers) {
+                    tstream.mark(-1);
+                    try {
+                        p.parse(tstream, handler, metadata, context);
+                        return;
+                    } catch (Exception e) {
+                        // TODO How to log / record this failure?
+                    }
+                    // Prepare for the next parser, if present
+                    tstream.reset();
+                }
+            }
+        };
+    }
 
     /**
      * The decorated parser instance.

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java?rev=1658449&r1=1658448&r2=1658449&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java Mon Feb  9 16:25:09 2015
@@ -25,6 +25,7 @@ import java.util.Map.Entry;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -54,11 +55,12 @@ public class DummyParser extends Abstrac
          metadata.add(m.getKey(), m.getValue());
       }
       
-      handler.startDocument();
+      XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+      xhtml.startDocument();
       if (xmlText != null) {
-         handler.characters(xmlText.toCharArray(), 0, xmlText.length());
+          xhtml.characters(xmlText.toCharArray(), 0, xmlText.length());
       }
-      handler.endDocument();
+      xhtml.endDocument();
    }
 
 }

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java?rev=1658449&r1=1658448&r2=1658449&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java Mon Feb  9 16:25:09 2015
@@ -18,12 +18,16 @@ package org.apache.tika.parser;
 
 import static org.junit.Assert.assertEquals;
 
+import java.io.ByteArrayInputStream;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Set;
 
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 
 public class ParserDecoratorTest {
@@ -71,4 +75,46 @@ public class ParserDecoratorTest {
         assertEquals(1, types.size());
         assertEquals(types.toString(), true, types.contains(MediaType.OCTET_STREAM));
     }
+    
+    /**
+     * Testing one proposed implementation for TIKA-1509
+     */
+    @Test
+    public void withFallback() throws Exception {
+        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+        Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
+                MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
+
+        ParseContext context = new ParseContext();
+        BodyContentHandler handler;
+        Metadata metadata;
+        
+        ErrorParser pFail = new ErrorParser();
+        DummyParser pWork = new DummyParser(onlyOct, new HashMap<String,String>(), "Fell back!");
+        EmptyParser pNothing = new EmptyParser();
+        
+        // Create a combination which will fail first
+        @SuppressWarnings("deprecation")
+        Parser p = ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText);
+        
+        // Will claim to support the types given, not those on the child parsers
+        Set<MediaType> types = p.getSupportedTypes(context);
+        assertEquals(2, types.size());
+        assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN));
+        assertEquals(types.toString(), true, types.contains(MediaType.OCTET_STREAM));
+        
+        // Parsing will make it to the second one
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back!", handler.toString());
+        
+        
+        // With a parser that will work with no output, will get nothing
+        p = ParserDecorator.withFallbacks(Arrays.asList(pNothing, pWork), octAndText);
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("", handler.toString());
+    }
 }



Re: svn commit: r1658449 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/parser/ParserDecorator.java test/java/org/apache/tika/parser/DummyParser.java test/java/org/apache/tika/parser/ParserDecoratorTest.java

Posted by "Mattmann, Chris A (3980)" <ch...@jpl.nasa.gov>.
You da man Nick

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Chris Mattmann, Ph.D.
Chief Architect
Instrument Software and Science Data Systems Section (398)
NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
Office: 168-519, Mailstop: 168-527
Email: chris.a.mattmann@nasa.gov
WWW:  http://sunset.usc.edu/~mattmann/
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adjunct Associate Professor, Computer Science Department
University of Southern California, Los Angeles, CA 90089 USA
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++






-----Original Message-----
From: "nick@apache.org" <ni...@apache.org>
Reply-To: "dev@tika.apache.org" <de...@tika.apache.org>
Date: Monday, February 9, 2015 at 8:25 AM
To: "commits@tika.apache.org" <co...@tika.apache.org>
Subject: svn commit: r1658449 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/parser/ParserDecorator.java
test/java/org/apache/tika/parser/DummyParser.java
test/java/org/apache/tika/parser/ParserDecoratorTest.java

>Author: nick
>Date: Mon Feb  9 16:25:09 2015
>New Revision: 1658449
>
>URL: http://svn.apache.org/r1658449
>Log:
>TIKA-1509 Provide a possible "parser with fallback" implementation, with
>lots of questions!
>
>Modified:
>    
>tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.
>java
>    
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
>    
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorT
>est.java
>
>Modified: 
>tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.
>java
>URL: 
>http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache
>/tika/parser/ParserDecorator.java?rev=1658449&r1=1658448&r2=1658449&view=d
>iff
>==========================================================================
>====
>--- 
>tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.
>java (original)
>+++ 
>tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.
>java Mon Feb  9 16:25:09 2015
>@@ -18,10 +18,12 @@ package org.apache.tika.parser;
> 
> import java.io.IOException;
> import java.io.InputStream;
>+import java.util.Collection;
> import java.util.HashSet;
> import java.util.Set;
> 
> import org.apache.tika.exception.TikaException;
>+import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
> import org.xml.sax.ContentHandler;
>@@ -81,6 +83,50 @@ public class ParserDecorator extends Abs
>             }
>         };
>     }
>+    
>+    /**
>+     * Decorates the given parsers into a virtual parser, where they'll
>+     *  be tried in preference order until one works without error.
>+     * TODO Is this the right name?
>+     * TODO Is this the right place to put this? Should it be in
>CompositeParser? Elsewhere?
>+     * TODO Should we reset the Metadata if we try another parser?
>+     * TODO Should we reset the ContentHandler if we try another parser?
>+     * TODO Should we log/report failures anywhere?
>+     * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
>+     */
>+    public static final Parser withFallbacks(
>+            final Collection<? extends Parser> parsers, final
>Set<MediaType> types) {
>+        Parser parser = EmptyParser.INSTANCE;
>+        if (!parsers.isEmpty()) parser = parsers.iterator().next();
>+        
>+        return new ParserDecorator(parser) {
>+            private static final long serialVersionUID =
>1625187131782069683L;
>+            @Override
>+            public Set<MediaType> getSupportedTypes(ParseContext
>context) {
>+                return types;
>+            }
>+            @Override
>+            public void parse(InputStream stream, ContentHandler handler,
>+                    Metadata metadata, ParseContext context)
>+                    throws IOException, SAXException, TikaException {
>+                // Must have a TikaInputStream, so we can re-use it if
>parsing fails
>+                TikaInputStream tstream = TikaInputStream.get(stream);
>+                tstream.getFile();
>+                // Try each parser in turn
>+                for (Parser p : parsers) {
>+                    tstream.mark(-1);
>+                    try {
>+                        p.parse(tstream, handler, metadata, context);
>+                        return;
>+                    } catch (Exception e) {
>+                        // TODO How to log / record this failure?
>+                    }
>+                    // Prepare for the next parser, if present
>+                    tstream.reset();
>+                }
>+            }
>+        };
>+    }
> 
>     /**
>      * The decorated parser instance.
>
>Modified: 
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
>URL: 
>http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache
>/tika/parser/DummyParser.java?rev=1658449&r1=1658448&r2=1658449&view=diff
>==========================================================================
>====
>--- 
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
> (original)
>+++ 
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
> Mon Feb  9 16:25:09 2015
>@@ -25,6 +25,7 @@ import java.util.Map.Entry;
> import org.apache.tika.exception.TikaException;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
>+import org.apache.tika.sax.XHTMLContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> 
>@@ -54,11 +55,12 @@ public class DummyParser extends Abstrac
>          metadata.add(m.getKey(), m.getValue());
>       }
>       
>-      handler.startDocument();
>+      XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
>metadata);
>+      xhtml.startDocument();
>       if (xmlText != null) {
>-         handler.characters(xmlText.toCharArray(), 0, xmlText.length());
>+          xhtml.characters(xmlText.toCharArray(), 0, xmlText.length());
>       }
>-      handler.endDocument();
>+      xhtml.endDocument();
>    }
> 
> }
>
>Modified: 
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorT
>est.java
>URL: 
>http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache
>/tika/parser/ParserDecoratorTest.java?rev=1658449&r1=1658448&r2=1658449&vi
>ew=diff
>==========================================================================
>====
>--- 
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorT
>est.java (original)
>+++ 
>tika/trunk/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorT
>est.java Mon Feb  9 16:25:09 2015
>@@ -18,12 +18,16 @@ package org.apache.tika.parser;
> 
> import static org.junit.Assert.assertEquals;
> 
>+import java.io.ByteArrayInputStream;
>+import java.util.Arrays;
> import java.util.Collections;
> import java.util.HashMap;
> import java.util.HashSet;
> import java.util.Set;
> 
>+import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
>+import org.apache.tika.sax.BodyContentHandler;
> import org.junit.Test;
> 
> public class ParserDecoratorTest {
>@@ -71,4 +75,46 @@ public class ParserDecoratorTest {
>         assertEquals(1, types.size());
>         assertEquals(types.toString(), true,
>types.contains(MediaType.OCTET_STREAM));
>     }
>+    
>+    /**
>+     * Testing one proposed implementation for TIKA-1509
>+     */
>+    @Test
>+    public void withFallback() throws Exception {
>+        Set<MediaType> onlyOct =
>Collections.singleton(MediaType.OCTET_STREAM);
>+        Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
>+                MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
>+
>+        ParseContext context = new ParseContext();
>+        BodyContentHandler handler;
>+        Metadata metadata;
>+        
>+        ErrorParser pFail = new ErrorParser();
>+        DummyParser pWork = new DummyParser(onlyOct, new
>HashMap<String,String>(), "Fell back!");
>+        EmptyParser pNothing = new EmptyParser();
>+        
>+        // Create a combination which will fail first
>+        @SuppressWarnings("deprecation")
>+        Parser p = ParserDecorator.withFallbacks(Arrays.asList(pFail,
>pWork), octAndText);
>+        
>+        // Will claim to support the types given, not those on the child
>parsers
>+        Set<MediaType> types = p.getSupportedTypes(context);
>+        assertEquals(2, types.size());
>+        assertEquals(types.toString(), true,
>types.contains(MediaType.TEXT_PLAIN));
>+        assertEquals(types.toString(), true,
>types.contains(MediaType.OCTET_STREAM));
>+        
>+        // Parsing will make it to the second one
>+        metadata = new Metadata();
>+        handler = new BodyContentHandler();
>+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}),
>handler, metadata, context);
>+        assertEquals("Fell back!", handler.toString());
>+        
>+        
>+        // With a parser that will work with no output, will get nothing
>+        p = ParserDecorator.withFallbacks(Arrays.asList(pNothing,
>pWork), octAndText);
>+        metadata = new Metadata();
>+        handler = new BodyContentHandler();
>+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}),
>handler, metadata, context);
>+        assertEquals("", handler.toString());
>+    }
> }
>
>