You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/14 17:35:14 UTC

[tika] branch multiple-parsers updated (6a39214 -> 12a98b6)

This is an automated email from the ASF dual-hosted git repository.

nick pushed a change to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 6a39214  Some (currently failing) Supplemental Parser tests
     new d181c3c  Correct Metadata merging by policy, and get (incomplete) unit tests to pass
     new e82f7ce  Further unit tests
     new c989d2f  Optionally use a new Handler for each Parser, if a factory was given
     new 12a98b6  Keep all implemented and unit test

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../parser/multiple/AbstractMultipleParser.java    | 102 ++++++++++++++++-----
 .../multiple/PickBestTextEncodingParser.java       |  46 +++++++++-
 .../java/org/apache/tika/utils/ParserUtils.java    |   3 +-
 .../tika/parser/multiple/MultipleParserTest.java   |  95 ++++++++++++++++---
 4 files changed, 203 insertions(+), 43 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 01/04: Correct Metadata merging by policy, and get (incomplete) unit tests to pass

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d181c3c05a70163597dd40a9e1b140303ec2739f
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 16:53:26 2018 +0000

    Correct Metadata merging by policy, and get (incomplete) unit tests to pass
---
 .../parser/multiple/AbstractMultipleParser.java    | 37 ++++++++++++++++++----
 .../java/org/apache/tika/utils/ParserUtils.java    |  3 +-
 .../tika/parser/multiple/MultipleParserTest.java   | 17 +++++++---
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index f99ad1a..c471826 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -37,6 +37,7 @@ import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -194,7 +195,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             // Force the stream to be a Tika one
             // Force the stream to be file-backed, so we can re-read safely
             //  later if required for parser 2+
-            // TODO Should we use RereadableInputStream instead?
+            // TODO Should we support RereadableInputStream as well?
+            // TODO Can we put this re-read logic in a utils method?
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             taggedStream.getPath();
             
@@ -223,6 +225,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 try {
                     p.parse(taggedStream, handler, metadata, context);
                 } catch (Exception e) {
+                    // Record the failure such that it can't get lost / overwritten
+                    recordParserFailure(p, e, originalMetadata);
                     recordParserFailure(p, e, metadata);
                     failure = e;
                 }
@@ -255,7 +259,10 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         
         // Finally, copy the latest metadata back onto their supplied object
         for (String n : metadata.names()) {
-            originalMetadata.set(n, metadata.get(n));
+            originalMetadata.remove(n);
+            for (String val : metadata.getValues(n)) {
+                originalMetadata.add(n, val);
+            }
         }
     }
     
@@ -268,13 +275,31 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         }
         
         for (String n : lastMetadata.names()) {
-            if (newMetadata.get(n) == null) {
-                newMetadata.set(n, lastMetadata.get(n));
+            // If this is one of the metadata keys we're setting ourselves
+            //  for tracking/errors, then always keep the latest one!
+            if (n.equals(ParserUtils.X_PARSED_BY)) continue;
+            if (n.equals(ParserUtils.EMBEDDED_PARSER.getName())) continue;
+            if (n.equals(ParserUtils.EMBEDDED_EXCEPTION.getName())) continue;
+            
+            // Merge as per policy 
+            String[] newVals = newMetadata.getValues(n);
+            String[] oldVals = lastMetadata.getValues(n);
+            if (newVals == null) {
+                // Metadata only in previous run, keep old values
+                for (String val : oldVals) {
+                    newMetadata.add(n, val);
+                }
+            } else if (Arrays.deepEquals(oldVals, newVals)) {
+                // Metadata is the same, nothing to do
+                continue;
             } else {
                 switch (policy) {
                 case FIRST_WINS:
-                    // Use the earlier value 
-                    newMetadata.set(n, lastMetadata.get(n));
+                    // Use the earlier value(s) in place of this/these one/s
+                    newMetadata.remove(n);
+                    for (String val : oldVals) {
+                        newMetadata.add(n, val);
+                    }
                     continue;
                 case LAST_WINS:
                     // Most recent (last) parser has already won
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index c3c63ba..4005217 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -26,6 +26,7 @@ import org.apache.tika.parser.ParserDecorator;
  * Helper util methods for Parsers themselves.
  */
 public class ParserUtils {
+    public final static String X_PARSED_BY = "X-Parsed-By"; 
     public final static Property EMBEDDED_PARSER =
             Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
     public final static Property EMBEDDED_EXCEPTION =
@@ -68,7 +69,7 @@ public class ParserUtils {
      *  or used.
      */
     public static void recordParserDetails(Parser parser, Metadata metadata) {
-        metadata.add("X-Parsed-By", getParserClassname(parser));
+        metadata.add(X_PARSED_BY, getParserClassname(parser));
     }
 
     /**
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
index 9de0a99..21d9724 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.multiple;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
 import java.io.ByteArrayInputStream;
 import java.util.Arrays;
@@ -96,7 +97,8 @@ public class MultipleParserTest {
        
         usedParsers = metadata.getValues("X-Parsed-By");
         assertEquals(2, usedParsers.length);
-        assertEquals(DummyParser.class.getName(), usedParsers[0]);
+        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+        assertEquals(DummyParser.class.getName(), usedParsers[1]);
         
         // TODO Check we got an exception
         
@@ -134,8 +136,15 @@ public class MultipleParserTest {
         EmptyParser pNothing = new EmptyParser();
         
         
+        // Supplemental doesn't support DISCARD
+        try {
+            new SupplementingParser(null, MetadataPolicy.DISCARD_ALL, new Parser[0]);
+            fail("Discard shouldn't be supported");
+        } catch (IllegalArgumentException e) {}
+        
+        
         // With only one parser defined, works as normal
-        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent1);
+        p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pContent1);
 
         metadata = new Metadata();
         handler = new BodyContentHandler();
@@ -151,12 +160,12 @@ public class MultipleParserTest {
         
         
         // Check the First, Last and All policies
-        p = new FallbackParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2);
+        p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2);
 
         metadata = new Metadata();
         handler = new BodyContentHandler();
         p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
-        assertEquals("Fell back 1!", handler.toString());
+        assertEquals("Fell back 1!Fell back 2!", handler.toString());
         
         assertEquals("Test1", metadata.get("T1"));
         assertEquals("Test1", metadata.get("TBoth"));

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 04/04: Keep all implemented and unit test

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 12a98b63babc8515177d6f0e3df17ae8912142ee
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 17:35:01 2018 +0000

    Keep all implemented and unit test
---
 .../parser/multiple/AbstractMultipleParser.java    | 16 ++++++++++++++--
 .../tika/parser/multiple/MultipleParserTest.java   | 22 +++++++++++++++++++++-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index ece2b8d..8f896b2 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -22,6 +22,7 @@ import static org.apache.tika.utils.ParserUtils.recordParserFailure;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
@@ -326,8 +327,19 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                     // Most recent (last) parser has already won
                     continue;
                 case KEEP_ALL:
-                    // TODO Find unique values to add
-                    // TODO Implement
+                    // Start with old list, then add any new unique values
+                    List<String> vals = new ArrayList<>(Arrays.asList(oldVals));
+                    newMetadata.remove(n);
+                    for (String oldVal : oldVals) {
+                        newMetadata.add(n, oldVal);
+                    }
+                    for (String newVal : newVals) {
+                        if (! vals.contains(newVal)) {
+                            newMetadata.add(n, newVal);
+                            vals.add(newVal);
+                        }
+                    }
+                    
                     continue;
                 }
             }
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
index 3d77e9d..590c95d 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -220,7 +220,27 @@ public class MultipleParserTest {
         assertEquals(EmptyParser.class.getName(), usedParsers[3]);
         
         
-        // TODO Implement then check the Merge policies
+        // Merge
+        p = new SupplementingParser(null, MetadataPolicy.KEEP_ALL, pFail, 
+                                    pContent1, pContent2, pNothing);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back 1!Fell back 2!", handler.toString());
+
+        assertEquals("Test1", metadata.get("T1"));
+        assertEquals("Test2", metadata.get("T2"));
+        assertEquals(2, metadata.getValues("TBoth").length);
+        assertEquals("Test1", metadata.getValues("TBoth")[0]);
+        assertEquals("Test2", metadata.getValues("TBoth")[1]);
+
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(4, usedParsers.length);
+        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+        assertEquals(DummyParser.class.getName(), usedParsers[1]);
+        assertEquals(DummyParser.class.getName(), usedParsers[2]);
+        assertEquals(EmptyParser.class.getName(), usedParsers[3]);
 
         
         // Check the error details always come through, no matter the policy

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 02/04: Further unit tests

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e82f7ce5743536b406fd6971016f6f048f42b3f5
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 17:15:27 2018 +0000

    Further unit tests
---
 .../parser/multiple/AbstractMultipleParser.java    |  2 +-
 .../tika/parser/multiple/MultipleParserTest.java   | 60 +++++++++++++++++-----
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index c471826..5d32fa9 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -284,7 +284,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             // Merge as per policy 
             String[] newVals = newMetadata.getValues(n);
             String[] oldVals = lastMetadata.getValues(n);
-            if (newVals == null) {
+            if (newVals == null || newVals.length == 0) {
                 // Metadata only in previous run, keep old values
                 for (String val : oldVals) {
                     newMetadata.add(n, val);
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
index 21d9724..3d77e9d 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.multiple;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.fail;
 
 import java.io.ByteArrayInputStream;
@@ -36,6 +37,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.ParserUtils;
 import org.junit.Test;
 
 public class MultipleParserTest {
@@ -100,11 +102,27 @@ public class MultipleParserTest {
         assertEquals(ErrorParser.class.getName(), usedParsers[0]);
         assertEquals(DummyParser.class.getName(), usedParsers[1]);
         
-        // TODO Check we got an exception
+        // Check we got an exception
+        assertNotNull(metadata.get(ParserUtils.EMBEDDED_EXCEPTION));
+        assertNotNull(metadata.get(ParserUtils.EMBEDDED_PARSER));
+        assertEquals(ErrorParser.class.getName(), metadata.get(ParserUtils.EMBEDDED_PARSER));
         
         
-        // Won't go past the working one
-        // TODO
+        // Won't go past a working parser to a second one, stops after one works
+        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent, pNothing);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back!", handler.toString());
+       
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(2, usedParsers.length);
+        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+        assertEquals(DummyParser.class.getName(), usedParsers[1]);
+        
+        
+        // TODO Check merge policies - First vs Discard
     }
     
     /**
@@ -159,8 +177,10 @@ public class MultipleParserTest {
         assertEquals(DummyParser.class.getName(), usedParsers[0]);
         
         
-        // Check the First, Last and All policies
-        p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2);
+        // Check the First, Last and All policies:
+        // First Wins
+        p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, 
+                                    pContent1, pContent2, pNothing);
 
         metadata = new Metadata();
         handler = new BodyContentHandler();
@@ -168,24 +188,40 @@ public class MultipleParserTest {
         assertEquals("Fell back 1!Fell back 2!", handler.toString());
         
         assertEquals("Test1", metadata.get("T1"));
+        assertEquals("Test2", metadata.get("T2"));
         assertEquals("Test1", metadata.get("TBoth"));
        
         usedParsers = metadata.getValues("X-Parsed-By");
-        assertEquals(3, usedParsers.length);
+        assertEquals(4, usedParsers.length);
         assertEquals(ErrorParser.class.getName(), usedParsers[0]);
         assertEquals(DummyParser.class.getName(), usedParsers[1]);
         assertEquals(DummyParser.class.getName(), usedParsers[2]);
+        assertEquals(EmptyParser.class.getName(), usedParsers[3]);
+        
         
-        // TODO Other policies
+        // Last Wins
+        p = new SupplementingParser(null, MetadataPolicy.LAST_WINS, pFail, 
+                                    pContent1, pContent2, pNothing);
 
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back 1!Fell back 2!", handler.toString());
         
-        // Check the Discard policy
-        // First with the last parser being a "real" one
-        // TODO
+        assertEquals("Test1", metadata.get("T1"));
+        assertEquals("Test2", metadata.get("T2"));
+        assertEquals("Test2", metadata.get("TBoth"));
+       
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(4, usedParsers.length);
+        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+        assertEquals(DummyParser.class.getName(), usedParsers[1]);
+        assertEquals(DummyParser.class.getName(), usedParsers[2]);
+        assertEquals(EmptyParser.class.getName(), usedParsers[3]);
         
-        // Then with the last parser being one that emits no metadata
-        // TODO
         
+        // TODO Implement then check the Merge policies
+
         
         // Check the error details always come through, no matter the policy
         // TODO

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 03/04: Optionally use a new Handler for each Parser, if a factory was given

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c989d2f6aedcdf5375719ea1abe6ea136fc0e92c
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 17:28:07 2018 +0000

    Optionally use a new Handler for each Parser, if a factory was given
---
 .../parser/multiple/AbstractMultipleParser.java    | 49 +++++++++++++++-------
 .../multiple/PickBestTextEncodingParser.java       | 46 +++++++++++++++++---
 2 files changed, 76 insertions(+), 19 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 5d32fa9..ece2b8d 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -37,6 +37,7 @@ import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -47,6 +48,8 @@ import org.xml.sax.SAXException;
  *  of the various parsers used.
  * End users should normally use {@link FallbackParser} or
  *  {@link SupplementingParser} along with a Strategy.
+ * Note that unless you give a {@link ContentHandlerFactory},
+ *  you'll get content from every parser tried mushed together!
  *
  * @since Apache Tika 1.18
  */
@@ -85,11 +88,6 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         KEEP_ALL
     };
     
-    // TODO Figure out some sort of Content Policy and how
-    //  it might possibly work
-    // TODO Is an overridden method that takes a 
-    //  ContentHandlerFactory the best way?
-
     /**
      * Media type registry.
      */
@@ -179,10 +177,34 @@ public abstract class AbstractMultipleParser extends AbstractParser {
     /**
      * Processes the given Stream through one or more parsers, 
      *  resetting things between parsers as requested by policy.
-     * The actual processing is delegated to one or more {@link Parser}s
+     * The actual processing is delegated to one or more {@link Parser}s.
+     * 
+     * Note that you'll get text from every parser this way, to have 
+     *  control of which content is from which parser you need to
+     *  call the method with a {@link ContentHandlerFactory} instead. 
      */
+    @Override
     public void parse(
             InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        parse(stream, handler, null, metadata, context);
+    }
+    /**
+     * Processes the given Stream through one or more parsers, 
+     *  resetting things between parsers as requested by policy.
+     * The actual processing is delegated to one or more {@link Parser}s.
+     * You will get one ContentHandler fetched for each Parser used.
+     * TODO Do we need to return all the ContentHandler instances we created?
+     */
+    public void parse(
+            InputStream stream, ContentHandlerFactory handlers,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        parse(stream, null, handlers, metadata, context);
+    }
+    private void parse(InputStream stream, 
+            ContentHandler handler, ContentHandlerFactory handlerFactory,
             Metadata originalMetadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         // Track the metadata between parsers, so we can apply our policy
@@ -200,17 +222,19 @@ public abstract class AbstractMultipleParser extends AbstractParser {
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             taggedStream.getPath();
             
-            // TODO Somehow shield/wrap the Handler, so that we can
-            //  avoid failures if multiple parsers want to do content
-            // TODO Solve the multiple-content problem!
-            // TODO Provide a way to supply a ContentHandlerFactory?
-
             for (Parser p : parsers) {
                 // Indicate we may need to re-read the stream later
                 // TODO Support an InputStreamFactory as an alternative to
                 //  Files, see TIKA-2585
                 taggedStream.mark(-1);
                 
+                // Get a new handler for this parser, if we can
+                // If not, the user will get text from every parser
+                //  mushed together onto the one solitary handler...
+                if (handlerFactory != null) {
+                    handler = handlerFactory.getNewContentHandler();
+                }
+                
                 // Record that we used this parser
                 recordParserDetails(p, originalMetadata);
 
@@ -266,9 +290,6 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         }
     }
     
-    // TODO Provide a method that takes an InputStreamSource as well,
-    //  and a ContentHandlerFactory. Will need wrappers to convert standard
-    
     protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata, MetadataPolicy policy) {
         if (policy == MetadataPolicy.DISCARD_ALL) {
             return newMetadata;
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
index 70f8d0a..6cc81af 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
@@ -18,6 +18,8 @@ package org.apache.tika.parser.multiple;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -32,6 +34,9 @@ import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.txt.TXTParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -45,6 +50,8 @@ import org.xml.sax.SAXException;
  * This is not recommended for actual production use... It is mostly to
  *  prove that the {@link AbstractMultipleParser} environment is
  *  sufficient to support this use-case
+ *  
+ * TODO Move this to the parsers package so it can get {@link TXTParser}
  *
  * @deprecated Currently not suitable for real use, more a demo / prototype!
  */
@@ -112,17 +119,46 @@ public class PickBestTextEncodingParser extends AbstractMultipleParser {
     public void parse(InputStream stream, ContentHandler handler,
             Metadata originalMetadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        // TODO Create our own ContentHandlerFactory
-        // This will give a BodyContentHandler for each of the charset
-        //  tests, then their real ContentHandler for the last one
+        // Use a BodyContentHandler for each of the charset test,
+        //  then their real ContentHandler for the last one
+        CharsetContentHandlerFactory handlerFactory = new CharsetContentHandlerFactory();
+        handlerFactory.handler = handler;
         
         // Put something on the ParseContext to get the charset
         context.set(CharsetTester.class, new CharsetTester());
         
-        // TODO Have the parsing done with our ContentHandlerFactory instead
-        super.parse(stream, handler, originalMetadata, context);
+        // Have the parsing done
+        super.parse(stream, handlerFactory, originalMetadata, context);
+    }
+    @Override
+    public void parse(InputStream stream, ContentHandlerFactory handlers,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // We only work with one ContentHandler as far as the user is
+        //  concerned, any others are purely internal!
+        parse(stream, handlers.getNewContentHandler(), metadata, context);
     }
     
+    protected class CharsetContentHandlerFactory implements ContentHandlerFactory {
+        // Which one we're on
+        private int index = -1;
+        // The real one for at the end
+        private ContentHandler handler;
+        
+        @Override
+        public ContentHandler getNewContentHandler() {
+            index++;
+            if (index < charsetsToTry.length)
+                return new BodyContentHandler();
+            return handler;
+        }
+        @Override
+        public ContentHandler getNewContentHandler(OutputStream os,
+                String encoding) throws UnsupportedEncodingException {
+            return getNewContentHandler();
+        }
+    }
+
     protected class CharsetTester {
         /**
          * Our current charset's index

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.