You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/14 07:07:59 UTC

[tika] branch multiple-parsers updated (348bfb9 -> 6a39214)

This is an automated email from the ASF dual-hosted git repository.

nick pushed a change to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 348bfb9  More metadata handling between parsers, start on unit testing
     new 819898f  Start on a multiple parser that would try several text encodings, pick the best and use that, to ensure it would be possible
     new 62b02b0  Give parserCompleted the ParseContext, use that to pass around for the pick-best-text case what charsets to try next and what text we got from them
     new 6a39214  Some (currently failing) Supplemental Parser tests

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../parser/multiple/AbstractMultipleParser.java    |  16 ++-
 .../tika/parser/multiple/FallbackParser.java       |   3 +-
 .../multiple/PickBestTextEncodingParser.java       | 158 +++++++++++++++++++++
 .../tika/parser/multiple/SupplementingParser.java  |   3 +-
 .../tika/parser/multiple/MultipleParserTest.java   |  83 ++++++++++-
 5 files changed, 255 insertions(+), 8 deletions(-)
 create mode 100644 tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 01/03: Start on a multiple parser that would try several text encodings, pick the best and use that, to ensure it would be possible

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 819898fbde33384844ebc6b2caa4e6c6986463cf
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 06:28:12 2018 +0000

    Start on a multiple parser that would try several text encodings, pick the best and use that, to ensure it would be possible
---
 .../parser/multiple/AbstractMultipleParser.java    |  10 ++
 .../multiple/PickBestTextEncodingParser.java       | 109 +++++++++++++++++++++
 2 files changed, 119 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 0aded0c..458697b 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -160,6 +160,13 @@ public abstract class AbstractMultipleParser extends AbstractParser {
     }
     
     /**
+     * Used to allow implementations to prepare or change things
+     *  before parsing occurs
+     */
+    protected void parserPrepare(Parser parser, Metadata metadata,
+                                 ParseContext context) {}
+
+    /**
      * Used to notify implementations that a Parser has Finished
      *  or Failed, and to allow them to decide to continue or 
      *  abort further parsing
@@ -208,6 +215,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 // Prepare an near-empty Metadata, will merge after
                 metadata = cloneMetadata(originalMetadata);
                 
+                // Notify the implementation of what we're about to do
+                parserPrepare(p, metadata, context);
+
                 // Process if possible
                 Exception failure = null;
                 try {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
new file mode 100644
index 0000000..80e41ae
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Map;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.NonDetectingEncodingDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Inspired by TIKA-1443 and https://wiki.apache.org/tika/CompositeParserDiscussion
+ *  this tries several different text encodings, then does the real
+ *  text parsing based on which is "best".
+ *  
+ * The logic for "best" needs a lot of work!
+ * 
+ * This is not recommended for actual production use... It is mostly to
+ *  prove that the {@link AbstractMultipleParser} environment is
+ *  sufficient to support this use-case
+ *
+ * @deprecated Currently not suitable for real use, more a demo / prototype!
+ */
+public class PickBestTextEncodingParser extends AbstractMultipleParser {
+    /**
+     * Serial version UID.
+     */
+    private static final long serialVersionUID = 730345169223211807L;
+    
+    /**
+     * Which charsets we should try
+     * This is the caller's array, stored by reference rather than copied
+     */
+    private String[] charsetsToTry;
+    
+    /**
+     * What charset we felt was best
+     * Not yet assigned or read by any method of this class
+     * TODO Does this need to be thread-safe?
+     */
+    private String pickedCharset;
+    /**
+     * What text we got for each charset, so we can test for the best
+     * Not yet assigned or read by any method of this class
+     * TODO Does this need to be thread-safe?
+     */
+    private Map<String,String> charsetText;
+
+    /**
+     * Creates the parser for the given charsets.
+     * For now the superclass is handed the discard-all metadata policy
+     *  and a single null Parser as a placeholder (see the TODO below).
+     *
+     * @param registry passed through to the superclass unchanged
+     * @param charsets names of the charsets to try
+     */
+    public PickBestTextEncodingParser(MediaTypeRegistry registry, String[] charsets) {
+        // TODO Actually give 1 more TXTParser than we have charsets
+        super(registry, MetadataPolicy.DISCARD_ALL, (Parser)null);
+        this.charsetsToTry = charsets;
+    }
+
+    /**
+     * Puts a {@link NonDetectingEncodingDetector} for the charset to try
+     *  onto the context, keyed by {@link EncodingDetector}, after first
+     *  delegating to the superclass.
+     * At present this is always the first entry of the charsets array,
+     *  so an empty array fails on the index, and a name that
+     *  {@link Charset#forName(String)} rejects fails with its unchecked
+     *  exception.
+     */
+    @Override
+    protected void parserPrepare(Parser parser, Metadata metadata,
+            ParseContext context) {
+        super.parserPrepare(parser, metadata, context);
+        
+        // Specify which charset to try
+        // TODO How to get the next one to try?
+        Charset charset = Charset.forName(charsetsToTry[0]);
+        context.set(EncodingDetector.class, 
+                    new NonDetectingEncodingDetector(charset));
+    }
+
+    /**
+     * Placeholder: records nothing yet, and ignores all of its arguments
+     *  (including any exception).
+     *
+     * @return always true, to have the next parser tried
+     */
+    @Override
+    protected boolean parserCompleted(Parser parser, Metadata metadata,
+            ContentHandler handler, Exception exception) {
+        // TODO How to get the current charset?
+        // TODO Record the text
+        // TODO If this was the last real charset, see which one is best
+        
+        // Always have the next parser tried
+        return true;
+    }
+
+    /**
+     * Currently a straight pass-through to the superclass parse, with
+     *  the same stream, handler, metadata and context.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata originalMetadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // TODO Create our own ContentHandlerFactory
+        // This will give a BodyContentHandler for each of the charset
+        //  tests, then their real ContentHandler for the last one
+        
+        // TODO Have the parsing done with our ContentHandlerFactory instead
+        super.parse(stream, handler, originalMetadata, context);
+    }
+}

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 03/03: Some (currently failing) Supplemental Parser tests

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6a39214cc8303d393d0c5c288a973398d25a94c3
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 07:01:36 2018 +0000

    Some (currently failing) Supplemental Parser tests
---
 .../tika/parser/multiple/MultipleParserTest.java   | 83 +++++++++++++++++++++-
 1 file changed, 80 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
index b3166eb..9de0a99 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -23,6 +23,7 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.tika.parser.DummyParser;
@@ -44,6 +45,11 @@ public class MultipleParserTest {
     @Test
     public void testMimeTypeSupported() {
         // TODO
+        // Some media types
+        //  (fixtures only so far - nothing in this method asserts on them yet)
+        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+        Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
+                MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
+        // TODO One with a subtype
     }
     
     /**
@@ -59,8 +65,6 @@ public class MultipleParserTest {
         
         // Some media types
         Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
-        Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
-                MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
         
         // Some parsers
         ErrorParser pFail = new ErrorParser();
@@ -106,6 +110,79 @@ public class MultipleParserTest {
      */
     @Test
     public void testSupplemental() throws Exception {
-        // TODO 
+        // Runs two scenarios (a single parser, then three parsers with the
+        //  first-wins policy) and checks the text, the metadata and the
+        //  X-Parsed-By values that come back. The remaining cases are TODOs.
+        ParseContext context = new ParseContext();
+        BodyContentHandler handler;
+        Metadata metadata;
+        Parser p;
+        String[] usedParsers;
+        
+        // Some media types
+        Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+        
+        // Some test metadata
+        //  (TBoth is set by both parsers with different values, so it shows
+        //   which parser's metadata was kept)
+        Map<String,String> m1 = new HashMap<>();
+        m1.put("T1","Test1");
+        m1.put("TBoth","Test1");
+        Map<String,String> m2 = new HashMap<>();
+        m2.put("T2","Test2");
+        m2.put("TBoth","Test2");
+        
+        // Some parsers
+        //  (pNothing is declared but not used by anything below yet)
+        ErrorParser pFail = new ErrorParser();
+        DummyParser pContent1 = new DummyParser(onlyOct, m1, "Fell back 1!");
+        DummyParser pContent2 = new DummyParser(onlyOct, m2, "Fell back 2!");
+        EmptyParser pNothing = new EmptyParser();
+        
+        
+        // With only one parser defined, works as normal
+        // NOTE(review): despite the test name, both scenarios in this method
+        //  construct a FallbackParser - presumably SupplementingParser was
+        //  intended, which may be why these tests are described as
+        //  "currently failing". TODO confirm
+        p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent1);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back 1!", handler.toString());
+        
+        assertEquals("Test1", metadata.get("T1"));
+        assertEquals("Test1", metadata.get("TBoth"));
+       
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(1, usedParsers.length);
+        assertEquals(DummyParser.class.getName(), usedParsers[0]);
+        
+        
+        // Check the First, Last and All policies
+        p = new FallbackParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2);
+
+        metadata = new Metadata();
+        handler = new BodyContentHandler();
+        p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+        assertEquals("Fell back 1!", handler.toString());
+        
+        assertEquals("Test1", metadata.get("T1"));
+        assertEquals("Test1", metadata.get("TBoth"));
+       
+        // NOTE(review): three X-Parsed-By values means all three parsers ran,
+        //  but FallbackParser.parserCompleted asks to stop as soon as a parser
+        //  finishes without an exception - so with a FallbackParser this looks
+        //  unreachable as written. TODO confirm
+        usedParsers = metadata.getValues("X-Parsed-By");
+        assertEquals(3, usedParsers.length);
+        assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+        assertEquals(DummyParser.class.getName(), usedParsers[1]);
+        assertEquals(DummyParser.class.getName(), usedParsers[2]);
+        
+        // TODO Other policies
+
+        
+        // Check the Discard policy
+        // First with the last parser being a "real" one
+        // TODO
+        
+        // Then with the last parser being one that emits no metadata
+        // TODO
+        
+        
+        // Check the error details always come through, no matter the policy
+        // TODO
+        
+        
+        // Check that each parser gets its own ContentHandler if a factory was given
+        // TODO
     }
 }

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.

[tika] 02/03: Give parserCompleted the ParseContext, use that to pass around for the pick-best-text case what charsets to try next and what text we got from them

Posted by ni...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 62b02b0af4bb9260dc9417b5537144a0744fa55a
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 06:42:30 2018 +0000

    Give parserCompleted the ParseContext, use that to pass around for the pick-best-text case what charsets to try next and what text we got from them
---
 .../parser/multiple/AbstractMultipleParser.java    |  6 +-
 .../tika/parser/multiple/FallbackParser.java       |  3 +-
 .../multiple/PickBestTextEncodingParser.java       | 87 +++++++++++++++++-----
 .../tika/parser/multiple/SupplementingParser.java  |  3 +-
 4 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 458697b..f99ad1a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -172,8 +172,8 @@ public abstract class AbstractMultipleParser extends AbstractParser {
      *  abort further parsing
      */
     protected abstract boolean parserCompleted(
-            Parser parser, Metadata metadata, 
-            ContentHandler handler, Exception exception);
+            Parser parser, Metadata metadata, ContentHandler handler, 
+            ParseContext context, Exception exception);
     
     /**
      * Processes the given Stream through one or more parsers, 
@@ -228,7 +228,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 }
                 
                 // Notify the implementation how it went
-                boolean tryNext = parserCompleted(p, metadata, handler, failure);
+                boolean tryNext = parserCompleted(p, metadata, handler, context, failure);
                 
                 // Handle metadata merging / clashes
                 metadata = mergeMetadata(metadata, lastMetadata, policy);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
index 97a8aaf..dc7659f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
@@ -21,6 +21,7 @@ import java.util.List;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.xml.sax.ContentHandler;
 
@@ -57,7 +58,7 @@ public class FallbackParser extends AbstractMultipleParser {
 
     @Override
     protected boolean parserCompleted(Parser parser, Metadata metadata,
-            ContentHandler handler, Exception exception) {
+            ContentHandler handler, ParseContext context, Exception exception) {
         // If there was no exception, abort further parsers
         if (exception == null) return false;
         
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
index 80e41ae..70f8d0a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
@@ -19,6 +19,9 @@ package org.apache.tika.parser.multiple;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.tika.detect.EncodingDetector;
@@ -26,6 +29,7 @@ import org.apache.tika.detect.NonDetectingEncodingDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.xml.sax.ContentHandler;
@@ -55,22 +59,20 @@ public class PickBestTextEncodingParser extends AbstractMultipleParser {
      */
     private String[] charsetsToTry;
     
-    /**
-     * What charset we felt was best
-     * TODO Does this need to be thread-safe?
-     */
-    private String pickedCharset;
-    /**
-     * What text we got for each charset, so we can test for the best
-     * TODO Does this need to be thread-safe?
-     */
-    private Map<String,String> charsetText;
-
     public PickBestTextEncodingParser(MediaTypeRegistry registry, String[] charsets) {
         // TODO Actually give 1 more TXTParser than we have charsets
-        super(registry, MetadataPolicy.DISCARD_ALL, (Parser)null);
+        super(registry, MetadataPolicy.DISCARD_ALL, makeParsers(charsets));
         this.charsetsToTry = charsets;
     }
+    /**
+     * Builds the list of parsers handed to the superclass: one per
+     *  charset to try, plus one more for the real parse.
+     * Every entry is currently a placeholder {@link EmptyParser}.
+     *
+     * @param charsets the names of the charsets that will be tried
+     * @return a new list holding charsets.length+1 parsers
+     */
+    private static List<Parser> makeParsers(String[] charsets) {
+        // One more TXTParser than we have charsets, for the real thing
+        List<Parser> parsers = new ArrayList<>(charsets.length+1);
+        for (int i=0; i<charsets.length+1; i++) {
+            // TODO Actually get the right parser, TXTParser
+            // Has to be add() rather than set(): the constructor argument
+            //  above is only an initial capacity, so the list is still
+            //  empty and set() would throw IndexOutOfBoundsException
+            parsers.add(new EmptyParser());
+        }
+        return parsers;
+    }
 
     @Override
     protected void parserPrepare(Parser parser, Metadata metadata,
@@ -78,18 +80,29 @@ public class PickBestTextEncodingParser extends AbstractMultipleParser {
         super.parserPrepare(parser, metadata, context);
         
         // Specify which charset to try
-        // TODO How to get the next one to try?
-        Charset charset = Charset.forName(charsetsToTry[0]);
+        String charset = context.get(CharsetTester.class).getNextCharset();
+        Charset charsetCS = Charset.forName(charset);
         context.set(EncodingDetector.class, 
-                    new NonDetectingEncodingDetector(charset));
+                    new NonDetectingEncodingDetector(charsetCS));
     }
 
     @Override
     protected boolean parserCompleted(Parser parser, Metadata metadata,
-            ContentHandler handler, Exception exception) {
-        // TODO How to get the current charset?
-        // TODO Record the text
-        // TODO If this was the last real charset, see which one is best
+            ContentHandler handler, ParseContext context, Exception exception) {
+        // Get the current charset
+        CharsetTester charsetTester = context.get(CharsetTester.class); 
+        String charset = charsetTester.getCurrentCharset();
+        
+        // Record the text
+        if (charsetTester.stillTesting()) {
+            charsetTester.charsetText.put(charset, handler.toString());
+            
+            // If this was the last real charset, see which one is best
+            if (! charsetTester.moreToTest()) {
+                // TODO Properly work out the best!
+                charsetTester.pickedCharset = charsetsToTry[0];
+            }
+        }
         
         // Always have the next parser tried
         return true;
@@ -103,7 +116,43 @@ public class PickBestTextEncodingParser extends AbstractMultipleParser {
         // This will give a BodyContentHandler for each of the charset
         //  tests, then their real ContentHandler for the last one
         
+        // Put something on the ParseContext to get the charset
+        context.set(CharsetTester.class, new CharsetTester());
+        
         // TODO Have the parsing done with our ContentHandlerFactory instead
         super.parse(stream, handler, originalMetadata, context);
     }
+    
+    /**
+     * Per-parse state for the charset trials: a cursor into the outer
+     *  parser's charsets array, the text captured for each charset, and
+     *  the charset picked at the end.
+     * A fresh instance is put on the ParseContext by parse(), and then
+     *  fetched back from it by parserPrepare and parserCompleted.
+     * This is an inner (non-static) class as it reads the enclosing
+     *  parser's charsets array directly.
+     */
+    protected class CharsetTester {
+        /**
+         * Our current charset's index
+         * Starts at -1, so getNextCharset() must be called before
+         *  getCurrentCharset() or the array lookup is out of bounds
+         */
+        private int index = -1;
+        
+        /**
+         * What charset we felt was best
+         * Stays null until parserCompleted assigns it
+         */
+        private String pickedCharset;
+        /**
+         * What text we got for each charset, so we can test for the best
+         * Keyed by charset name
+         */
+        private Map<String,String> charsetText = new HashMap<>();
+        
+        /**
+         * Moves on to the next charset, and returns it
+         *  (see getCurrentCharset() for what comes back)
+         */
+        protected String getNextCharset() {
+            index++;
+            return getCurrentCharset();
+        }
+        /**
+         * Returns the charset at the current index while there are
+         *  still charsets left to try, otherwise the picked charset
+         *  (which is null if nothing has been picked yet)
+         */
+        protected String getCurrentCharset() {
+            if (index < charsetsToTry.length) {
+                return charsetsToTry[index];
+            }
+            return pickedCharset;
+        }
+        /**
+         * True while the current index is still within the charsets to
+         *  try, ie we have not yet moved past the last one
+         */
+        protected boolean stillTesting() {
+            return index < charsetsToTry.length;
+        }
+        /**
+         * True if at least one charset remains after the current one
+         */
+        protected boolean moreToTest() {
+            return index < charsetsToTry.length-1;
+        }
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
index 7eab004..1f0e3ca 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
@@ -24,6 +24,7 @@ import java.util.List;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
 import org.xml.sax.ContentHandler;
@@ -72,7 +73,7 @@ public class SupplementingParser extends AbstractMultipleParser {
 
     @Override
     protected boolean parserCompleted(Parser parser, Metadata metadata,
-            ContentHandler handler, Exception exception) {
+            ContentHandler handler, ParseContext context, Exception exception) {
         // If there was no exception, just carry on to the next
         if (exception == null) return true;
         

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.