You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/14 07:08:00 UTC

[tika] 01/03: Start on a multiple parser that would try several text encodings, pick the best and use that, to ensure it would be possible

This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 819898fbde33384844ebc6b2caa4e6c6986463cf
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 14 06:28:12 2018 +0000

    Start on a multiple parser that would try several text encodings, pick the best and use that, to ensure it would be possible
---
 .../parser/multiple/AbstractMultipleParser.java    |  10 ++
 .../multiple/PickBestTextEncodingParser.java       | 109 +++++++++++++++++++++
 2 files changed, 119 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 0aded0c..458697b 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -160,6 +160,13 @@ public abstract class AbstractMultipleParser extends AbstractParser {
     }
     
     /**
+     * Hook called just before a Parser is run, so that implementations
+     *  can prepare or change things first. Does nothing by default.
+     */
+    protected void parserPrepare(Parser parser, Metadata metadata,
+                                 ParseContext context) {}
+
+    /**
      * Used to notify implementations that a Parser has Finished
      *  or Failed, and to allow them to decide to continue or 
      *  abort further parsing
@@ -208,6 +215,9 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 // Prepare an near-empty Metadata, will merge after
                 metadata = cloneMetadata(originalMetadata);
                 
+                // Notify the implementation of what we're about to do
+                parserPrepare(p, metadata, context);
+
                 // Process if possible
                 Exception failure = null;
                 try {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
new file mode 100644
index 0000000..80e41ae
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Map;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.NonDetectingEncodingDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Inspired by TIKA-1443 and https://wiki.apache.org/tika/CompositeParserDiscussion,
+ *  this is intended to try several different text encodings, then do the
+ *  real text parsing based on whichever is "best".
+ *
+ * NOTE: only a skeleton so far - every run is given the first charset, no
+ *  text is recorded and no "best" is picked yet (see the TODOs below).
+ * This is not recommended for actual production use... It is mostly to
+ *  prove that the {@link AbstractMultipleParser} environment is
+ *  sufficient to support this use-case.
+ *
+ * @deprecated Currently not suitable for real use, more a demo / prototype!
+ */
+public class PickBestTextEncodingParser extends AbstractMultipleParser {
+    /**
+     * Serial version UID.
+     */
+    private static final long serialVersionUID = 730345169223211807L;
+    
+    /**
+     * Names of the charsets we should try, exactly as given to the constructor
+     */
+    private String[] charsetsToTry;
+    
+    /**
+     * What charset we felt was best. Not assigned anywhere in this class yet.
+     * TODO Does this need to be thread-safe?
+     */
+    private String pickedCharset;
+    /**
+     * What text we got for each charset (presumably keyed by charset name), so we
+     * can test for the best. Not assigned yet. TODO Does this need to be thread-safe?
+     */
+    private Map<String,String> charsetText;
+
+    public PickBestTextEncodingParser(MediaTypeRegistry registry, String[] charsets) {
+        // TODO Actually give 1 more TXTParser than we have charsets; for now a single null Parser is passed
+        super(registry, MetadataPolicy.DISCARD_ALL, (Parser)null);
+        this.charsetsToTry = charsets;
+    }
+
+    @Override
+    protected void parserPrepare(Parser parser, Metadata metadata,
+            ParseContext context) {
+        super.parserPrepare(parser, metadata, context);
+        
+        // Force the charset for this run by putting a fixed EncodingDetector in the context
+        // TODO How to get the next one to try? For now this is always the first charset
+        Charset charset = Charset.forName(charsetsToTry[0]);
+        context.set(EncodingDetector.class, 
+                    new NonDetectingEncodingDetector(charset));
+    }
+
+    @Override
+    protected boolean parserCompleted(Parser parser, Metadata metadata,
+            ContentHandler handler, Exception exception) {
+        // TODO How to get the current charset?
+        // TODO Record the text
+        // TODO If this was the last real charset, see which one is best
+        
+        // Always have the next parser tried (any exception passed in is ignored here)
+        return true;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata originalMetadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // TODO Create our own ContentHandlerFactory
+        // This will give a BodyContentHandler for each of the charset
+        //  tests, then their real ContentHandler for the last one
+        
+        // TODO Have the parsing done with our ContentHandlerFactory instead; for now just delegate unchanged
+        super.parse(stream, handler, originalMetadata, context);
+    }
+}

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.