You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/10/25 21:29:32 UTC

any23 git commit: ANY23-411 fix encoding detector

Repository: any23
Updated Branches:
  refs/heads/master 6f1266a9a -> 0aa3d54c4


ANY23-411 fix encoding detector


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0aa3d54c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0aa3d54c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0aa3d54c

Branch: refs/heads/master
Commit: 0aa3d54c41aa90d6dce5aa790f6f490c82e7c7f3
Parents: 6f1266a
Author: Hans <fi...@gmail.com>
Authored: Thu Oct 25 16:19:09 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Thu Oct 25 16:19:09 2018 -0500

----------------------------------------------------------------------
 .../apache/any23/encoding/EncodingDetector.java | 13 ++++++++
 .../extractor/SingleDocumentExtraction.java     |  2 +-
 .../any23/encoding/TikaEncodingDetector.java    | 32 +++++++++++++++++++-
 3 files changed, 45 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java b/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
index 9e4cf2b..b9de1ba 100644
--- a/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
+++ b/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
@@ -37,4 +37,17 @@ public interface EncodingDetector {
      */
     String guessEncoding(InputStream input) throws IOException;
 
+    /**
+     * Guesses the data encoding.
+     *
+     * @param input the input stream containing the data.
+     * @param contentType the declared content type of the data.
+     * @return a string compliant with
+     *         <a href="http://www.iana.org/assignments/character-sets">IANA Charset Specification</a>.
+     * @throws IOException if there is an error whilst guessing the encoding.
+     */
+    default String guessEncoding(InputStream input, String contentType) throws IOException {
+        return guessEncoding(input);
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index 77ed28c..e84ab61 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -572,7 +572,7 @@ public class SingleDocumentExtraction {
         try {
             ensureHasLocalCopy();
             InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
-            String encoding = this.encoderDetector.guessEncoding(is);
+            String encoding = this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType());
             is.close();
             return encoding;
         } catch (Exception e) {

http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
----------------------------------------------------------------------
diff --git a/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java b/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
index 066de33..10cc34b 100644
--- a/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
+++ b/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
@@ -18,6 +18,7 @@
 package org.apache.any23.encoding;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.html.HtmlEncodingDetector;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
@@ -38,11 +39,18 @@ import java.util.regex.Pattern;
  *
  * @author Michele Mostarda ( michele.mostarda@gmail.com )
  * @author Davide Palmisano ( dpalmisano@gmail.com )
+ * @author Hans Brende (hansbrende@apache.org)
  * @version $Id$
  */
 public class TikaEncodingDetector implements EncodingDetector {
 
-    public String guessEncoding(InputStream is) throws IOException {
+    @Override
+    public String guessEncoding(InputStream input) throws IOException {
+        return guessEncoding(input, null);
+    }
+
+    @Override
+    public String guessEncoding(InputStream is, String contentType) throws IOException {
         if (!is.markSupported()) {
             is = new BufferedInputStream(is);
         }
@@ -54,6 +62,22 @@ public class TikaEncodingDetector implements EncodingDetector {
         Charset htmlCharset = htmlEncodingDetector.detect(is, new Metadata());
 
         CharsetDetector charsetDetector = new CharsetDetector(65536);
+
+        String incomingCharset = null;
+        if (contentType != null) {
+            MediaType mt = MediaType.parse(contentType);
+            if (mt != null) {
+                incomingCharset = mt.getParameters().get("charset");
+            }
+        }
+
+        if (incomingCharset != null) {
+            incomingCharset = CharsetUtils.clean(incomingCharset);
+            if (incomingCharset != null) {
+                charsetDetector.setDeclaredEncoding(incomingCharset);
+            }
+        }
+
         //enableInputFilter() needs to precede setText() to have any effect
         charsetDetector.enableInputFilter(true);
         charsetDetector.setText(is);
@@ -64,9 +88,15 @@ public class TikaEncodingDetector implements EncodingDetector {
             try {
                 Charset charset = CharsetUtils.forName(match.getName());
                 int confidence = match.getConfidence();
+                if (StandardCharsets.UTF_8.equals(charset)) {
+                    confidence *= 4;
+                }
                 if (charset.equals(htmlCharset) || charset.equals(xmlCharset)) {
                     confidence *= 16;
                 }
+                if (charset.name().equals(incomingCharset)) {
+                    confidence *= 16;
+                }
                 if (confidence > bestConfidence) {
                     bestCharset = charset;
                     bestConfidence = confidence;