You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/10/25 21:29:32 UTC
any23 git commit: ANY23-411 fix encoding detector
Repository: any23
Updated Branches:
refs/heads/master 6f1266a9a -> 0aa3d54c4
ANY23-411 fix encoding detector
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0aa3d54c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0aa3d54c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0aa3d54c
Branch: refs/heads/master
Commit: 0aa3d54c41aa90d6dce5aa790f6f490c82e7c7f3
Parents: 6f1266a
Author: Hans <fi...@gmail.com>
Authored: Thu Oct 25 16:19:09 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Thu Oct 25 16:19:09 2018 -0500
----------------------------------------------------------------------
.../apache/any23/encoding/EncodingDetector.java | 13 ++++++++
.../extractor/SingleDocumentExtraction.java | 2 +-
.../any23/encoding/TikaEncodingDetector.java | 32 +++++++++++++++++++-
3 files changed, 45 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
----------------------------------------------------------------------
diff --git a/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java b/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
index 9e4cf2b..b9de1ba 100644
--- a/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
+++ b/api/src/main/java/org/apache/any23/encoding/EncodingDetector.java
@@ -37,4 +37,17 @@ public interface EncodingDetector {
*/
String guessEncoding(InputStream input) throws IOException;
+ /**
+ * Guesses the data encoding; this default implementation ignores
+ * {@code contentType} and delegates to {@link #guessEncoding(InputStream)}.
+ * @param input the input stream containing the data.
+ * @param contentType the declared content type of the data; implementations may use it as a hint.
+ * @return a string compliant with the
+ * <a href="http://www.iana.org/assignments/character-sets">IANA Charset Specification</a>.
+ * @throws IOException if there is an error whilst guessing the encoding.
+ */
+ default String guessEncoding(InputStream input, String contentType) throws IOException {
+ return guessEncoding(input);
+ }
+
}
http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index 77ed28c..e84ab61 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -572,7 +572,7 @@ public class SingleDocumentExtraction {
try {
ensureHasLocalCopy();
InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
- String encoding = this.encoderDetector.guessEncoding(is);
+ String encoding = this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType());
is.close();
return encoding;
} catch (Exception e) {
http://git-wip-us.apache.org/repos/asf/any23/blob/0aa3d54c/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
----------------------------------------------------------------------
diff --git a/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java b/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
index 066de33..10cc34b 100644
--- a/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
+++ b/encoding/src/main/java/org/apache/any23/encoding/TikaEncodingDetector.java
@@ -18,6 +18,7 @@
package org.apache.any23.encoding;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
@@ -38,11 +39,18 @@ import java.util.regex.Pattern;
*
* @author Michele Mostarda ( michele.mostarda@gmail.com )
* @author Davide Palmisano ( dpalmisano@gmail.com )
+ * @author Hans Brende (hansbrende@apache.org)
* @version $Id$
*/
public class TikaEncodingDetector implements EncodingDetector {
- public String guessEncoding(InputStream is) throws IOException {
+ @Override
+ public String guessEncoding(InputStream input) throws IOException {
+ return guessEncoding(input, null); // null content type: no declared charset is used as a hint
+ }
+
+ @Override
+ public String guessEncoding(InputStream is, String contentType) throws IOException {
if (!is.markSupported()) {
is = new BufferedInputStream(is);
}
@@ -54,6 +62,22 @@ public class TikaEncodingDetector implements EncodingDetector {
Charset htmlCharset = htmlEncodingDetector.detect(is, new Metadata());
CharsetDetector charsetDetector = new CharsetDetector(65536);
+
+ String incomingCharset = null;
+ if (contentType != null) {
+ MediaType mt = MediaType.parse(contentType);
+ if (mt != null) {
+ incomingCharset = mt.getParameters().get("charset");
+ }
+ }
+
+ if (incomingCharset != null) {
+ incomingCharset = CharsetUtils.clean(incomingCharset);
+ if (incomingCharset != null) {
+ charsetDetector.setDeclaredEncoding(incomingCharset);
+ }
+ }
+
//enableInputFilter() needs to precede setText() to have any effect
charsetDetector.enableInputFilter(true);
charsetDetector.setText(is);
@@ -64,9 +88,15 @@ public class TikaEncodingDetector implements EncodingDetector {
try {
Charset charset = CharsetUtils.forName(match.getName());
int confidence = match.getConfidence();
+ if (StandardCharsets.UTF_8.equals(charset)) {
+ confidence *= 4;
+ }
if (charset.equals(htmlCharset) || charset.equals(xmlCharset)) {
confidence *= 16;
}
+ if (charset.name().equals(incomingCharset)) {
+ confidence *= 16;
+ }
if (confidence > bestConfidence) {
bestCharset = charset;
bestConfidence = confidence;