You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/27 00:41:19 UTC
[3/3] tika git commit: TIKA-2041,
upgrade ICU4j's charset detector to avoid multithreading bug.
TIKA-2041, upgrade ICU4j's charset detector to avoid multithreading bug.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9f6c71fa
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9f6c71fa
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9f6c71fa
Branch: refs/heads/2.x
Commit: 9f6c71fa69eaae558aff85cfa0dce72bca08fd4e
Parents: f89887d
Author: tballison <ta...@mitre.org>
Authored: Tue Jul 26 20:41:10 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jul 26 20:41:10 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 3 +
.../apache/tika/parser/txt/CharsetDetector.java | 437 ++++-----
.../apache/tika/parser/txt/CharsetMatch.java | 170 ++--
.../tika/parser/txt/CharsetRecog_2022.java | 28 +-
.../tika/parser/txt/CharsetRecog_UTF8.java | 24 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 99 +-
.../tika/parser/txt/CharsetRecog_mbcs.java | 44 +-
.../tika/parser/txt/CharsetRecog_sbcs.java | 903 +++++++++----------
.../tika/parser/txt/CharsetRecognizer.java | 31 +-
.../tika/parser/txt/Icu4jEncodingDetector.java | 8 +
.../apache/tika/parser/html/HtmlParserTest.java | 112 +++
11 files changed, 964 insertions(+), 895 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e5b5050..abfbdec 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
Release 1.14 - ???
+ * Upgrade ICU4J charset detection components to fix multithreading
+ bug (TIKA-2041).
+
* Upgrade to Jackcess 2.1.4 (TIKA-2039).
* Maintain more significant digits in cells of "General" format
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index f9df9e0..1ee7f28 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/**
* ******************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
+ * Copyright (C) 2005-2016, International Business Machines Corporation and *
* others. All Rights Reserved. *
* ******************************************************************************
*/
@@ -9,30 +11,37 @@ package org.apache.tika.parser.txt;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
-import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.List;
/**
+ * NOTE: This was copied from ICU4J with two modifications:
+ * Apache Tika added the EBCDIC-500 family of detectors, and
+ * we increased the buffer to 12000 bytes.
+ *
+ * <p>
+ *
* <code>CharsetDetector</code> provides a facility for detecting the
* charset or encoding of character data in an unknown format.
* The input data can either be from an input stream or an array of bytes.
* The result of the detection operation is a list of possibly matching
* charsets, or, for simple use, you can just ask for a Java Reader that
* will work over the input data.
- * <p/>
+ * <p>
* Character set detection is at best an imprecise operation. The detection
* process will attempt to identify the charset that best matches the characteristics
* of the byte data, but the process is partly statistical in nature, and
* the results can not be guaranteed to always be correct.
- * <p/>
+ * <p>
* For best accuracy in charset detection, the input data should be primarily
* in a single language, and a minimum of a few hundred bytes worth of plain text
* in the language are needed. The detection process will attempt to
* ignore html or xml style markup that could otherwise obscure the content.
- * <p/>
+ * <p>
+ *
* @stable ICU 3.4
*/
public class CharsetDetector {
@@ -47,13 +56,58 @@ public class CharsetDetector {
// actually choose the "real" charset. All assuming that the application just
// wants the data, and doesn't care about a char set name.
- private static final int kBufSize = 12000;
- private static final int MAX_CONFIDENCE = 100;
- private static String[] fCharsetNames;
+ private static final int kBufSize = 12000;//legacy value; more recent value is 8000
/*
* List of recognizers for all charsets known to the implementation.
*/
- private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+ private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;
+
+ static {
+ List<CSRecognizerInfo> list = new ArrayList<>();
+
+ list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));
+
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));
+
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));
+
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl(), true));
+
+ // IBM 420/424 recognizers are disabled by default
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));
+
+ ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
+ }
+
/*
* The following items are accessed by individual CharsetRecognizers during
* the recognition process
@@ -61,26 +115,27 @@ public class CharsetDetector {
*/
byte[] fInputBytes = // The text to be checked. Markup will have been
new byte[kBufSize]; // removed if appropriate.
- int fInputLen; // Length of the byte data in fInputText.
+ int fInputLen; // Length of the byte data in fInputBytes.
short fByteStats[] = // byte frequency statistics for the input text.
new short[256]; // Value is percent, not absolute.
boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
false;
String fDeclaredEncoding;
- //
- // Stuff private to CharsetDetector
- //
byte[] fRawInput; // Original, untouched input bytes.
// If user gave us a byte array, this is it.
// If user gave us a stream, it's read to a
// buffer here.
int fRawLength; // Length of data in fRawInput array.
InputStream fInputStream; // User's input stream, or null if the user
- boolean fStripTags = // If true, setText() will strip tags from input text.
+ //
+ // Stuff private to CharsetDetector
+ //
+ private boolean fStripTags = // If true, setText() will strip tags from input text.
false;
+ private boolean[] fEnabledRecognizers; // If not null, active set of charset recognizers had
/**
- * Constructor
+ * Constructor
*
* @stable ICU 3.4
*/
@@ -88,149 +143,73 @@ public class CharsetDetector {
}
/**
- * Get the names of all char sets that can be recognized by the char set detector.
- *
- * @return an array of the names of all charsets that can be recognized
- * by the charset detector.
+ * Get the names of all charsets supported by <code>CharsetDetector</code> class.
+ * <p>
+ * <b>Note:</b> Multiple different charset encodings in a same family may use
+ * a single shared name in this implementation. For example, this method returns
+ * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
+ * (Windows Latin 1). However, actual detection result could be "windows-1252"
+ * when the input data matches Latin 1 code points with any points only available
+ * in "windows-1252".
*
+ * @return an array of the names of all charsets supported by
+ * <code>CharsetDetector</code> class.
* @stable ICU 3.4
*/
public static String[] getAllDetectableCharsets() {
- return fCharsetNames;
- }
-
- /*
- * Create the singleton instances of the CharsetRecognizer classes
- */
- private static ArrayList<CharsetRecognizer> createRecognizers() {
- ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-
- recognizers.add(new CharsetRecog_UTF8());
-
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
- // Create an array of all charset names, as a side effect.
- // Needed for the getAllDetectableCharsets() API.
- String[] charsetNames = new String[recognizers.size()];
- int out = 0;
-
- for (CharsetRecognizer recognizer : recognizers) {
- String name = recognizer.getName();
-
- if (out == 0 || !name.equals(charsetNames[out - 1])) {
- charsetNames[out++] = name;
- }
+ String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
+ for (int i = 0; i < allCharsetNames.length; i++) {
+ allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
}
-
- fCharsetNames = new String[out];
- System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-
- return recognizers;
+ return allCharsetNames;
}
/**
* Set the declared encoding for charset detection.
- * The declared encoding of an input text is an encoding obtained
- * from an http header or xml declaration or similar source that
- * can be provided as additional information to the charset detector.
- * A match between a declared encoding and a possible detected encoding
- * will raise the quality of that detected encoding by a small delta,
- * and will also appear as a "reason" for the match.
- * <p/>
+ * The declared encoding of an input text is an encoding obtained
+ * from an http header or xml declaration or similar source that
+ * can be provided as additional information to the charset detector.
+ * A match between a declared encoding and a possible detected encoding
+ * will raise the quality of that detected encoding by a small delta,
+ * and will also appear as a "reason" for the match.
+ * <p>
* A declared encoding that is incompatible with the input data being
* analyzed will not be added to the list of possible encodings.
*
- * @param encoding The declared encoding
- *
+ * @param encoding The declared encoding
* @stable ICU 3.4
*/
public CharsetDetector setDeclaredEncoding(String encoding) {
- setCanonicalDeclaredEncoding(encoding);
+ fDeclaredEncoding = encoding;
return this;
}
+ // Value is rounded up, so zero really means zero occurrences.
/**
* Set the input text (byte) data whose charset is to be detected.
*
* @param in the input text of unknown encoding
- *
* @return This CharsetDetector
- *
* @stable ICU 3.4
*/
public CharsetDetector setText(byte[] in) {
fRawInput = in;
fRawLength = in.length;
- MungeInput();
-
return this;
}
- // Value is rounded up, so zero really means zero occurences.
/**
* Set the input text (byte) data whose charset is to be detected.
- * <p/>
- * The input stream that supplies the character data must have markSupported()
- * == true; the charset detection process will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
+ * <p>
+ * The input stream that supplies the character data must have markSupported()
+ * == true; the charset detection process will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
*
* @param in the input text of unknown encoding
- *
* @return This CharsetDetector
- *
* @stable ICU 3.4
*/
@@ -259,21 +238,20 @@ public class CharsetDetector {
/**
* Return the charset that best matches the supplied input data.
- *
+ * <p>
* Note though, that because the detection
* only looks at the start of the input data,
* there is a possibility that the returned charset will fail to handle
* the full set of input data.
- * <p/>
+ * <p>
* Raise an exception if
- * <ul>
- * <li>no charset appears to match the data.</li>
- * <li>no input text has been provided</li>
- * </ul>
+ * <ul>
+ * <li>no charset appears to match the data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
*
* @return a CharsetMatch object representing the best matching charset, or
- * <code>null</code> if there are no matches.
- *
+ * <code>null</code> if there are no matches.
* @stable ICU 3.4
*/
public CharsetMatch detect() {
@@ -291,48 +269,36 @@ public class CharsetDetector {
}
/**
- * Return an array of all charsets that appear to be plausible
- * matches with the input data. The array is ordered with the
- * best quality match first.
- * <p/>
+ * Return an array of all charsets that appear to be plausible
+ * matches with the input data. The array is ordered with the
+ * best quality match first.
+ * <p>
* Raise an exception if
- * <ul>
- * <li>no charsets appear to match the input data.</li>
- * <li>no input text has been provided</li>
- * </ul>
+ * <ul>
+ * <li>no charsets appear to match the input data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
*
* @return An array of CharsetMatch objects representing possibly matching charsets.
- *
* @stable ICU 3.4
*/
public CharsetMatch[] detectAll() {
- CharsetRecognizer csr;
- int i;
- int detectResults;
- int confidence;
- ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
+ ArrayList<CharsetMatch> matches = new ArrayList<>();
+
+ MungeInput(); // Strip html markup, collect byte stats.
// Iterate over all possible charsets, remember all that
// give a match quality > 0.
- for (i = 0; i < fCSRecognizers.size(); i++) {
- csr = fCSRecognizers.get(i);
- detectResults = csr.match(this);
- confidence = detectResults & 0x000000ff;
- if (confidence > 0) {
- // Just to be safe, constrain
- confidence = Math.min(confidence, MAX_CONFIDENCE);
-
- // Apply charset hint.
- if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
- // Reduce lack of confidence (delta between "sure" and current) by 50%.
- confidence += (MAX_CONFIDENCE - confidence) / 2;
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+ boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
+ if (active) {
+ CharsetMatch m = rcinfo.recognizer.match(this);
+ if (m != null) {
+ matches.add(m);
}
-
- CharsetMatch m = new CharsetMatch(this, csr, confidence);
- matches.add(m);
}
}
-
Collections.sort(matches); // CharsetMatch compares on confidence
Collections.reverse(matches); // Put best match first.
CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
@@ -343,27 +309,25 @@ public class CharsetDetector {
/**
* Autodetect the charset of an inputStream, and return a Java Reader
* to access the converted input data.
- * <p/>
+ * <p>
* This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
- * <p/>
- * For the input stream that supplies the character data, markSupported()
- * must be true; the charset detection will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
- *<p/>
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+ * <p>
+ * For the input stream that supplies the character data, markSupported()
+ * must be true; the charset detection will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ * <p>
* Raise an exception if no charsets appear to match the input data.
*
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
+ * @param in The source of the byte data in the unknown charset.
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
* @stable ICU 3.4
*/
public Reader getReader(InputStream in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
+ fDeclaredEncoding = declaredEncoding;
try {
setText(in);
@@ -379,25 +343,24 @@ public class CharsetDetector {
return null;
}
}
+ // gave us a byte array.
/**
* Autodetect the charset of an inputStream, and return a String
* containing the converted input data.
- * <p/>
+ * <p>
* This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
- *<p/>
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+ * <p>
* Raise an exception if no charsets appear to match the input data.
*
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
+ * @param in The source of the byte data in the unknown charset.
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
* @stable ICU 3.4
*/
public String getString(byte[] in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
+ fDeclaredEncoding = declaredEncoding;
try {
setText(in);
@@ -413,30 +376,27 @@ public class CharsetDetector {
return null;
}
}
- // gave us a byte array.
/**
* Test whether or not input filtering is enabled.
*
* @return <code>true</code> if input text will be filtered.
- *
- * @see #enableInputFilter
- *
* @stable ICU 3.4
+ * @see #enableInputFilter
*/
public boolean inputFilterEnabled() {
return fStripTags;
}
+ // been changed from the default. The array index is
+ // corresponding to ALL_RECOGNIZER. See setDetectableCharset().
/**
* Enable filtering of input text. If filtering is enabled,
- * text within angle brackets ("<" and ">") will be removed
+ * text within angle brackets ("<" and ">") will be removed
* before detection.
*
* @param filter <code>true</code> to enable input text filtering.
- *
* @return The previous setting.
- *
* @stable ICU 3.4
*/
public boolean enableInputFilter(boolean filter) {
@@ -447,22 +407,6 @@ public class CharsetDetector {
return previous;
}
- /**
- * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
- *
- * @param encoding - name of character encoding
- */
- private void setCanonicalDeclaredEncoding(String encoding) {
- if ((encoding == null) || encoding.isEmpty()) {
- return;
- }
-
- Charset cs = Charset.forName(encoding);
- if (cs != null) {
- fDeclaredEncoding = cs.name();
- }
- }
-
/*
* MungeInput - after getting a set of raw input data to be analyzed, preprocess
* it by removing what appears to be html markup.
@@ -541,4 +485,83 @@ public class CharsetDetector {
}
}
}
-}
+
+ /**
+ * Get the names of charsets that can be recognized by this CharsetDetector instance.
+ *
+ * @return an array of the names of charsets that can be recognized by this CharsetDetector
+ * instance.
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ public String[] getDetectableCharsets() {
+ List<String> csnames = new ArrayList<>(ALL_CS_RECOGNIZERS.size());
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+ boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
+ if (active) {
+ csnames.add(rcinfo.recognizer.getName());
+ }
+ }
+ return csnames.toArray(new String[csnames.size()]);
+ }
+
+ /**
+ * Enable or disable individual charset encoding.
+ * A name of charset encoding must be included in the names returned by
+ * {@link #getAllDetectableCharsets()}.
+ *
+ * @param encoding the name of charset encoding.
+ * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
+ * charset encoding.
+ * @return A reference to this <code>CharsetDetector</code>.
+ * @throws IllegalArgumentException when the name of charset encoding is
+ * not supported.
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
+ int modIdx = -1;
+ boolean isDefaultVal = false;
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
+ if (csrinfo.recognizer.getName().equals(encoding)) {
+ modIdx = i;
+ isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
+ break;
+ }
+ }
+ if (modIdx < 0) {
+ // No matching encoding found
+ throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
+ }
+
+ if (fEnabledRecognizers == null && !isDefaultVal) {
+ // Create an array storing the non default setting
+ fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
+
+ // Initialize the array with default info
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
+ }
+ }
+
+ if (fEnabledRecognizers != null) {
+ fEnabledRecognizers[modIdx] = enabled;
+ }
+
+ return this;
+ }
+
+ private static class CSRecognizerInfo {
+ CharsetRecognizer recognizer;
+ boolean isDefaultEnabled;
+
+ CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
+ this.recognizer = recognizer;
+ this.isDefaultEnabled = isDefaultEnabled;
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 22219ab..40a10ce 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/**
* ******************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and *
+ * Copyright (C) 2005-2016, International Business Machines Corporation and *
* others. All Rights Reserved. *
* ******************************************************************************
*/
@@ -18,63 +20,56 @@ import java.io.Reader;
* as a possible encoding for a set of input data. From an instance of this
* class, you can ask for a confidence level in the charset identification,
* or for Java Reader or String to access the original byte data in Unicode form.
- * <p/>
+ * <p>
* Instances of this class are created only by CharsetDetectors.
- * <p/>
+ * <p>
* Note: this class has a natural ordering that is inconsistent with equals.
- * The natural ordering is based on the match confidence value.
+ * The natural ordering is based on the match confidence value.
*
* @stable ICU 3.4
*/
public class CharsetMatch implements Comparable<CharsetMatch> {
- /**
- * Bit flag indicating the match is based on the the encoding scheme.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int ENCODING_SCHEME = 1;
- /**
- * Bit flag indicating the match is based on the presence of a BOM.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int BOM = 2;
- /**
- * Bit flag indicating he match is based on the declared encoding.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int DECLARED_ENCODING = 4;
- /**
- * Bit flag indicating the match is based on language statistics.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int LANG_STATISTICS = 8;
//
// Private Data
//
private int fConfidence;
- private CharsetRecognizer fRecognizer;
private byte[] fRawInput = null; // Original, untouched input bytes.
// If user gave us a byte array, this is it.
private int fRawLength; // Length of data in fRawInput array.
private InputStream fInputStream = null; // User's input stream, or null if the user
+ private String fCharsetName; // The name of the charset this CharsetMatch
+ // represents. Filled in by the recognizer.
+ private String fLang; // The language, if one was determined by
/*
* Constructor. Implementation internal
*/
CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
- fRecognizer = rec;
fConfidence = conf;
- // The references to the original aplication input data must be copied out
+ // The references to the original application input data must be copied out
+ // of the charset recognizer to here, in case the application resets the
+ // recognizer before using this CharsetMatch.
+ if (det.fInputStream == null) {
+ // We only want the existing input byte data if it came straight from the user,
+ // not if is just the head of a stream.
+ fRawInput = det.fRawInput;
+ fRawLength = det.fRawLength;
+ }
+ fInputStream = det.fInputStream;
+ fCharsetName = rec.getName();
+ fLang = rec.getLanguage();
+ }
+
+ /*
+ * Constructor. Implementation internal
+ */
+ CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
+ fConfidence = conf;
+
+ // The references to the original application input data must be copied out
// of the charset recognizer to here, in case the application resets the
// recognizer before using this CharsetMatch.
if (det.fInputStream == null) {
@@ -84,19 +79,20 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
fRawLength = det.fRawLength;
}
fInputStream = det.fInputStream;
+ fCharsetName = csName;
+ fLang = lang;
}
/**
* Create a java.io.Reader for reading the Unicode character data corresponding
* to the original byte data supplied to the Charset detect operation.
- * <p/>
+ * <p>
* CAUTION: if the source of the byte data was an InputStream, a Reader
* can be created for only one matching char set using this method. If more
* than one charset needs to be tried, the caller will need to reset
* the InputStream and create InputStreamReaders itself, based on the charset name.
*
* @return the Reader for the Unicode character data.
- *
* @stable ICU 3.4
*/
public Reader getReader() {
@@ -119,10 +115,9 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
* to the original byte data supplied to the Charset detect operation.
*
* @return a String created from the converted input data.
- *
* @stable ICU 3.4
*/
- public String getString() throws java.io.IOException {
+ public String getString() throws IOException {
return getString(-1);
}
@@ -138,13 +133,12 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
* source of the data is an input stream, or -1 for
* unlimited length.
* @return a String created from the converted input data.
- *
* @stable ICU 3.4
*/
- public String getString(int maxLength) throws java.io.IOException {
+ public String getString(int maxLength) throws IOException {
String result = null;
if (fInputStream != null) {
- StringBuffer sb = new StringBuffer();
+ StringBuilder sb = new StringBuilder();
char[] buffer = new char[1024];
Reader reader = getReader();
int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
@@ -159,7 +153,17 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
return sb.toString();
} else {
- result = new String(fRawInput, getName());
+ String name = getName();
+ /*
+ * getName() may return a name with a suffix '_rtl' or '_ltr'. Such a name
+ * cannot be used to open a charset (e.g. IBM424_rtl). The trailing '_rtl' or
+ * '_ltr' should be stripped off before creating the string.
+ */
+ int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
+ if (startSuffix > 0) {
+ name = name.substring(0, startSuffix);
+ }
+ result = new String(fRawInput, name);
}
return result;
@@ -172,7 +176,6 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
* charset.
*
* @return the confidence in the charset match
- *
* @stable ICU 3.4
*/
public int getConfidence() {
@@ -180,26 +183,6 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
}
/**
- * Return flags indicating what it was about the input data
- * that caused this charset to be considered as a possible match.
- * The result is a bitfield containing zero or more of the flags
- * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
- * A result of zero means no information is available.
- * <p>
- * Note: currently, this method always returns zero.
- * <p>
- *
- * @return the type of match found for this charset.
- *
- * @draft ICU 3.4
- * @provisional This API might change or be removed in a future release.
- */
- public int getMatchType() {
-// TODO: create a list of enum-like constants for common combinations of types of matches.
- return 0;
- }
-
- /**
* Get the name of the detected charset.
* The name will be one that can be used with other APIs on the
* platform that accept charset names. It is the "Canonical name"
@@ -207,40 +190,38 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
* charsets that are registered with the IANA charset registry,
* this is the MIME-preferred registerd name.
*
- * @see java.nio.charset.Charset
- * @see java.io.InputStreamReader
- *
* @return The name of the charset.
- *
* @stable ICU 3.4
+ * @see java.nio.charset.Charset
+ * @see InputStreamReader
*/
public String getName() {
- return fRecognizer.getName();
+ return fCharsetName;
}
+ // gave us a byte array.
/**
* Get the ISO code for the language of the detected charset.
*
* @return The ISO code for the language or <code>null</code> if the language cannot be determined.
- *
* @stable ICU 3.4
*/
public String getLanguage() {
- return fRecognizer.getLanguage();
+ return fLang;
}
/**
* Compare to other CharsetMatch objects.
* Comparison is based on the match confidence value, which
- * allows CharsetDetector.detectAll() to order its results.
+ * allows CharsetDetector.detectAll() to order its results.
*
- * @param o the CharsetMatch object to compare against.
+ * @param other the CharsetMatch object to compare against.
* @return a negative integer, zero, or a positive integer as the
- * confidence level of this CharsetMatch
- * is less than, equal to, or greater than that of
- * the argument.
+ * confidence level of this CharsetMatch
+ * is less than, equal to, or greater than that of
+ * the argument.
* @throws ClassCastException if the argument is not a CharsetMatch.
- * @stable ICU 3.4
+ * @stable ICU 4.4
*/
public int compareTo(CharsetMatch other) {
int compareResult = 0;
@@ -251,36 +232,5 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
}
return compareResult;
}
-
- /**
- * compare this CharsetMatch to another based on confidence value
- * @param o the CharsetMatch object to compare against
- * @return true if equal
- */
- public boolean equals(Object o) {
- if (o instanceof CharsetMatch) {
- CharsetMatch that = (CharsetMatch) o;
- return (this.fConfidence == that.fConfidence);
- }
-
- return false;
- }
-
- /**
- * generates a hashCode based on the confidence value
- * @return the hashCode
- */
- public int hashCode() {
- return fConfidence;
- }
- // gave us a byte array.
-
- public String toString() {
- String s = "Match of " + fRecognizer.getName();
- if (fRecognizer.getLanguage() != null) {
- s += " in " + fRecognizer.getLanguage();
- }
- s += " with confidence " + fConfidence;
- return s;
- }
-}
+ // the recognizer during the detect operation.
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
index 129c9a8..d4805be 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
-* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
+* Copyright (C) 2005 - 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@@ -11,10 +13,8 @@ package org.apache.tika.parser.txt;
* This is a superclass for the individual detectors for
* each of the detectable members of the ISO 2022 family
* of encodings.
- * <p/>
+ * <p>
* The separate classes are nested within this class.
- *
- * @internal
*/
abstract class CharsetRecog_2022 extends CharsetRecognizer {
@@ -74,7 +74,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
//
// Initial quality is based on relative proportion of recongized vs.
- // unrecognized escape sequences.
+ // unrecognized escape sequences.
// All good: quality = 100;
// half or less good: quality = 0;
// linear inbetween.
@@ -114,8 +114,9 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
return "ISO-2022-JP";
}
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -128,10 +129,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
return "ISO-2022-KR";
}
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
-
}
static class CharsetRecog_2022CN extends CharsetRecog_2022 {
@@ -153,11 +154,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
return "ISO-2022-CN";
}
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
}
-
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
index 55a3957..a5100bc 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/**
* ******************************************************************************
- * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
+ * Copyright (C) 2005 - 2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
* ******************************************************************************
*/
@@ -8,8 +10,6 @@ package org.apache.tika.parser.txt;
/**
* Charset recognizer for UTF-8
- *
- * @internal
*/
class CharsetRecog_UTF8 extends CharsetRecognizer {
@@ -20,7 +20,7 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
/* (non-Javadoc)
* @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
*/
- int match(CharsetDetector det) {
+ CharsetMatch match(CharsetDetector det) {
boolean hasBOM = false;
int numValid = 0;
int numInvalid = 0;
@@ -50,10 +50,7 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
trailBytes = 3;
} else {
numInvalid++;
- if (numInvalid > 5) {
- break;
- }
- trailBytes = 0;
+ continue;
}
// Verify that we've got the right number of trail bytes in the sequence
@@ -72,7 +69,6 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
break;
}
}
-
}
// Cook up some sort of confidence score, based on presense of a BOM
@@ -87,13 +83,15 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
} else if (numValid > 0 && numInvalid == 0) {
confidence = 80;
} else if (numValid == 0 && numInvalid == 0) {
- // Plain ASCII.
- confidence = 10;
+ // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
+ // accepts ASCII with confidence = 10.
+ // TODO: add plain ASCII as an explicitly detected type.
+ confidence = 15;
} else if (numValid > numInvalid * 10) {
// Probably corruput utf-8 data. Valid sequences aren't likely by chance.
confidence = 25;
}
- return confidence;
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
index be6455f..a92acc1 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
@@ -1,20 +1,44 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
- * Copyright (C) 1996-2007, International Business Machines Corporation and *
+ * Copyright (C) 1996-2013, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*/
+
package org.apache.tika.parser.txt;
/**
* This class matches UTF-16 and UTF-32, both big- and little-endian. The
* BOM will be used if it is present.
- *
- * @internal
*/
abstract class CharsetRecog_Unicode extends CharsetRecognizer {
+ static int codeUnit16FromBytes(byte hi, byte lo) {
+ return ((hi & 0xff) << 8) | (lo & 0xff);
+ }
+
+ // UTF-16 confidence calculation. Very simple minded, but better than nothing.
+ // Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
+ // and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
+ // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
+ // NULs should be rare in actual text.
+ static int adjustConfidence(int codeUnit, int confidence) {
+ if (codeUnit == 0) {
+ confidence -= 10;
+ } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
+ confidence += 10;
+ }
+ if (confidence < 0) {
+ confidence = 0;
+ } else if (confidence > 100) {
+ confidence = 100;
+ }
+ return confidence;
+ }
+
/* (non-Javadoc)
* @see com.ibm.icu.text.CharsetRecognizer#getName()
*/
@@ -23,22 +47,36 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
/* (non-Javadoc)
* @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
*/
- abstract int match(CharsetDetector det);
+ abstract CharsetMatch match(CharsetDetector det);
static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
String getName() {
return "UTF-16BE";
}
- int match(CharsetDetector det) {
+ CharsetMatch match(CharsetDetector det) {
byte[] input = det.fRawInput;
-
- if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
- return 100;
+ int confidence = 10;
+
+ int bytesToCheck = Math.min(input.length, 30);
+ for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) {
+ int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
+ if (charIndex == 0 && codeUnit == 0xFEFF) {
+ confidence = 100;
+ break;
+ }
+ confidence = adjustConfidence(codeUnit, confidence);
+ if (confidence == 0 || confidence == 100) {
+ break;
+ }
}
-
- // TODO: Do some statistics to check for unsigned UTF-16BE
- return 0;
+ if (bytesToCheck < 4 && confidence < 100) {
+ confidence = 0;
+ }
+ if (confidence > 0) {
+ return new CharsetMatch(det, this, confidence);
+ }
+ return null;
}
}
@@ -47,20 +85,29 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
return "UTF-16LE";
}
- int match(CharsetDetector det) {
+ CharsetMatch match(CharsetDetector det) {
byte[] input = det.fRawInput;
-
- if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) {
- // An LE BOM is present.
- if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
- // It is probably UTF-32 LE, not UTF-16
- return 0;
+ int confidence = 10;
+
+ int bytesToCheck = Math.min(input.length, 30);
+ for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) {
+ int codeUnit = codeUnit16FromBytes(input[charIndex + 1], input[charIndex]);
+ if (charIndex == 0 && codeUnit == 0xFEFF) {
+ confidence = 100;
+ break;
+ }
+ confidence = adjustConfidence(codeUnit, confidence);
+ if (confidence == 0 || confidence == 100) {
+ break;
}
- return 100;
}
-
- // TODO: Do some statistics to check for unsigned UTF-16LE
- return 0;
+ if (bytesToCheck < 4 && confidence < 100) {
+ confidence = 0;
+ }
+ if (confidence > 0) {
+ return new CharsetMatch(det, this, confidence);
+ }
+ return null;
}
}
@@ -69,7 +116,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
abstract String getName();
- int match(CharsetDetector det) {
+ CharsetMatch match(CharsetDetector det) {
byte[] input = det.fRawInput;
int limit = (det.fRawLength / 4) * 4;
int numValid = 0;
@@ -78,7 +125,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
int confidence = 0;
if (limit == 0) {
- return 0;
+ return null;
}
if (getChar(input, 0) == 0x0000FEFF) {
hasBOM = true;
@@ -110,7 +157,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
confidence = 25;
}
- return confidence;
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -136,4 +183,4 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
return "UTF-32LE";
}
}
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
index 35d2b4f..3c38cd0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/*
****************************************************************************
- * Copyright (C) 2005-2008, International Business Machines Corporation and *
+ * Copyright (C) 2005-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
****************************************************************************
*
@@ -20,8 +22,6 @@ import java.util.Arrays;
* CharsetDetector class and kept in the global list of available
* encodings to be checked. The specific encoding being recognized
* is determined by subclass.
- *
- * @internal
*/
abstract class CharsetRecog_mbcs extends CharsetRecognizer {
@@ -46,7 +46,8 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
* bits 8-15: The match reason, an enum-like value.
*/
int match(CharsetDetector det, int[] commonChars) {
- int singleByteCharCount = 0;
+ @SuppressWarnings("unused")
+ int singleByteCharCount = 0; //TODO Do we really need this?
int doubleByteCharCount = 0;
int commonCharCount = 0;
int badCharCount = 0;
@@ -132,7 +133,7 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
* Get the next character (however many bytes it is) from the input data
* Subclasses for specific charset encodings must implement this function
* to get characters according to the rules of their encoding scheme.
- * <p/>
+ * <p>
* This function is not a method of class iteratedChar only because
* that would require a lot of extra derived classes, which is awkward.
*
@@ -156,14 +157,12 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
//
static class iteratedChar {
int charValue = 0; // 1-4 bytes from the raw input data
- int index = 0;
int nextIndex = 0;
boolean error = false;
boolean done = false;
void reset() {
charValue = 0;
- index = -1;
nextIndex = 0;
error = false;
done = false;
@@ -195,7 +194,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
it.error = false;
int firstByte;
firstByte = it.charValue = it.nextByte(det);
@@ -219,8 +217,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return true;
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
String getName() {
@@ -255,7 +254,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
it.error = false;
int firstByte;
firstByte = it.charValue = it.nextByte(det);
@@ -282,8 +280,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return true;
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
String getName() {
@@ -311,7 +310,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
* packed into an int.
*/
boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
it.error = false;
int firstByte = 0;
int secondByte = 0;
@@ -392,8 +390,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return "EUC-JP";
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
public String getLanguage() {
@@ -425,8 +424,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return "EUC-KR";
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
public String getLanguage() {
@@ -462,7 +462,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
* packed into an int.
*/
boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
it.error = false;
int firstByte = 0;
int secondByte = 0;
@@ -519,8 +518,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return "GB18030";
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
public String getLanguage() {
@@ -529,4 +529,4 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
}
-}
+}
\ No newline at end of file