You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/27 00:41:19 UTC
[3/3] tika git commit: TIKA-2041, upgrade ICU4j's charset detector to avoid multithreading bug.

TIKA-2041, upgrade ICU4j's charset detector to avoid multithreading bug.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9f6c71fa
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9f6c71fa
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9f6c71fa

Branch: refs/heads/2.x
Commit: 9f6c71fa69eaae558aff85cfa0dce72bca08fd4e
Parents: f89887d
Author: tballison <ta...@mitre.org>
Authored: Tue Jul 26 20:41:10 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jul 26 20:41:10 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../apache/tika/parser/txt/CharsetDetector.java | 437 ++++-----
 .../apache/tika/parser/txt/CharsetMatch.java    | 170 ++--
 .../tika/parser/txt/CharsetRecog_2022.java      |  28 +-
 .../tika/parser/txt/CharsetRecog_UTF8.java      |  24 +-
 .../tika/parser/txt/CharsetRecog_Unicode.java   |  99 +-
 .../tika/parser/txt/CharsetRecog_mbcs.java      |  44 +-
 .../tika/parser/txt/CharsetRecog_sbcs.java      | 903 +++++++++----------
 .../tika/parser/txt/CharsetRecognizer.java      |  31 +-
 .../tika/parser/txt/Icu4jEncodingDetector.java  |   8 +
 .../apache/tika/parser/html/HtmlParserTest.java | 112 +++
 11 files changed, 964 insertions(+), 895 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e5b5050..abfbdec 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Upgrade ICU4J charset detection components to fix multithreading
+    bug (TIKA-2041).
+
   * Upgrade to Jackcess 2.1.4 (TIKA-2039).
 
   * Maintain more significant digits in cells of "General" format

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index f9df9e0..1ee7f28 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /**
  * ******************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and    *
+ * Copyright (C) 2005-2016, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  * ******************************************************************************
  */
@@ -9,30 +11,37 @@ package org.apache.tika.parser.txt;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.List;
 
 
 /**
+ * NOTE: This was copied from ICU4J with two modifications:
+ * Apache Tika added the EBCDIC-500 family of detectors, and
+ * we increased the buffer to 12000 bytes.
+ *
+ * <p>
+ *
  * <code>CharsetDetector</code> provides a facility for detecting the
  * charset or encoding of character data in an unknown format.
  * The input data can either be from an input stream or an array of bytes.
  * The result of the detection operation is a list of possibly matching
  * charsets, or, for simple use, you can just ask for a Java Reader that
  * will will work over the input data.
- * <p/>
+ * <p>
  * Character set detection is at best an imprecise operation.  The detection
  * process will attempt to identify the charset that best matches the characteristics
  * of the byte data, but the process is partly statistical in nature, and
  * the results can not be guaranteed to always be correct.
- * <p/>
+ * <p>
  * For best accuracy in charset detection, the input data should be primarily
  * in a single language, and a minimum of a few hundred bytes worth of plain text
  * in the language are needed.  The detection process will attempt to
  * ignore html or xml style markup that could otherwise obscure the content.
- * <p/>
+ * <p>
+ *
  * @stable ICU 3.4
  */
 public class CharsetDetector {
@@ -47,13 +56,58 @@ public class CharsetDetector {
 //   actually choose the "real" charset.  All assuming that the application just
 //   wants the data, and doesn't care about a char set name.
 
-    private static final int kBufSize = 12000;
-    private static final int MAX_CONFIDENCE = 100;
-    private static String[] fCharsetNames;
+    private static final int kBufSize = 12000;//legacy value; more recent value is 8000
     /*
      * List of recognizers for all charsets known to the implementation.
      */
-    private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+    private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;
+
+    static {
+        List<CSRecognizerInfo> list = new ArrayList<>();
+
+        list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));
+
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));
+
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));
+
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl(), true));
+
+        // IBM 420/424 recognizers are disabled by default
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));
+
+        ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
+    }
+
     /*
      *  The following items are accessed by individual CharsetRecongizers during
      *     the recognition process
@@ -61,26 +115,27 @@ public class CharsetDetector {
      */
     byte[] fInputBytes =       // The text to be checked.  Markup will have been
             new byte[kBufSize];  //   removed if appropriate.
-    int fInputLen;          // Length of the byte data in fInputText.
+    int fInputLen;          // Length of the byte data in fInputBytes.
     short fByteStats[] =      // byte frequency statistics for the input text.
             new short[256];  //   Value is percent, not absolute.
     boolean fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
             false;
     String fDeclaredEncoding;
-    //
-    //  Stuff private to CharsetDetector
-    //
     byte[] fRawInput;     // Original, untouched input bytes.
     //  If user gave us a byte array, this is it.
     //  If user gave us a stream, it's read to a
     //  buffer here.
     int fRawLength;    // Length of data in fRawInput array.
     InputStream fInputStream;  // User's input stream, or null if the user
-    boolean fStripTags =   // If true, setText() will strip tags from input text.
+    //
+    //  Stuff private to CharsetDetector
+    //
+    private boolean fStripTags =   // If true, setText() will strip tags from input text.
             false;
+    private boolean[] fEnabledRecognizers;   // If not null, active set of charset recognizers had
 
     /**
-     *   Constructor
+     * Constructor
      *
      * @stable ICU 3.4
      */
@@ -88,149 +143,73 @@ public class CharsetDetector {
     }
 
     /**
-     * Get the names of all char sets that can be recognized by the char set detector.
-     *
-     * @return an array of the names of all charsets that can be recognized
-     * by the charset detector.
+     * Get the names of all charsets supported by <code>CharsetDetector</code> class.
+     * <p>
+     * <b>Note:</b> Multiple different charset encodings in a same family may use
+     * a single shared name in this implementation. For example, this method returns
+     * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
+     * (Windows Latin 1). However, actual detection result could be "windows-1252"
+     * when the input data matches Latin 1 code points with any points only available
+     * in "windows-1252".
      *
+     * @return an array of the names of all charsets supported by
+     * <code>CharsetDetector</code> class.
      * @stable ICU 3.4
      */
     public static String[] getAllDetectableCharsets() {
-        return fCharsetNames;
-    }
-
-    /*
-     * Create the singleton instances of the CharsetRecognizer classes
-     */
-    private static ArrayList<CharsetRecognizer> createRecognizers() {
-        ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-
-        recognizers.add(new CharsetRecog_UTF8());
-
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
-        // Create an array of all charset names, as a side effect.
-        // Needed for the getAllDetectableCharsets() API.
-        String[] charsetNames = new String[recognizers.size()];
-        int out = 0;
-
-        for (CharsetRecognizer recognizer : recognizers) {
-            String name = recognizer.getName();
-
-            if (out == 0 || !name.equals(charsetNames[out - 1])) {
-                charsetNames[out++] = name;
-            }
+        String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
+        for (int i = 0; i < allCharsetNames.length; i++) {
+            allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
         }
-
-        fCharsetNames = new String[out];
-        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-
-        return recognizers;
+        return allCharsetNames;
     }
 
     /**
      * Set the declared encoding for charset detection.
-     *  The declared encoding of an input text is an encoding obtained
-     *  from an http header or xml declaration or similar source that
-     *  can be provided as additional information to the charset detector.
-     *  A match between a declared encoding and a possible detected encoding
-     *  will raise the quality of that detected encoding by a small delta,
-     *  and will also appear as a "reason" for the match.
-     * <p/>
+     * The declared encoding of an input text is an encoding obtained
+     * from an http header or xml declaration or similar source that
+     * can be provided as additional information to the charset detector.
+     * A match between a declared encoding and a possible detected encoding
+     * will raise the quality of that detected encoding by a small delta,
+     * and will also appear as a "reason" for the match.
+     * <p>
      * A declared encoding that is incompatible with the input data being
      * analyzed will not be added to the list of possible encodings.
      *
-     *  @param encoding The declared encoding
-     *
+     * @param encoding The declared encoding
      * @stable ICU 3.4
      */
     public CharsetDetector setDeclaredEncoding(String encoding) {
-        setCanonicalDeclaredEncoding(encoding);
+        fDeclaredEncoding = encoding;
         return this;
     }
+    //   Value is rounded up, so zero really means zero occurences.
 
     /**
      * Set the input text (byte) data whose charset is to be detected.
      *
      * @param in the input text of unknown encoding
-     *
      * @return This CharsetDetector
-     *
      * @stable ICU 3.4
      */
     public CharsetDetector setText(byte[] in) {
         fRawInput = in;
         fRawLength = in.length;
 
-        MungeInput();
-
         return this;
     }
-    //   Value is rounded up, so zero really means zero occurences.
 
     /**
      * Set the input text (byte) data whose charset is to be detected.
-     *  <p/>
-     *   The input stream that supplies the character data must have markSupported()
-     *   == true; the charset detection process will read a small amount of data,
-     *   then return the stream to its original position via
-     *   the InputStream.reset() operation.  The exact amount that will
-     *   be read depends on the characteristics of the data itself.
+     * <p>
+     * The input stream that supplies the character data must have markSupported()
+     * == true; the charset detection process will read a small amount of data,
+     * then return the stream to its original position via
+     * the InputStream.reset() operation.  The exact amount that will
+     * be read depends on the characteristics of the data itself.
      *
      * @param in the input text of unknown encoding
-     *
      * @return This CharsetDetector
-     *
      * @stable ICU 3.4
      */
 
@@ -259,21 +238,20 @@ public class CharsetDetector {
 
     /**
      * Return the charset that best matches the supplied input data.
-     *
+     * <p>
      * Note though, that because the detection
      * only looks at the start of the input data,
      * there is a possibility that the returned charset will fail to handle
      * the full set of input data.
-     * <p/>
+     * <p>
      * Raise an exception if
-     *  <ul>
-     *    <li>no charset appears to match the data.</li>
-     *    <li>no input text has been provided</li>
-     *  </ul>
+     * <ul>
+     * <li>no charset appears to match the data.</li>
+     * <li>no input text has been provided</li>
+     * </ul>
      *
      * @return a CharsetMatch object representing the best matching charset, or
-     *         <code>null</code> if there are no matches.
-     *
+     * <code>null</code> if there are no matches.
      * @stable ICU 3.4
      */
     public CharsetMatch detect() {
@@ -291,48 +269,36 @@ public class CharsetDetector {
     }
 
     /**
-     *  Return an array of all charsets that appear to be plausible
-     *  matches with the input data.  The array is ordered with the
-     *  best quality match first.
-     * <p/>
+     * Return an array of all charsets that appear to be plausible
+     * matches with the input data.  The array is ordered with the
+     * best quality match first.
+     * <p>
      * Raise an exception if
-     *  <ul>
-     *    <li>no charsets appear to match the input data.</li>
-     *    <li>no input text has been provided</li>
-     *  </ul>
+     * <ul>
+     * <li>no charsets appear to match the input data.</li>
+     * <li>no input text has been provided</li>
+     * </ul>
      *
      * @return An array of CharsetMatch objects representing possibly matching charsets.
-     *
      * @stable ICU 3.4
      */
     public CharsetMatch[] detectAll() {
-        CharsetRecognizer csr;
-        int i;
-        int detectResults;
-        int confidence;
-        ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
+        ArrayList<CharsetMatch> matches = new ArrayList<>();
+
+        MungeInput();  // Strip html markup, collect byte stats.
 
         //  Iterate over all possible charsets, remember all that
         //    give a match quality > 0.
-        for (i = 0; i < fCSRecognizers.size(); i++) {
-            csr = fCSRecognizers.get(i);
-            detectResults = csr.match(this);
-            confidence = detectResults & 0x000000ff;
-            if (confidence > 0) {
-                // Just to be safe, constrain
-                confidence = Math.min(confidence, MAX_CONFIDENCE);
-
-                // Apply charset hint.
-                if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
-                    // Reduce lack of confidence (delta between "sure" and current) by 50%.
-                    confidence += (MAX_CONFIDENCE - confidence) / 2;
+        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+            boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
+            if (active) {
+                CharsetMatch m = rcinfo.recognizer.match(this);
+                if (m != null) {
+                    matches.add(m);
                 }
-
-                CharsetMatch m = new CharsetMatch(this, csr, confidence);
-                matches.add(m);
             }
         }
-
         Collections.sort(matches);      // CharsetMatch compares on confidence
         Collections.reverse(matches);   //  Put best match first.
         CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
@@ -343,27 +309,25 @@ public class CharsetDetector {
     /**
      * Autodetect the charset of an inputStream, and return a Java Reader
      * to access the converted input data.
-     * <p/>
+     * <p>
      * This is a convenience method that is equivalent to
-     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
-     * <p/>
-     *   For the input stream that supplies the character data, markSupported()
-     *   must be true; the  charset detection will read a small amount of data,
-     *   then return the stream to its original position via
-     *   the InputStream.reset() operation.  The exact amount that will
-     *    be read depends on the characteristics of the data itself.
-     *<p/>
+     * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+     * <p>
+     * For the input stream that supplies the character data, markSupported()
+     * must be true; the  charset detection will read a small amount of data,
+     * then return the stream to its original position via
+     * the InputStream.reset() operation.  The exact amount that will
+     * be read depends on the characteristics of the data itself.
+     * <p>
      * Raise an exception if no charsets appear to match the input data.
      *
-     * @param in The source of the byte data in the unknown charset.
-     *
-     * @param declaredEncoding  A declared encoding for the data, if available,
-     *           or null or an empty string if none is available.
-     *
+     * @param in               The source of the byte data in the unknown charset.
+     * @param declaredEncoding A declared encoding for the data, if available,
+     *                         or null or an empty string if none is available.
      * @stable ICU 3.4
      */
     public Reader getReader(InputStream in, String declaredEncoding) {
-        setCanonicalDeclaredEncoding(declaredEncoding);
+        fDeclaredEncoding = declaredEncoding;
 
         try {
             setText(in);
@@ -379,25 +343,24 @@ public class CharsetDetector {
             return null;
         }
     }
+    //   gave us a byte array.
 
     /**
      * Autodetect the charset of an inputStream, and return a String
      * containing the converted input data.
-     * <p/>
+     * <p>
      * This is a convenience method that is equivalent to
-     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
-     *<p/>
+     * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+     * <p>
      * Raise an exception if no charsets appear to match the input data.
      *
-     * @param in The source of the byte data in the unknown charset.
-     *
-     * @param declaredEncoding  A declared encoding for the data, if available,
-     *           or null or an empty string if none is available.
-     *
+     * @param in               The source of the byte data in the unknown charset.
+     * @param declaredEncoding A declared encoding for the data, if available,
+     *                         or null or an empty string if none is available.
      * @stable ICU 3.4
      */
     public String getString(byte[] in, String declaredEncoding) {
-        setCanonicalDeclaredEncoding(declaredEncoding);
+        fDeclaredEncoding = declaredEncoding;
 
         try {
             setText(in);
@@ -413,30 +376,27 @@ public class CharsetDetector {
             return null;
         }
     }
-    //   gave us a byte array.
 
     /**
      * Test whether or not input filtering is enabled.
      *
      * @return <code>true</code> if input text will be filtered.
-     *
-     * @see #enableInputFilter
-     *
      * @stable ICU 3.4
+     * @see #enableInputFilter
      */
     public boolean inputFilterEnabled() {
         return fStripTags;
     }
+    // been changed from the default. The array index is
+    // corresponding to ALL_RECOGNIZER. See setDetectableCharset().
 
     /**
      * Enable filtering of input text. If filtering is enabled,
-     * text within angle brackets ("<" and ">") will be removed
+     * text within angle brackets ("&lt;" and "&gt;") will be removed
      * before detection.
      *
      * @param filter <code>true</code> to enable input text filtering.
-     *
      * @return The previous setting.
-     *
      * @stable ICU 3.4
      */
     public boolean enableInputFilter(boolean filter) {
@@ -447,22 +407,6 @@ public class CharsetDetector {
         return previous;
     }
 
-    /**
-     * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
-     *
-     * @param encoding - name of character encoding
-     */
-    private void setCanonicalDeclaredEncoding(String encoding) {
-        if ((encoding == null) || encoding.isEmpty()) {
-            return;
-        }
-
-        Charset cs = Charset.forName(encoding);
-        if (cs != null) {
-            fDeclaredEncoding = cs.name();
-        }
-    }
-
     /*
      *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
      *               it by removing what appears to be html markup.
@@ -541,4 +485,83 @@ public class CharsetDetector {
             }
         }
     }
-}
+
+    /**
+     * Get the names of charsets that can be recognized by this CharsetDetector instance.
+     *
+     * @return an array of the names of charsets that can be recognized by this CharsetDetector
+     * instance.
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public String[] getDetectableCharsets() {
+        List<String> csnames = new ArrayList<>(ALL_CS_RECOGNIZERS.size());
+        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+            boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
+            if (active) {
+                csnames.add(rcinfo.recognizer.getName());
+            }
+        }
+        return csnames.toArray(new String[csnames.size()]);
+    }
+
+    /**
+     * Enable or disable individual charset encoding.
+     * A name of charset encoding must be included in the names returned by
+     * {@link #getAllDetectableCharsets()}.
+     *
+     * @param encoding the name of charset encoding.
+     * @param enabled  <code>true</code> to enable, or <code>false</code> to disable the
+     *                 charset encoding.
+     * @return A reference to this <code>CharsetDetector</code>.
+     * @throws IllegalArgumentException when the name of charset encoding is
+     *                                  not supported.
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
+        int modIdx = -1;
+        boolean isDefaultVal = false;
+        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+            CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
+            if (csrinfo.recognizer.getName().equals(encoding)) {
+                modIdx = i;
+                isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
+                break;
+            }
+        }
+        if (modIdx < 0) {
+            // No matching encoding found
+            throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
+        }
+
+        if (fEnabledRecognizers == null && !isDefaultVal) {
+            // Create an array storing the non default setting
+            fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
+
+            // Initialize the array with default info
+            for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+                fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
+            }
+        }
+
+        if (fEnabledRecognizers != null) {
+            fEnabledRecognizers[modIdx] = enabled;
+        }
+
+        return this;
+    }
+
+    private static class CSRecognizerInfo {
+        CharsetRecognizer recognizer;
+        boolean isDefaultEnabled;
+
+        CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
+            this.recognizer = recognizer;
+            this.isDefaultEnabled = isDefaultEnabled;
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 22219ab..40a10ce 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /**
  * ******************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and    *
+ * Copyright (C) 2005-2016, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  * ******************************************************************************
  */
@@ -18,63 +20,56 @@ import java.io.Reader;
  * as a possible encoding for a set of input data.  From an instance of this
  * class, you can ask for a confidence level in the charset identification,
  * or for Java Reader or String to access the original byte data in Unicode form.
- * <p/>
+ * <p>
  * Instances of this class are created only by CharsetDetectors.
- * <p/>
+ * <p>
  * Note:  this class has a natural ordering that is inconsistent with equals.
- *        The natural ordering is based on the match confidence value.
+ * The natural ordering is based on the match confidence value.
  *
  * @stable ICU 3.4
  */
 public class CharsetMatch implements Comparable<CharsetMatch> {
 
 
-    /**
-     * Bit flag indicating the match is based on the the encoding scheme.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int ENCODING_SCHEME = 1;
-    /**
-     * Bit flag indicating the match is based on the presence of a BOM.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int BOM = 2;
-    /**
-     * Bit flag indicating he match is based on the declared encoding.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int DECLARED_ENCODING = 4;
-    /**
-     * Bit flag indicating the match is based on language statistics.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int LANG_STATISTICS = 8;
     //
     //   Private Data
     //
     private int fConfidence;
-    private CharsetRecognizer fRecognizer;
     private byte[] fRawInput = null;     // Original, untouched input bytes.
     //  If user gave us a byte array, this is it.
     private int fRawLength;           // Length of data in fRawInput array.
     private InputStream fInputStream = null;  // User's input stream, or null if the user
+    private String fCharsetName;         // The name of the charset this CharsetMatch
+    //   represents.  Filled in by the recognizer.
+    private String fLang;                // The language, if one was determined by
 
     /*
      *  Constructor.  Implementation internal
      */
     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
-        fRecognizer = rec;
         fConfidence = conf;
 
-        // The references to the original aplication input data must be copied out
+        // The references to the original application input data must be copied out
+        //   of the charset recognizer to here, in case the application resets the
+        //   recognizer before using this CharsetMatch.
+        if (det.fInputStream == null) {
+            // We only want the existing input byte data if it came straight from the user,
+            //   not if is just the head of a stream.
+            fRawInput = det.fRawInput;
+            fRawLength = det.fRawLength;
+        }
+        fInputStream = det.fInputStream;
+        fCharsetName = rec.getName();
+        fLang = rec.getLanguage();
+    }
+
+    /*
+     *  Constructor.  Implementation internal
+     */
+    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
+        fConfidence = conf;
+
+        // The references to the original application input data must be copied out
         //   of the charset recognizer to here, in case the application resets the
         //   recognizer before using this CharsetMatch.
         if (det.fInputStream == null) {
@@ -84,19 +79,20 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
             fRawLength = det.fRawLength;
         }
         fInputStream = det.fInputStream;
+        fCharsetName = csName;
+        fLang = lang;
     }
 
     /**
      * Create a java.io.Reader for reading the Unicode character data corresponding
      * to the original byte data supplied to the Charset detect operation.
-     * <p/>
+     * <p>
      * CAUTION:  if the source of the byte data was an InputStream, a Reader
      * can be created for only one matching char set using this method.  If more
      * than one charset needs to be tried, the caller will need to reset
      * the InputStream and create InputStreamReaders itself, based on the charset name.
      *
      * @return the Reader for the Unicode character data.
-     *
      * @stable ICU 3.4
      */
     public Reader getReader() {
@@ -119,10 +115,9 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
      * to the original byte data supplied to the Charset detect operation.
      *
      * @return a String created from the converted input data.
-     *
      * @stable ICU 3.4
      */
-    public String getString() throws java.io.IOException {
+    public String getString() throws IOException {
         return getString(-1);
 
     }
@@ -138,13 +133,12 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
      *                  source of the data is an input stream, or -1 for
      *                  unlimited length.
      * @return a String created from the converted input data.
-     *
      * @stable ICU 3.4
      */
-    public String getString(int maxLength) throws java.io.IOException {
+    public String getString(int maxLength) throws IOException {
         String result = null;
         if (fInputStream != null) {
-            StringBuffer sb = new StringBuffer();
+            StringBuilder sb = new StringBuilder();
             char[] buffer = new char[1024];
             Reader reader = getReader();
             int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
@@ -159,7 +153,17 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
 
             return sb.toString();
         } else {
-            result = new String(fRawInput, getName());
+            String name = getName();
+            /*
+             * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
+             * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr'
+             * should be stripped off before creating the string.
+             */
+            int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
+            if (startSuffix > 0) {
+                name = name.substring(0, startSuffix);
+            }
+            result = new String(fRawInput, name);
         }
         return result;
 
@@ -172,7 +176,6 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
      * charset.
      *
      * @return the confidence in the charset match
-     *
      * @stable ICU 3.4
      */
     public int getConfidence() {
@@ -180,26 +183,6 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
     }
 
     /**
-     * Return flags indicating what it was about the input data
-     * that caused this charset to be considered as a possible match.
-     * The result is a bitfield containing zero or more of the flags
-     * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
-     * A result of zero means no information is available.
-     * <p>
-     * Note: currently, this method always returns zero.
-     * <p>
-     *
-     * @return the type of match found for this charset.
-     *
-     * @draft ICU 3.4
-     * @provisional This API might change or be removed in a future release.
-     */
-    public int getMatchType() {
-//      TODO: create a list of enum-like constants for common combinations of types of matches.
-        return 0;
-    }
-
-    /**
      * Get the name of the detected charset.
      * The name will be one that can be used with other APIs on the
      * platform that accept charset names.  It is the "Canonical name"
@@ -207,40 +190,38 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
      * charsets that are registered with the IANA charset registry,
      * this is the MIME-preferred registerd name.
      *
-     * @see java.nio.charset.Charset
-     * @see java.io.InputStreamReader
-     *
      * @return The name of the charset.
-     *
      * @stable ICU 3.4
+     * @see java.nio.charset.Charset
+     * @see InputStreamReader
      */
     public String getName() {
-        return fRecognizer.getName();
+        return fCharsetName;
     }
+    //   gave us a byte array.
 
     /**
      * Get the ISO code for the language of the detected charset.
      *
      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
-     *
      * @stable ICU 3.4
      */
     public String getLanguage() {
-        return fRecognizer.getLanguage();
+        return fLang;
     }
 
     /**
      * Compare to other CharsetMatch objects.
      * Comparison is based on the match confidence value, which
-     *   allows CharsetDetector.detectAll() to order its results.
+     * allows CharsetDetector.detectAll() to order its results.
      *
-     * @param o the CharsetMatch object to compare against.
+     * @param other the CharsetMatch object to compare against.
      * @return a negative integer, zero, or a positive integer as the
-     *          confidence level of this CharsetMatch
-     *          is less than, equal to, or greater than that of
-     *          the argument.
+     * confidence level of this CharsetMatch
+     * is less than, equal to, or greater than that of
+     * the argument.
      * @throws ClassCastException if the argument is not a CharsetMatch.
-     * @stable ICU 3.4
+     * @stable ICU 4.4
      */
     public int compareTo(CharsetMatch other) {
         int compareResult = 0;
@@ -251,36 +232,5 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
         }
         return compareResult;
     }
-
-    /**
-     * compare this CharsetMatch to another based on confidence value
-     * @param o the CharsetMatch object to compare against
-     * @return true if equal
-     */
-    public boolean equals(Object o) {
-        if (o instanceof CharsetMatch) {
-            CharsetMatch that = (CharsetMatch) o;
-            return (this.fConfidence == that.fConfidence);
-        }
-
-        return false;
-    }
-
-    /**
-     * generates a hashCode based on the confidence value
-     * @return the hashCode
-     */
-    public int hashCode() {
-        return fConfidence;
-    }
-    //   gave us a byte array.
-
-    public String toString() {
-        String s = "Match of " + fRecognizer.getName();
-        if (fRecognizer.getLanguage() != null) {
-            s += " in " + fRecognizer.getLanguage();
-        }
-        s += " with confidence " + fConfidence;
-        return s;
-    }
-}
+    //   the recognizer during the detect operation.
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
index 129c9a8..d4805be 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /*
 *******************************************************************************
-* Copyright (C) 2005 - 2008, International Business Machines Corporation and  *
+* Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
@@ -11,10 +13,8 @@ package org.apache.tika.parser.txt;
  * This is a superclass for the individual detectors for
  * each of the detectable members of the ISO 2022 family
  * of encodings.
- * <p/>
+ * <p>
  * The separate classes are nested within this class.
- *
- * @internal
  */
 abstract class CharsetRecog_2022 extends CharsetRecognizer {
 
@@ -74,7 +74,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
 
         //
         // Initial quality is based on relative proportion of recongized vs.
-        //   unrecognized escape sequences. 
+        //   unrecognized escape sequences.
         //   All good:  quality = 100;
         //   half or less good: quality = 0;
         //   linear inbetween.
@@ -114,8 +114,9 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
             return "ISO-2022-JP";
         }
 
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -128,10 +129,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
             return "ISO-2022-KR";
         }
 
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
-
     }
 
     static class CharsetRecog_2022CN extends CharsetRecog_2022 {
@@ -153,11 +154,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
             return "ISO-2022-CN";
         }
 
-
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
 }
-

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
index 55a3957..a5100bc 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /**
  * ******************************************************************************
- * Copyright (C) 2005 - 2007, International Business Machines Corporation and  *
+ * Copyright (C) 2005 - 2014, International Business Machines Corporation and  *
  * others. All Rights Reserved.                                                *
  * ******************************************************************************
  */
@@ -8,8 +10,6 @@ package org.apache.tika.parser.txt;
 
 /**
  * Charset recognizer for UTF-8
- *
- * @internal
  */
 class CharsetRecog_UTF8 extends CharsetRecognizer {
 
@@ -20,7 +20,7 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
     /* (non-Javadoc)
      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
      */
-    int match(CharsetDetector det) {
+    CharsetMatch match(CharsetDetector det) {
         boolean hasBOM = false;
         int numValid = 0;
         int numInvalid = 0;
@@ -50,10 +50,7 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
                 trailBytes = 3;
             } else {
                 numInvalid++;
-                if (numInvalid > 5) {
-                    break;
-                }
-                trailBytes = 0;
+                continue;
             }
 
             // Verify that we've got the right number of trail bytes in the sequence
@@ -72,7 +69,6 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
                     break;
                 }
             }
-
         }
 
         // Cook up some sort of confidence score, based on presense of a BOM
@@ -87,13 +83,15 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
         } else if (numValid > 0 && numInvalid == 0) {
             confidence = 80;
         } else if (numValid == 0 && numInvalid == 0) {
-            // Plain ASCII.  
-            confidence = 10;
+            // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
+            //              accepts ASCII with confidence = 10.
+            // TODO: add plain ASCII as an explicitly detected type.
+            confidence = 15;
         } else if (numValid > numInvalid * 10) {
             // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
             confidence = 25;
         }
-        return confidence;
+        return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
     }
 
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
index be6455f..a92acc1 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
@@ -1,20 +1,44 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /*
  *******************************************************************************
- * Copyright (C) 1996-2007, International Business Machines Corporation and    *
+ * Copyright (C) 1996-2013, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  *
  */
+
 package org.apache.tika.parser.txt;
 
 /**
  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
  * BOM will be used if it is present.
- *
- * @internal
  */
 abstract class CharsetRecog_Unicode extends CharsetRecognizer {
 
+    static int codeUnit16FromBytes(byte hi, byte lo) {
+        return ((hi & 0xff) << 8) | (lo & 0xff);
+    }
+
+    // UTF-16 confidence calculation. Very simple minded, but better than nothing.
+    //   Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
+    //     and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
+    //   NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
+    //   NULs should be rare in actual text.
+    static int adjustConfidence(int codeUnit, int confidence) {
+        if (codeUnit == 0) {
+            confidence -= 10;
+        } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
+            confidence += 10;
+        }
+        if (confidence < 0) {
+            confidence = 0;
+        } else if (confidence > 100) {
+            confidence = 100;
+        }
+        return confidence;
+    }
+
     /* (non-Javadoc)
      * @see com.ibm.icu.text.CharsetRecognizer#getName()
      */
@@ -23,22 +47,36 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
     /* (non-Javadoc)
      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
      */
-    abstract int match(CharsetDetector det);
+    abstract CharsetMatch match(CharsetDetector det);
 
     static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
         String getName() {
             return "UTF-16BE";
         }
 
-        int match(CharsetDetector det) {
+        CharsetMatch match(CharsetDetector det) {
             byte[] input = det.fRawInput;
-
-            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
-                return 100;
+            int confidence = 10;
+
+            int bytesToCheck = Math.min(input.length, 30);
+            for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) {
+                int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
+                if (charIndex == 0 && codeUnit == 0xFEFF) {
+                    confidence = 100;
+                    break;
+                }
+                confidence = adjustConfidence(codeUnit, confidence);
+                if (confidence == 0 || confidence == 100) {
+                    break;
+                }
             }
-
-            // TODO: Do some statistics to check for unsigned UTF-16BE
-            return 0;
+            if (bytesToCheck < 4 && confidence < 100) {
+                confidence = 0;
+            }
+            if (confidence > 0) {
+                return new CharsetMatch(det, this, confidence);
+            }
+            return null;
         }
     }
 
@@ -47,20 +85,29 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
             return "UTF-16LE";
         }
 
-        int match(CharsetDetector det) {
+        CharsetMatch match(CharsetDetector det) {
             byte[] input = det.fRawInput;
-
-            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) {
-                // An LE BOM is present.
-                if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
-                    // It is probably UTF-32 LE, not UTF-16
-                    return 0;
+            int confidence = 10;
+
+            int bytesToCheck = Math.min(input.length, 30);
+            for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) {
+                int codeUnit = codeUnit16FromBytes(input[charIndex + 1], input[charIndex]);
+                if (charIndex == 0 && codeUnit == 0xFEFF) {
+                    confidence = 100;
+                    break;
+                }
+                confidence = adjustConfidence(codeUnit, confidence);
+                if (confidence == 0 || confidence == 100) {
+                    break;
                 }
-                return 100;
             }
-
-            // TODO: Do some statistics to check for unsigned UTF-16LE
-            return 0;
+            if (bytesToCheck < 4 && confidence < 100) {
+                confidence = 0;
+            }
+            if (confidence > 0) {
+                return new CharsetMatch(det, this, confidence);
+            }
+            return null;
         }
     }
 
@@ -69,7 +116,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
 
         abstract String getName();
 
-        int match(CharsetDetector det) {
+        CharsetMatch match(CharsetDetector det) {
             byte[] input = det.fRawInput;
             int limit = (det.fRawLength / 4) * 4;
             int numValid = 0;
@@ -78,7 +125,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
             int confidence = 0;
 
             if (limit == 0) {
-                return 0;
+                return null;
             }
             if (getChar(input, 0) == 0x0000FEFF) {
                 hasBOM = true;
@@ -110,7 +157,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
                 confidence = 25;
             }
 
-            return confidence;
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -136,4 +183,4 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
             return "UTF-32LE";
         }
     }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
index 35d2b4f..3c38cd0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /*
  ****************************************************************************
- * Copyright (C) 2005-2008, International Business Machines Corporation and *
+ * Copyright (C) 2005-2012, International Business Machines Corporation and *
  * others. All Rights Reserved.                                             *
  ****************************************************************************
  *
@@ -20,8 +22,6 @@ import java.util.Arrays;
  * CharsetDetector class and kept in the global list of available
  * encodings to be checked.  The specific encoding being recognized
  * is determined by subclass.
- *
- * @internal
  */
 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
 
@@ -46,7 +46,8 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
      * bits 8-15: The match reason, an enum-like value.
      */
     int match(CharsetDetector det, int[] commonChars) {
-        int singleByteCharCount = 0;
+        @SuppressWarnings("unused")
+        int singleByteCharCount = 0;  //TODO Do we really need this?
         int doubleByteCharCount = 0;
         int commonCharCount = 0;
         int badCharCount = 0;
@@ -132,7 +133,7 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
      * Get the next character (however many bytes it is) from the input data
      * Subclasses for specific charset encodings must implement this function
      * to get characters according to the rules of their encoding scheme.
-     * <p/>
+     * <p>
      * This function is not a method of class iteratedChar only because
      * that would require a lot of extra derived classes, which is awkward.
      *
@@ -156,14 +157,12 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
     //
     static class iteratedChar {
         int charValue = 0;             // 1-4 bytes from the raw input data
-        int index = 0;
         int nextIndex = 0;
         boolean error = false;
         boolean done = false;
 
         void reset() {
             charValue = 0;
-            index = -1;
             nextIndex = 0;
             error = false;
             done = false;
@@ -195,7 +194,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                         0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
 
         boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
             it.error = false;
             int firstByte;
             firstByte = it.charValue = it.nextByte(det);
@@ -219,8 +217,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
             return true;
         }
 
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, commonChars);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
 
         String getName() {
@@ -255,7 +254,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                         0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
 
         boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
             it.error = false;
             int firstByte;
             firstByte = it.charValue = it.nextByte(det);
@@ -282,8 +280,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
             return true;
         }
 
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, commonChars);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
 
         String getName() {
@@ -311,7 +310,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
          *     packed into an int.
          */
         boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
             it.error = false;
             int firstByte = 0;
             int secondByte = 0;
@@ -392,8 +390,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                 return "EUC-JP";
             }
 
-            int match(CharsetDetector det) {
-                return match(det, commonChars);
+            CharsetMatch match(CharsetDetector det) {
+                int confidence = match(det, commonChars);
+                return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
             }
 
             public String getLanguage() {
@@ -425,8 +424,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                 return "EUC-KR";
             }
 
-            int match(CharsetDetector det) {
-                return match(det, commonChars);
+            CharsetMatch match(CharsetDetector det) {
+                int confidence = match(det, commonChars);
+                return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
             }
 
             public String getLanguage() {
@@ -462,7 +462,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
          *     packed into an int.
          */
         boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
             it.error = false;
             int firstByte = 0;
             int secondByte = 0;
@@ -519,8 +518,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
             return "GB18030";
         }
 
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, commonChars);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
 
         public String getLanguage() {
@@ -529,4 +529,4 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
     }
 
 
-}
+}
\ No newline at end of file