You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/27 00:41:17 UTC

[1/3] tika git commit: TIKA-2041, upgrade ICU4j's charset detector to avoid multithreading bug.

Repository: tika
Updated Branches:
  refs/heads/2.x f89887d2f -> 9f6c71fa6


http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 3adaeee..5e8bddc 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -33,15 +33,29 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
 import java.io.Writer;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletionService;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.regex.Pattern;
 
 import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Geographic;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -1128,4 +1142,102 @@ public class HtmlParserTest extends TikaTest {
         XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata());
         assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", r.xml);
     }
+
+    @Test
+    public void testMultiThreadingEncodingDetection() throws Exception {
+        List<EncodingDetector> detectors = new ArrayList<>();
+        ServiceLoader loader =
+                new ServiceLoader(AutoDetectReader.class.getClassLoader());
+        detectors.addAll(loader.loadServiceProviders(EncodingDetector.class));
+        for (EncodingDetector detector : detectors) {
+            testDetector(detector);
+        }
+    }
+
+    private void testDetector(EncodingDetector detector) throws Exception {
+        Map<String, String> encodings = new ConcurrentHashMap<>();
+        List<String> tmpPaths = new ArrayList<>();
+        for (String p : new String[] {
+                "testHTML.html",
+                "testHTML_utf8.html",
+                "russian.cp866.txt",
+                "resume.html",
+                "test.html"
+
+        }) {
+            String testDocPath = "/test-documents/"+p;
+            String encoding = getEncoding(detector, testDocPath);
+            encodings.put(testDocPath, encoding);
+            //add 5 copies to add more chances for failure
+            for (int i = 0; i < 5; i++) {
+                tmpPaths.add(testDocPath);
+            }
+        }
+
+        Collections.shuffle(tmpPaths);
+
+        ArrayBlockingQueue<String> paths = new ArrayBlockingQueue<>(tmpPaths.size());
+        paths.addAll(tmpPaths);
+        int numThreads = paths.size()+1;
+        ExecutorService ex = Executors.newFixedThreadPool(numThreads);
+        CompletionService<String> completionService =
+                new ExecutorCompletionService<>(ex);
+
+        for (int i = 0; i < numThreads; i++) {
+            completionService.submit(new EncodingDetectorRunner(paths, encodings, detector));
+        }
+        int completed = 0;
+        while (completed < numThreads) {
+            Future<String> future = completionService.take();
+
+            if (future.isDone() &&
+                    //will trigger ExecutionException if an IOException
+                    //was thrown during call
+                    EncodingDetectorRunner.DONE.equals(future.get())) {
+                completed++;
+            }
+        }
+    }
+
+    private class EncodingDetectorRunner implements Callable<String> {
+
+        final static String DONE = "done";
+        private final ArrayBlockingQueue<String> paths;
+        private final Map<String, String> encodings;
+        private final EncodingDetector detector;
+        private EncodingDetectorRunner(ArrayBlockingQueue<String> paths,
+                                       Map<String, String> encodings, EncodingDetector detector) {
+            this.paths = paths;
+            this.encodings = encodings;
+            this.detector = detector;
+        }
+
+        @Override
+        public String call() throws IOException {
+            for (int i = 0; i < encodings.size(); i++) {
+                String p = paths.poll();
+                if (p == null) {
+                    return DONE;
+                }
+                String detectedEncoding = getEncoding(detector, p);
+                String trueEncoding = encodings.get(p);
+                assertEquals( "detector class="+detector.getClass() + " : file=",
+                        trueEncoding, detectedEncoding);
+
+            }
+            return DONE;
+        }
+    }
+
+    String getEncoding(EncodingDetector detector, String p) throws IOException {
+        try (InputStream is = TikaInputStream.get(getClass().getResourceAsStream(p))) {
+            Charset charset = detector.detect(is, new Metadata());
+            if (charset == null) {
+                return "NULL";
+            } else {
+                return charset.toString();
+            }
+        }
+    }
+
 }

[3/3] tika git commit: TIKA-2041, upgrade ICU4j's charset detector to avoid multithreading bug.

Posted by ta...@apache.org.

TIKA-2041, upgrade ICU4j's charset detector to avoid multithreading bug.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9f6c71fa
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9f6c71fa
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9f6c71fa

Branch: refs/heads/2.x
Commit: 9f6c71fa69eaae558aff85cfa0dce72bca08fd4e
Parents: f89887d
Author: tballison <ta...@mitre.org>
Authored: Tue Jul 26 20:41:10 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jul 26 20:41:10 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../apache/tika/parser/txt/CharsetDetector.java | 437 ++++-----
 .../apache/tika/parser/txt/CharsetMatch.java    | 170 ++--
 .../tika/parser/txt/CharsetRecog_2022.java      |  28 +-
 .../tika/parser/txt/CharsetRecog_UTF8.java      |  24 +-
 .../tika/parser/txt/CharsetRecog_Unicode.java   |  99 +-
 .../tika/parser/txt/CharsetRecog_mbcs.java      |  44 +-
 .../tika/parser/txt/CharsetRecog_sbcs.java      | 903 +++++++++----------
 .../tika/parser/txt/CharsetRecognizer.java      |  31 +-
 .../tika/parser/txt/Icu4jEncodingDetector.java  |   8 +
 .../apache/tika/parser/html/HtmlParserTest.java | 112 +++
 11 files changed, 964 insertions(+), 895 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e5b5050..abfbdec 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Upgrade ICU4J charset detection components to fix multithreading
+    bug (TIKA-2041).
+
   * Upgrade to Jackcess 2.1.4 (TIKA-2039).
 
   * Maintain more significant digits in cells of "General" format

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index f9df9e0..1ee7f28 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -1,6 +1,8 @@
+// � 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /**
  * ******************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and    *
+ * Copyright (C) 2005-2016, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  * ******************************************************************************
  */
@@ -9,30 +11,37 @@ package org.apache.tika.parser.txt;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.List;
 
 
 /**
+ * NOTE: This was copied from ICU4J with two modifications:
+ * Apache Tika added the EBCDIC-500 family of detectors, and
+ * we increased the buffer to 12000 bytes.
+ *
+ * <p>
+ *
  * <code>CharsetDetector</code> provides a facility for detecting the
  * charset or encoding of character data in an unknown format.
  * The input data can either be from an input stream or an array of bytes.
  * The result of the detection operation is a list of possibly matching
  * charsets, or, for simple use, you can just ask for a Java Reader that
  * will will work over the input data.
- * <p/>
+ * <p>
  * Character set detection is at best an imprecise operation.  The detection
  * process will attempt to identify the charset that best matches the characteristics
  * of the byte data, but the process is partly statistical in nature, and
  * the results can not be guaranteed to always be correct.
- * <p/>
+ * <p>
  * For best accuracy in charset detection, the input data should be primarily
  * in a single language, and a minimum of a few hundred bytes worth of plain text
  * in the language are needed.  The detection process will attempt to
  * ignore html or xml style markup that could otherwise obscure the content.
- * <p/>
+ * <p>
+ *
  * @stable ICU 3.4
  */
 public class CharsetDetector {
@@ -47,13 +56,58 @@ public class CharsetDetector {
 //   actually choose the "real" charset.  All assuming that the application just
 //   wants the data, and doesn't care about a char set name.
 
-    private static final int kBufSize = 12000;
-    private static final int MAX_CONFIDENCE = 100;
-    private static String[] fCharsetNames;
+    private static final int kBufSize = 12000;//legacy value; more recent value is 8000
     /*
      * List of recognizers for all charsets known to the implementation.
      */
-    private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+    private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;
+
+    static {
+        List<CSRecognizerInfo> list = new ArrayList<>();
+
+        list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));
+
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));
+
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));
+
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it(), true));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl(), true));
+
+        // IBM 420/424 recognizers are disabled by default
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
+        list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));
+
+        ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
+    }
+
     /*
      *  The following items are accessed by individual CharsetRecongizers during
      *     the recognition process
@@ -61,26 +115,27 @@ public class CharsetDetector {
      */
     byte[] fInputBytes =       // The text to be checked.  Markup will have been
             new byte[kBufSize];  //   removed if appropriate.
-    int fInputLen;          // Length of the byte data in fInputText.
+    int fInputLen;          // Length of the byte data in fInputBytes.
     short fByteStats[] =      // byte frequency statistics for the input text.
             new short[256];  //   Value is percent, not absolute.
     boolean fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
             false;
     String fDeclaredEncoding;
-    //
-    //  Stuff private to CharsetDetector
-    //
     byte[] fRawInput;     // Original, untouched input bytes.
     //  If user gave us a byte array, this is it.
     //  If user gave us a stream, it's read to a
     //  buffer here.
     int fRawLength;    // Length of data in fRawInput array.
     InputStream fInputStream;  // User's input stream, or null if the user
-    boolean fStripTags =   // If true, setText() will strip tags from input text.
+    //
+    //  Stuff private to CharsetDetector
+    //
+    private boolean fStripTags =   // If true, setText() will strip tags from input text.
             false;
+    private boolean[] fEnabledRecognizers;   // If not null, active set of charset recognizers had
 
     /**
-     *   Constructor
+     * Constructor
      *
      * @stable ICU 3.4
      */
@@ -88,149 +143,73 @@ public class CharsetDetector {
     }
 
     /**
-     * Get the names of all char sets that can be recognized by the char set detector.
-     *
-     * @return an array of the names of all charsets that can be recognized
-     * by the charset detector.
+     * Get the names of all charsets supported by <code>CharsetDetector</code> class.
+     * <p>
+     * <b>Note:</b> Multiple different charset encodings in a same family may use
+     * a single shared name in this implementation. For example, this method returns
+     * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
+     * (Windows Latin 1). However, actual detection result could be "windows-1252"
+     * when the input data matches Latin 1 code points with any points only available
+     * in "windows-1252".
      *
+     * @return an array of the names of all charsets supported by
+     * <code>CharsetDetector</code> class.
      * @stable ICU 3.4
      */
     public static String[] getAllDetectableCharsets() {
-        return fCharsetNames;
-    }
-
-    /*
-     * Create the singleton instances of the CharsetRecognizer classes
-     */
-    private static ArrayList<CharsetRecognizer> createRecognizers() {
-        ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-
-        recognizers.add(new CharsetRecog_UTF8());
-
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
-        recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
-        recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
-        recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-
-        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
-        // Create an array of all charset names, as a side effect.
-        // Needed for the getAllDetectableCharsets() API.
-        String[] charsetNames = new String[recognizers.size()];
-        int out = 0;
-
-        for (CharsetRecognizer recognizer : recognizers) {
-            String name = recognizer.getName();
-
-            if (out == 0 || !name.equals(charsetNames[out - 1])) {
-                charsetNames[out++] = name;
-            }
+        String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
+        for (int i = 0; i < allCharsetNames.length; i++) {
+            allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
         }
-
-        fCharsetNames = new String[out];
-        System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-
-        return recognizers;
+        return allCharsetNames;
     }
 
     /**
      * Set the declared encoding for charset detection.
-     *  The declared encoding of an input text is an encoding obtained
-     *  from an http header or xml declaration or similar source that
-     *  can be provided as additional information to the charset detector.
-     *  A match between a declared encoding and a possible detected encoding
-     *  will raise the quality of that detected encoding by a small delta,
-     *  and will also appear as a "reason" for the match.
-     * <p/>
+     * The declared encoding of an input text is an encoding obtained
+     * from an http header or xml declaration or similar source that
+     * can be provided as additional information to the charset detector.
+     * A match between a declared encoding and a possible detected encoding
+     * will raise the quality of that detected encoding by a small delta,
+     * and will also appear as a "reason" for the match.
+     * <p>
      * A declared encoding that is incompatible with the input data being
      * analyzed will not be added to the list of possible encodings.
      *
-     *  @param encoding The declared encoding
-     *
+     * @param encoding The declared encoding
      * @stable ICU 3.4
      */
     public CharsetDetector setDeclaredEncoding(String encoding) {
-        setCanonicalDeclaredEncoding(encoding);
+        fDeclaredEncoding = encoding;
         return this;
     }
+    //   Value is rounded up, so zero really means zero occurences.
 
     /**
      * Set the input text (byte) data whose charset is to be detected.
      *
      * @param in the input text of unknown encoding
-     *
      * @return This CharsetDetector
-     *
      * @stable ICU 3.4
      */
     public CharsetDetector setText(byte[] in) {
         fRawInput = in;
         fRawLength = in.length;
 
-        MungeInput();
-
         return this;
     }
-    //   Value is rounded up, so zero really means zero occurences.
 
     /**
      * Set the input text (byte) data whose charset is to be detected.
-     *  <p/>
-     *   The input stream that supplies the character data must have markSupported()
-     *   == true; the charset detection process will read a small amount of data,
-     *   then return the stream to its original position via
-     *   the InputStream.reset() operation.  The exact amount that will
-     *   be read depends on the characteristics of the data itself.
+     * <p>
+     * The input stream that supplies the character data must have markSupported()
+     * == true; the charset detection process will read a small amount of data,
+     * then return the stream to its original position via
+     * the InputStream.reset() operation.  The exact amount that will
+     * be read depends on the characteristics of the data itself.
      *
      * @param in the input text of unknown encoding
-     *
      * @return This CharsetDetector
-     *
      * @stable ICU 3.4
      */
 
@@ -259,21 +238,20 @@ public class CharsetDetector {
 
     /**
      * Return the charset that best matches the supplied input data.
-     *
+     * <p>
      * Note though, that because the detection
      * only looks at the start of the input data,
      * there is a possibility that the returned charset will fail to handle
      * the full set of input data.
-     * <p/>
+     * <p>
      * Raise an exception if
-     *  <ul>
-     *    <li>no charset appears to match the data.</li>
-     *    <li>no input text has been provided</li>
-     *  </ul>
+     * <ul>
+     * <li>no charset appears to match the data.</li>
+     * <li>no input text has been provided</li>
+     * </ul>
      *
      * @return a CharsetMatch object representing the best matching charset, or
-     *         <code>null</code> if there are no matches.
-     *
+     * <code>null</code> if there are no matches.
      * @stable ICU 3.4
      */
     public CharsetMatch detect() {
@@ -291,48 +269,36 @@ public class CharsetDetector {
     }
 
     /**
-     *  Return an array of all charsets that appear to be plausible
-     *  matches with the input data.  The array is ordered with the
-     *  best quality match first.
-     * <p/>
+     * Return an array of all charsets that appear to be plausible
+     * matches with the input data.  The array is ordered with the
+     * best quality match first.
+     * <p>
      * Raise an exception if
-     *  <ul>
-     *    <li>no charsets appear to match the input data.</li>
-     *    <li>no input text has been provided</li>
-     *  </ul>
+     * <ul>
+     * <li>no charsets appear to match the input data.</li>
+     * <li>no input text has been provided</li>
+     * </ul>
      *
      * @return An array of CharsetMatch objects representing possibly matching charsets.
-     *
      * @stable ICU 3.4
      */
     public CharsetMatch[] detectAll() {
-        CharsetRecognizer csr;
-        int i;
-        int detectResults;
-        int confidence;
-        ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
+        ArrayList<CharsetMatch> matches = new ArrayList<>();
+
+        MungeInput();  // Strip html markup, collect byte stats.
 
         //  Iterate over all possible charsets, remember all that
         //    give a match quality > 0.
-        for (i = 0; i < fCSRecognizers.size(); i++) {
-            csr = fCSRecognizers.get(i);
-            detectResults = csr.match(this);
-            confidence = detectResults & 0x000000ff;
-            if (confidence > 0) {
-                // Just to be safe, constrain
-                confidence = Math.min(confidence, MAX_CONFIDENCE);
-
-                // Apply charset hint.
-                if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
-                    // Reduce lack of confidence (delta between "sure" and current) by 50%.
-                    confidence += (MAX_CONFIDENCE - confidence) / 2;
+        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+            boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
+            if (active) {
+                CharsetMatch m = rcinfo.recognizer.match(this);
+                if (m != null) {
+                    matches.add(m);
                 }
-
-                CharsetMatch m = new CharsetMatch(this, csr, confidence);
-                matches.add(m);
             }
         }
-
         Collections.sort(matches);      // CharsetMatch compares on confidence
         Collections.reverse(matches);   //  Put best match first.
         CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
@@ -343,27 +309,25 @@ public class CharsetDetector {
     /**
      * Autodetect the charset of an inputStream, and return a Java Reader
      * to access the converted input data.
-     * <p/>
+     * <p>
      * This is a convenience method that is equivalent to
-     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
-     * <p/>
-     *   For the input stream that supplies the character data, markSupported()
-     *   must be true; the  charset detection will read a small amount of data,
-     *   then return the stream to its original position via
-     *   the InputStream.reset() operation.  The exact amount that will
-     *    be read depends on the characteristics of the data itself.
-     *<p/>
+     * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+     * <p>
+     * For the input stream that supplies the character data, markSupported()
+     * must be true; the  charset detection will read a small amount of data,
+     * then return the stream to its original position via
+     * the InputStream.reset() operation.  The exact amount that will
+     * be read depends on the characteristics of the data itself.
+     * <p>
      * Raise an exception if no charsets appear to match the input data.
      *
-     * @param in The source of the byte data in the unknown charset.
-     *
-     * @param declaredEncoding  A declared encoding for the data, if available,
-     *           or null or an empty string if none is available.
-     *
+     * @param in               The source of the byte data in the unknown charset.
+     * @param declaredEncoding A declared encoding for the data, if available,
+     *                         or null or an empty string if none is available.
      * @stable ICU 3.4
      */
     public Reader getReader(InputStream in, String declaredEncoding) {
-        setCanonicalDeclaredEncoding(declaredEncoding);
+        fDeclaredEncoding = declaredEncoding;
 
         try {
             setText(in);
@@ -379,25 +343,24 @@ public class CharsetDetector {
             return null;
         }
     }
+    //   gave us a byte array.
 
     /**
      * Autodetect the charset of an inputStream, and return a String
      * containing the converted input data.
-     * <p/>
+     * <p>
      * This is a convenience method that is equivalent to
-     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
-     *<p/>
+     * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+     * <p>
      * Raise an exception if no charsets appear to match the input data.
      *
-     * @param in The source of the byte data in the unknown charset.
-     *
-     * @param declaredEncoding  A declared encoding for the data, if available,
-     *           or null or an empty string if none is available.
-     *
+     * @param in               The source of the byte data in the unknown charset.
+     * @param declaredEncoding A declared encoding for the data, if available,
+     *                         or null or an empty string if none is available.
      * @stable ICU 3.4
      */
     public String getString(byte[] in, String declaredEncoding) {
-        setCanonicalDeclaredEncoding(declaredEncoding);
+        fDeclaredEncoding = declaredEncoding;
 
         try {
             setText(in);
@@ -413,30 +376,27 @@ public class CharsetDetector {
             return null;
         }
     }
-    //   gave us a byte array.
 
     /**
      * Test whether or not input filtering is enabled.
      *
      * @return <code>true</code> if input text will be filtered.
-     *
-     * @see #enableInputFilter
-     *
      * @stable ICU 3.4
+     * @see #enableInputFilter
      */
     public boolean inputFilterEnabled() {
         return fStripTags;
     }
+    // been changed from the default. The array index is
+    // corresponding to ALL_RECOGNIZER. See setDetectableCharset().
 
     /**
      * Enable filtering of input text. If filtering is enabled,
-     * text within angle brackets ("<" and ">") will be removed
+     * text within angle brackets ("&lt;" and "&gt;") will be removed
      * before detection.
      *
      * @param filter <code>true</code> to enable input text filtering.
-     *
      * @return The previous setting.
-     *
      * @stable ICU 3.4
      */
     public boolean enableInputFilter(boolean filter) {
@@ -447,22 +407,6 @@ public class CharsetDetector {
         return previous;
     }
 
-    /**
-     * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
-     *
-     * @param encoding - name of character encoding
-     */
-    private void setCanonicalDeclaredEncoding(String encoding) {
-        if ((encoding == null) || encoding.isEmpty()) {
-            return;
-        }
-
-        Charset cs = Charset.forName(encoding);
-        if (cs != null) {
-            fDeclaredEncoding = cs.name();
-        }
-    }
-
     /*
      *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
      *               it by removing what appears to be html markup.
@@ -541,4 +485,83 @@ public class CharsetDetector {
             }
         }
     }
-}
+
+    /**
+     * Get the names of charsets that can be recognized by this CharsetDetector instance.
+     *
+     * @return an array of the names of charsets that can be recognized by this CharsetDetector
+     * instance.
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public String[] getDetectableCharsets() {
+        List<String> csnames = new ArrayList<>(ALL_CS_RECOGNIZERS.size());
+        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+            CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+            boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
+            if (active) {
+                csnames.add(rcinfo.recognizer.getName());
+            }
+        }
+        return csnames.toArray(new String[csnames.size()]);
+    }
+
+    /**
+     * Enable or disable individual charset encoding.
+     * A name of charset encoding must be included in the names returned by
+     * {@link #getAllDetectableCharsets()}.
+     *
+     * @param encoding the name of charset encoding.
+     * @param enabled  <code>true</code> to enable, or <code>false</code> to disable the
+     *                 charset encoding.
+     * @return A reference to this <code>CharsetDetector</code>.
+     * @throws IllegalArgumentException when the name of charset encoding is
+     *                                  not supported.
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
+        int modIdx = -1;
+        boolean isDefaultVal = false;
+        for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+            CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
+            if (csrinfo.recognizer.getName().equals(encoding)) {
+                modIdx = i;
+                isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
+                break;
+            }
+        }
+        if (modIdx < 0) {
+            // No matching encoding found
+            throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
+        }
+
+        if (fEnabledRecognizers == null && !isDefaultVal) {
+            // Create an array storing the non default setting
+            fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
+
+            // Initialize the array with default info
+            for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+                fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
+            }
+        }
+
+        if (fEnabledRecognizers != null) {
+            fEnabledRecognizers[modIdx] = enabled;
+        }
+
+        return this;
+    }
+
+    private static class CSRecognizerInfo {
+        CharsetRecognizer recognizer;
+        boolean isDefaultEnabled;
+
+        CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
+            this.recognizer = recognizer;
+            this.isDefaultEnabled = isDefaultEnabled;
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 22219ab..40a10ce 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -1,6 +1,8 @@
+// � 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /**
  * ******************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and    *
+ * Copyright (C) 2005-2016, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  * ******************************************************************************
  */
@@ -18,63 +20,56 @@ import java.io.Reader;
  * as a possible encoding for a set of input data.  From an instance of this
  * class, you can ask for a confidence level in the charset identification,
  * or for Java Reader or String to access the original byte data in Unicode form.
- * <p/>
+ * <p>
  * Instances of this class are created only by CharsetDetectors.
- * <p/>
+ * <p>
  * Note:  this class has a natural ordering that is inconsistent with equals.
- *        The natural ordering is based on the match confidence value.
+ * The natural ordering is based on the match confidence value.
  *
  * @stable ICU 3.4
  */
 public class CharsetMatch implements Comparable<CharsetMatch> {
 
 
-    /**
-     * Bit flag indicating the match is based on the the encoding scheme.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int ENCODING_SCHEME = 1;
-    /**
-     * Bit flag indicating the match is based on the presence of a BOM.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int BOM = 2;
-    /**
-     * Bit flag indicating he match is based on the declared encoding.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int DECLARED_ENCODING = 4;
-    /**
-     * Bit flag indicating the match is based on language statistics.
-     *
-     * @see #getMatchType
-     * @stable ICU 3.4
-     */
-    static public final int LANG_STATISTICS = 8;
     //
     //   Private Data
     //
     private int fConfidence;
-    private CharsetRecognizer fRecognizer;
     private byte[] fRawInput = null;     // Original, untouched input bytes.
     //  If user gave us a byte array, this is it.
     private int fRawLength;           // Length of data in fRawInput array.
     private InputStream fInputStream = null;  // User's input stream, or null if the user
+    private String fCharsetName;         // The name of the charset this CharsetMatch
+    //   represents.  Filled in by the recognizer.
+    private String fLang;                // The language, if one was determined by
 
     /*
      *  Constructor.  Implementation internal
      */
     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
-        fRecognizer = rec;
         fConfidence = conf;
 
-        // The references to the original aplication input data must be copied out
+        // The references to the original application input data must be copied out
+        //   of the charset recognizer to here, in case the application resets the
+        //   recognizer before using this CharsetMatch.
+        if (det.fInputStream == null) {
+            // We only want the existing input byte data if it came straight from the user,
+            //   not if is just the head of a stream.
+            fRawInput = det.fRawInput;
+            fRawLength = det.fRawLength;
+        }
+        fInputStream = det.fInputStream;
+        fCharsetName = rec.getName();
+        fLang = rec.getLanguage();
+    }
+
+    /*
+     *  Constructor.  Implementation internal
+     */
+    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
+        fConfidence = conf;
+
+        // The references to the original application input data must be copied out
         //   of the charset recognizer to here, in case the application resets the
         //   recognizer before using this CharsetMatch.
         if (det.fInputStream == null) {
@@ -84,19 +79,20 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
             fRawLength = det.fRawLength;
         }
         fInputStream = det.fInputStream;
+        fCharsetName = csName;
+        fLang = lang;
     }
 
     /**
      * Create a java.io.Reader for reading the Unicode character data corresponding
      * to the original byte data supplied to the Charset detect operation.
-     * <p/>
+     * <p>
      * CAUTION:  if the source of the byte data was an InputStream, a Reader
      * can be created for only one matching char set using this method.  If more
      * than one charset needs to be tried, the caller will need to reset
      * the InputStream and create InputStreamReaders itself, based on the charset name.
      *
      * @return the Reader for the Unicode character data.
-     *
      * @stable ICU 3.4
      */
     public Reader getReader() {
@@ -119,10 +115,9 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
      * to the original byte data supplied to the Charset detect operation.
      *
      * @return a String created from the converted input data.
-     *
      * @stable ICU 3.4
      */
-    public String getString() throws java.io.IOException {
+    public String getString() throws IOException {
         return getString(-1);
 
     }
@@ -138,13 +133,12 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
      *                  source of the data is an input stream, or -1 for
      *                  unlimited length.
      * @return a String created from the converted input data.
-     *
      * @stable ICU 3.4
      */
-    public String getString(int maxLength) throws java.io.IOException {
+    public String getString(int maxLength) throws IOException {
         String result = null;
         if (fInputStream != null) {
-            StringBuffer sb = new StringBuffer();
+            StringBuilder sb = new StringBuilder();
             char[] buffer = new char[1024];
             Reader reader = getReader();
             int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
@@ -159,7 +153,17 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
 
             return sb.toString();
         } else {
-            result = new String(fRawInput, getName());
+            String name = getName();
+            /*
+             * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
+             * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr'
+             * should be stripped off before creating the string.
+             */
+            int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
+            if (startSuffix > 0) {
+                name = name.substring(0, startSuffix);
+            }
+            result = new String(fRawInput, name);
         }
         return result;
 
@@ -172,7 +176,6 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
      * charset.
      *
      * @return the confidence in the charset match
-     *
      * @stable ICU 3.4
      */
     public int getConfidence() {
@@ -180,26 +183,6 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
     }
 
     /**
-     * Return flags indicating what it was about the input data
-     * that caused this charset to be considered as a possible match.
-     * The result is a bitfield containing zero or more of the flags
-     * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
-     * A result of zero means no information is available.
-     * <p>
-     * Note: currently, this method always returns zero.
-     * <p>
-     *
-     * @return the type of match found for this charset.
-     *
-     * @draft ICU 3.4
-     * @provisional This API might change or be removed in a future release.
-     */
-    public int getMatchType() {
-//      TODO: create a list of enum-like constants for common combinations of types of matches.
-        return 0;
-    }
-
-    /**
      * Get the name of the detected charset.
      * The name will be one that can be used with other APIs on the
      * platform that accept charset names.  It is the "Canonical name"
@@ -207,40 +190,38 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
      * charsets that are registered with the IANA charset registry,
      * this is the MIME-preferred registerd name.
      *
-     * @see java.nio.charset.Charset
-     * @see java.io.InputStreamReader
-     *
      * @return The name of the charset.
-     *
      * @stable ICU 3.4
+     * @see java.nio.charset.Charset
+     * @see InputStreamReader
      */
     public String getName() {
-        return fRecognizer.getName();
+        return fCharsetName;
     }
+    //   gave us a byte array.
 
     /**
      * Get the ISO code for the language of the detected charset.
      *
      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
-     *
      * @stable ICU 3.4
      */
     public String getLanguage() {
-        return fRecognizer.getLanguage();
+        return fLang;
     }
 
     /**
      * Compare to other CharsetMatch objects.
      * Comparison is based on the match confidence value, which
-     *   allows CharsetDetector.detectAll() to order its results.
+     * allows CharsetDetector.detectAll() to order its results.
      *
-     * @param o the CharsetMatch object to compare against.
+     * @param other the CharsetMatch object to compare against.
      * @return a negative integer, zero, or a positive integer as the
-     *          confidence level of this CharsetMatch
-     *          is less than, equal to, or greater than that of
-     *          the argument.
+     * confidence level of this CharsetMatch
+     * is less than, equal to, or greater than that of
+     * the argument.
      * @throws ClassCastException if the argument is not a CharsetMatch.
-     * @stable ICU 3.4
+     * @stable ICU 4.4
      */
     public int compareTo(CharsetMatch other) {
         int compareResult = 0;
@@ -251,36 +232,5 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
         }
         return compareResult;
     }
-
-    /**
-     * compare this CharsetMatch to another based on confidence value
-     * @param o the CharsetMatch object to compare against
-     * @return true if equal
-     */
-    public boolean equals(Object o) {
-        if (o instanceof CharsetMatch) {
-            CharsetMatch that = (CharsetMatch) o;
-            return (this.fConfidence == that.fConfidence);
-        }
-
-        return false;
-    }
-
-    /**
-     * generates a hashCode based on the confidence value
-     * @return the hashCode
-     */
-    public int hashCode() {
-        return fConfidence;
-    }
-    //   gave us a byte array.
-
-    public String toString() {
-        String s = "Match of " + fRecognizer.getName();
-        if (fRecognizer.getLanguage() != null) {
-            s += " in " + fRecognizer.getLanguage();
-        }
-        s += " with confidence " + fConfidence;
-        return s;
-    }
-}
+    //   the recognizer during the detect operation.
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
index 129c9a8..d4805be 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
@@ -1,6 +1,8 @@
+// � 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /*
 *******************************************************************************
-* Copyright (C) 2005 - 2008, International Business Machines Corporation and  *
+* Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
@@ -11,10 +13,8 @@ package org.apache.tika.parser.txt;
  * This is a superclass for the individual detectors for
  * each of the detectable members of the ISO 2022 family
  * of encodings.
- * <p/>
+ * <p>
  * The separate classes are nested within this class.
- *
- * @internal
  */
 abstract class CharsetRecog_2022 extends CharsetRecognizer {
 
@@ -74,7 +74,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
 
         //
         // Initial quality is based on relative proportion of recongized vs.
-        //   unrecognized escape sequences. 
+        //   unrecognized escape sequences.
         //   All good:  quality = 100;
         //   half or less good: quality = 0;
         //   linear inbetween.
@@ -114,8 +114,9 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
             return "ISO-2022-JP";
         }
 
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -128,10 +129,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
             return "ISO-2022-KR";
         }
 
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
-
     }
 
     static class CharsetRecog_2022CN extends CharsetRecog_2022 {
@@ -153,11 +154,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
             return "ISO-2022-CN";
         }
 
-
-        int match(CharsetDetector det) {
-            return match(det.fInputBytes, det.fInputLen, escapeSequences);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
 }
-

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
index 55a3957..a5100bc 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
@@ -1,6 +1,8 @@
+// � 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /**
  * ******************************************************************************
- * Copyright (C) 2005 - 2007, International Business Machines Corporation and  *
+ * Copyright (C) 2005 - 2014, International Business Machines Corporation and  *
  * others. All Rights Reserved.                                                *
  * ******************************************************************************
  */
@@ -8,8 +10,6 @@ package org.apache.tika.parser.txt;
 
 /**
  * Charset recognizer for UTF-8
- *
- * @internal
  */
 class CharsetRecog_UTF8 extends CharsetRecognizer {
 
@@ -20,7 +20,7 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
     /* (non-Javadoc)
      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
      */
-    int match(CharsetDetector det) {
+    CharsetMatch match(CharsetDetector det) {
         boolean hasBOM = false;
         int numValid = 0;
         int numInvalid = 0;
@@ -50,10 +50,7 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
                 trailBytes = 3;
             } else {
                 numInvalid++;
-                if (numInvalid > 5) {
-                    break;
-                }
-                trailBytes = 0;
+                continue;
             }
 
             // Verify that we've got the right number of trail bytes in the sequence
@@ -72,7 +69,6 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
                     break;
                 }
             }
-
         }
 
         // Cook up some sort of confidence score, based on presense of a BOM
@@ -87,13 +83,15 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
         } else if (numValid > 0 && numInvalid == 0) {
             confidence = 80;
         } else if (numValid == 0 && numInvalid == 0) {
-            // Plain ASCII.  
-            confidence = 10;
+            // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
+            //              accepts ASCII with confidence = 10.
+            // TODO: add plain ASCII as an explicitly detected type.
+            confidence = 15;
         } else if (numValid > numInvalid * 10) {
             // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
             confidence = 25;
         }
-        return confidence;
+        return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
     }
 
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
index be6455f..a92acc1 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
@@ -1,20 +1,44 @@
+// � 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /*
  *******************************************************************************
- * Copyright (C) 1996-2007, International Business Machines Corporation and    *
+ * Copyright (C) 1996-2013, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  *
  */
+
 package org.apache.tika.parser.txt;
 
 /**
  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
  * BOM will be used if it is present.
- *
- * @internal
  */
 abstract class CharsetRecog_Unicode extends CharsetRecognizer {
 
+    static int codeUnit16FromBytes(byte hi, byte lo) {
+        return ((hi & 0xff) << 8) | (lo & 0xff);
+    }
+
+    // UTF-16 confidence calculation. Very simple minded, but better than nothing.
+    //   Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
+    //     and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
+    //   NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
+    //   NULs should be rare in actual text.
+    static int adjustConfidence(int codeUnit, int confidence) {
+        if (codeUnit == 0) {
+            confidence -= 10;
+        } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
+            confidence += 10;
+        }
+        if (confidence < 0) {
+            confidence = 0;
+        } else if (confidence > 100) {
+            confidence = 100;
+        }
+        return confidence;
+    }
+
     /* (non-Javadoc)
      * @see com.ibm.icu.text.CharsetRecognizer#getName()
      */
@@ -23,22 +47,36 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
     /* (non-Javadoc)
      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
      */
-    abstract int match(CharsetDetector det);
+    abstract CharsetMatch match(CharsetDetector det);
 
     static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
         String getName() {
             return "UTF-16BE";
         }
 
-        int match(CharsetDetector det) {
+        CharsetMatch match(CharsetDetector det) {
             byte[] input = det.fRawInput;
-
-            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
-                return 100;
+            int confidence = 10;
+
+            int bytesToCheck = Math.min(input.length, 30);
+            for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) {
+                int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
+                if (charIndex == 0 && codeUnit == 0xFEFF) {
+                    confidence = 100;
+                    break;
+                }
+                confidence = adjustConfidence(codeUnit, confidence);
+                if (confidence == 0 || confidence == 100) {
+                    break;
+                }
             }
-
-            // TODO: Do some statistics to check for unsigned UTF-16BE
-            return 0;
+            if (bytesToCheck < 4 && confidence < 100) {
+                confidence = 0;
+            }
+            if (confidence > 0) {
+                return new CharsetMatch(det, this, confidence);
+            }
+            return null;
         }
     }
 
@@ -47,20 +85,29 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
             return "UTF-16LE";
         }
 
-        int match(CharsetDetector det) {
+        CharsetMatch match(CharsetDetector det) {
             byte[] input = det.fRawInput;
-
-            if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) {
-                // An LE BOM is present.
-                if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
-                    // It is probably UTF-32 LE, not UTF-16
-                    return 0;
+            int confidence = 10;
+
+            int bytesToCheck = Math.min(input.length, 30);
+            for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) {
+                int codeUnit = codeUnit16FromBytes(input[charIndex + 1], input[charIndex]);
+                if (charIndex == 0 && codeUnit == 0xFEFF) {
+                    confidence = 100;
+                    break;
+                }
+                confidence = adjustConfidence(codeUnit, confidence);
+                if (confidence == 0 || confidence == 100) {
+                    break;
                 }
-                return 100;
             }
-
-            // TODO: Do some statistics to check for unsigned UTF-16LE
-            return 0;
+            if (bytesToCheck < 4 && confidence < 100) {
+                confidence = 0;
+            }
+            if (confidence > 0) {
+                return new CharsetMatch(det, this, confidence);
+            }
+            return null;
         }
     }
 
@@ -69,7 +116,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
 
         abstract String getName();
 
-        int match(CharsetDetector det) {
+        CharsetMatch match(CharsetDetector det) {
             byte[] input = det.fRawInput;
             int limit = (det.fRawLength / 4) * 4;
             int numValid = 0;
@@ -78,7 +125,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
             int confidence = 0;
 
             if (limit == 0) {
-                return 0;
+                return null;
             }
             if (getChar(input, 0) == 0x0000FEFF) {
                 hasBOM = true;
@@ -110,7 +157,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
                 confidence = 25;
             }
 
-            return confidence;
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -136,4 +183,4 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
             return "UTF-32LE";
         }
     }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
index 35d2b4f..3c38cd0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
@@ -1,6 +1,8 @@
+// � 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /*
  ****************************************************************************
- * Copyright (C) 2005-2008, International Business Machines Corporation and *
+ * Copyright (C) 2005-2012, International Business Machines Corporation and *
  * others. All Rights Reserved.                                             *
  ****************************************************************************
  *
@@ -20,8 +22,6 @@ import java.util.Arrays;
  * CharsetDetector class and kept in the global list of available
  * encodings to be checked.  The specific encoding being recognized
  * is determined by subclass.
- *
- * @internal
  */
 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
 
@@ -46,7 +46,8 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
      * bits 8-15: The match reason, an enum-like value.
      */
     int match(CharsetDetector det, int[] commonChars) {
-        int singleByteCharCount = 0;
+        @SuppressWarnings("unused")
+        int singleByteCharCount = 0;  //TODO Do we really need this?
         int doubleByteCharCount = 0;
         int commonCharCount = 0;
         int badCharCount = 0;
@@ -132,7 +133,7 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
      * Get the next character (however many bytes it is) from the input data
      * Subclasses for specific charset encodings must implement this function
      * to get characters according to the rules of their encoding scheme.
-     * <p/>
+     * <p>
      * This function is not a method of class iteratedChar only because
      * that would require a lot of extra derived classes, which is awkward.
      *
@@ -156,14 +157,12 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
     //
     static class iteratedChar {
         int charValue = 0;             // 1-4 bytes from the raw input data
-        int index = 0;
         int nextIndex = 0;
         boolean error = false;
         boolean done = false;
 
         void reset() {
             charValue = 0;
-            index = -1;
             nextIndex = 0;
             error = false;
             done = false;
@@ -195,7 +194,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                         0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
 
         boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
             it.error = false;
             int firstByte;
             firstByte = it.charValue = it.nextByte(det);
@@ -219,8 +217,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
             return true;
         }
 
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, commonChars);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
 
         String getName() {
@@ -255,7 +254,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                         0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
 
         boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
             it.error = false;
             int firstByte;
             firstByte = it.charValue = it.nextByte(det);
@@ -282,8 +280,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
             return true;
         }
 
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, commonChars);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
 
         String getName() {
@@ -311,7 +310,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
          *     packed into an int.
          */
         boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
             it.error = false;
             int firstByte = 0;
             int secondByte = 0;
@@ -392,8 +390,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                 return "EUC-JP";
             }
 
-            int match(CharsetDetector det) {
-                return match(det, commonChars);
+            CharsetMatch match(CharsetDetector det) {
+                int confidence = match(det, commonChars);
+                return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
             }
 
             public String getLanguage() {
@@ -425,8 +424,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
                 return "EUC-KR";
             }
 
-            int match(CharsetDetector det) {
-                return match(det, commonChars);
+            CharsetMatch match(CharsetDetector det) {
+                int confidence = match(det, commonChars);
+                return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
             }
 
             public String getLanguage() {
@@ -462,7 +462,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
          *     packed into an int.
          */
         boolean nextChar(iteratedChar it, CharsetDetector det) {
-            it.index = it.nextIndex;
             it.error = false;
             int firstByte = 0;
             int secondByte = 0;
@@ -519,8 +518,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
             return "GB18030";
         }
 
-        int match(CharsetDetector det) {
-            return match(det, commonChars);
+        CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, commonChars);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
 
         public String getLanguage() {
@@ -529,4 +529,4 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
     }
 
 
-}
+}
\ No newline at end of file

[2/3] tika git commit: TIKA-2041, upgrade ICU4j's charset detector to avoid multithreading bug.

Posted by ta...@apache.org.

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
index 87f831b..32824be 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
@@ -1,57 +1,37 @@
+// � 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /*
  ****************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
+ * Copyright (C) 2005-2013, International Business Machines Corporation and *
  * others. All Rights Reserved.                                             *
  ************************************************************************** *
  *
  */
-package org.apache.tika.parser.txt;
 
-import java.nio.ByteBuffer;
+package org.apache.tika.parser.txt;
 
 /**
  * This class recognizes single-byte encodings. Because the encoding scheme is so
  * simple, language statistics are used to do the matching.
- * <p/>
- * The Recognizer works by first mapping from bytes in the encoding under test
- * into that Recognizer's ngram space. Normally this means performing a
- * lowercase, and excluding codepoints that don't correspond to numbers of
- * letters. (Accented letters may or may not be ignored or normalised, depending
- * on the needs of the ngrams)
- * Then, ngram analysis is run against the transformed text, and a confidence
- * is calculated.
- * <p/>
- * For many of our Recognizers, we have one ngram set per language in each
- * encoding, and do a simultanious language+charset detection.
- * <p/>
- * When adding new Recognizers, the easiest way is to byte map to an existing
- * encoding for which we have ngrams, excluding non text, and re-use the ngrams.
- *
- * @internal
  */
 abstract class CharsetRecog_sbcs extends CharsetRecognizer {
 
-    protected boolean haveC1Bytes = false;
-
     /* (non-Javadoc)
      * @see com.ibm.icu.text.CharsetRecognizer#getName()
      */
     abstract String getName();
 
-    /* (non-Javadoc)
-     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
-     */
-    abstract int match(CharsetDetector det);
-
     int match(CharsetDetector det, int[] ngrams, byte[] byteMap) {
         return match(det, ngrams, byteMap, (byte) 0x20);
     }
 
     int match(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar) {
         NGramParser parser = new NGramParser(ngrams, byteMap);
+        return parser.parse(det, spaceChar);
+    }
 
-        haveC1Bytes = det.fC1Bytes;
-
+    int matchIBM420(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar) {
+        NGramParser_IBM420 parser = new NGramParser_IBM420(ngrams, byteMap);
         return parser.parse(det, spaceChar);
     }
 
@@ -59,17 +39,14 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         //        private static final int N_GRAM_SIZE = 3;
         private static final int N_GRAM_MASK = 0xFFFFFF;
 
-        private int byteIndex = 0;
+        protected int byteIndex = 0;
+        protected byte[] byteMap;
+        protected byte spaceChar;
         private int ngram = 0;
-
         private int[] ngramList;
-        private byte[] byteMap;
-
         private int ngramCount;
         private int hitCount;
 
-        private byte spaceChar;
-
         public NGramParser(int[] theNgramList, byte[] theByteMap) {
             ngramList = theNgramList;
             byteMap = theByteMap;
@@ -129,7 +106,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
 
         }
 
-        private void addByte(int b) {
+        protected void addByte(int b) {
             ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
             lookup(ngram);
         }
@@ -142,14 +119,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return det.fInputBytes[byteIndex++] & 0xFF;
         }
 
-        public int parse(CharsetDetector det) {
-            return parse(det, (byte) 0x20);
-        }
-
-        public int parse(CharsetDetector det, byte spaceCh) {
+        protected void parseCharacters(CharsetDetector det) {
             int b;
             boolean ignoreSpace = false;
-            this.spaceChar = spaceCh;
 
             while ((b = nextByte(det)) >= 0) {
                 byte mb = byteMap[b];
@@ -161,13 +133,21 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
                     }
 
                     ignoreSpace = (mb == spaceChar);
-                } else if (mb == 0 && b != 0) {
-                    // Indicates an invalid character in the charset
-                    // Bump the ngram count up a bit to indicate uncertainty
-                    ngramCount += 4;
                 }
             }
 
+        }
+
+        public int parse(CharsetDetector det) {
+            return parse(det, (byte) 0x20);
+        }
+
+        public int parse(CharsetDetector det, byte spaceCh) {
+
+            this.spaceChar = spaceCh;
+
+            parseCharacters(det);
+
             // TODO: Is this OK? The buffer could have ended in the middle of a word...
             addByte(spaceChar);
 
@@ -187,218 +167,262 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         }
     }
 
-    abstract static class CharsetRecog_8859_1 extends CharsetRecog_sbcs {
-        protected static byte[] byteMap = {
-/* 0x00-0x07 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x08-0x0f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x10-0x17 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x18-0x1f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x20-0x27 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, 
-/* 0x28-0x2f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x30-0x37 */ (byte) 0x30, (byte) 0x31, (byte) 0x32, (byte) 0x33, (byte) 0x34, (byte) 0x35, (byte) 0x36, (byte) 0x37, 
-/* 0x38-0x3f */ (byte) 0x38, (byte) 0x39, (byte) 0x40, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x40-0x47 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, 
-/* 0x48-0x4f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, 
-/* 0x50-0x57 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, 
-/* 0x58-0x0f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x60-0x67 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, 
-/* 0x68-0x6f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, 
-/* 0x70-0x77 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, 
-/* 0x78-0x7f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x80-0x87 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x88-0x8f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x90-0x97 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0x98-0x9f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0xa0-0xa7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0xa8-0xaf */ (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0xb0-0xb7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20, 
-/* 0xb8-0xbf */ (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, 
-/* 0xc0-0xc7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, 
-/* 0xc8-0xcf */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, 
-/* 0xd0-0xd7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, 
-/* 0xd8-0xdf */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF, 
-/* 0xe0-0xe7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, 
-/* 0xe8-0xef */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, 
-/* 0xf0-0xf7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, 
-/* 0xf8-0xff */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
+    static class NGramParser_IBM420 extends NGramParser {
+        protected static byte[] unshapeMap = {
+/*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
+/* 0- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 1- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 2- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 3- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 4- */    (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x42, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x47, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
+/* 5- */    (byte) 0x50, (byte) 0x49, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x56, (byte) 0x58, (byte) 0x58, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
+/* 6- */    (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x63, (byte) 0x65, (byte) 0x65, (byte) 0x67, (byte) 0x67, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
+/* 7- */    (byte) 0x69, (byte) 0x71, (byte) 0x71, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x77, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D, (byte) 0x7E, (byte) 0x7F,
+/* 8- */    (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x80, (byte) 0x8B, (byte) 0x8B, (byte) 0x8D, (byte) 0x8D, (byte) 0x8F,
+/* 9- */    (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9E, (byte) 0x9E,
+/* A- */    (byte) 0x9E, (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x9E, (byte) 0xAB, (byte) 0xAB, (byte) 0xAD, (byte) 0xAD, (byte) 0xAF,
+/* B- */    (byte) 0xAF, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7, (byte) 0xB8, (byte) 0xB9, (byte) 0xB1, (byte) 0xBB, (byte) 0xBB, (byte) 0xBD, (byte) 0xBD, (byte) 0xBF,
+/* C- */    (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xBF, (byte) 0xCC, (byte) 0xBF, (byte) 0xCE, (byte) 0xCF,
+/* D- */    (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDA, (byte) 0xDC, (byte) 0xDC, (byte) 0xDC, (byte) 0xDF,
+/* E- */    (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
+/* F- */    (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
         };
+        private byte alef = 0x00;
 
-        public String getName() {
-            return haveC1Bytes ? "windows-1252" : "ISO-8859-1";
-        }
-    }
-
-    static class CharsetRecog_8859_1_da extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
-                0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
-                0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
-                0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
-        };
 
-        public String getLanguage() {
-            return "da";
+        public NGramParser_IBM420(int[] theNgramList, byte[] theByteMap) {
+            super(theNgramList, theByteMap);
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        private byte isLamAlef(byte b) {
+            if (b == (byte) 0xb2 || b == (byte) 0xb3) {
+                return (byte) 0x47;
+            } else if (b == (byte) 0xb4 || b == (byte) 0xb5) {
+                return (byte) 0x49;
+            } else if (b == (byte) 0xb8 || b == (byte) 0xb9) {
+                return (byte) 0x56;
+            } else
+                return (byte) 0x00;
         }
-    }
-
-    static class CharsetRecog_8859_1_de extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
-                0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
-                0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
-                0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
-        };
 
-        public String getLanguage() {
-            return "de";
-        }
+        /*
+         * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
+         * because CharsetDetector is dealing with bytes not Unicode code points. We could
+         * convert the bytes to Unicode code points but that would leave us dependent
+         * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
+         * of JDK can produce different results and therefore is also avoided.
+         */
+        private int nextByte(CharsetDetector det) {
+            if (byteIndex >= det.fInputLen || det.fInputBytes[byteIndex] == 0) {
+                return -1;
+            }
+            int next;
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
+            alef = isLamAlef(det.fInputBytes[byteIndex]);
+            if (alef != (byte) 0x00)
+                next = 0xB1 & 0xFF;
+            else
+                next = unshapeMap[det.fInputBytes[byteIndex] & 0xFF] & 0xFF;
 
-    static class CharsetRecog_8859_1_en extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
-                0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
-                0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
-                0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
-        };
+            byteIndex++;
 
-        public String getLanguage() {
-            return "en";
+            return next;
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
+        protected void parseCharacters(CharsetDetector det) {
+            int b;
+            boolean ignoreSpace = false;
 
-    static class CharsetRecog_8859_1_es extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
-                0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
-                0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
-                0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
-        };
+            while ((b = nextByte(det)) >= 0) {
+                byte mb = byteMap[b];
 
-        public String getLanguage() {
-            return "es";
-        }
+                // TODO: 0x20 might not be a space in all character sets...
+                if (mb != 0) {
+                    if (!(mb == spaceChar && ignoreSpace)) {
+                        addByte(mb);
+                    }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
+                    ignoreSpace = (mb == spaceChar);
+                }
+                if (alef != (byte) 0x00) {
+                    mb = byteMap[alef & 0xFF];
 
-    static class CharsetRecog_8859_1_fr extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
-                0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
-                0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
-                0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
-        };
+                    // TODO: 0x20 might not be a space in all character sets...
+                    if (mb != 0) {
+                        if (!(mb == spaceChar && ignoreSpace)) {
+                            addByte(mb);
+                        }
 
-        public String getLanguage() {
-            return "fr";
-        }
+                        ignoreSpace = (mb == spaceChar);
+                    }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+                }
+            }
         }
     }
 
-    static class CharsetRecog_8859_1_it extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
-                0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
-                0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
-                0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
-        };
-
-        public String getLanguage() {
-            return "it";
-        }
+    static class NGramsPlusLang {
+        int[] fNGrams;
+        String fLang;
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        NGramsPlusLang(String la, int[] ng) {
+            fLang = la;
+            fNGrams = ng;
         }
     }
 
-    static class CharsetRecog_8859_1_nl extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
-                0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
-                0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
-                0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
+    static class CharsetRecog_8859_1 extends CharsetRecog_sbcs {
+        protected static byte[] byteMap = {
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
+                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
+                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
+                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
+                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
+                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
+                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
+                (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
+                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
+                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
+                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF,
+                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
+                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
+                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
+                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
         };
 
-        public String getLanguage() {
-            return "nl";
-        }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
+        private static NGramsPlusLang[] ngrams_8859_1 = new NGramsPlusLang[]{
+                new NGramsPlusLang(
+                        "da",
+                        new int[]{
+                                0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
+                                0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
+                                0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
+                                0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
+                        }),
+                new NGramsPlusLang(
+                        "de",
+                        new int[]{
+                                0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
+                                0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
+                                0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
+                                0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
+                        }),
+                new NGramsPlusLang(
+                        "en",
+                        new int[]{
+                                0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
+                                0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
+                                0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
+                                0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
+                        }),
+
+                new NGramsPlusLang(
+                        "es",
+                        new int[]{
+                                0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
+                                0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
+                                0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
+                                0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
+                        }),
+
+                new NGramsPlusLang(
+                        "fr",
+                        new int[]{
+                                0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
+                                0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
+                                0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
+                                0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
+                        }),
+
+                new NGramsPlusLang(
+                        "it",
+                        new int[]{
+                                0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
+                                0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
+                                0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
+                                0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
+                        }),
+
+                new NGramsPlusLang(
+                        "nl",
+                        new int[]{
+                                0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
+                                0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
+                                0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
+                                0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
+                        }),
+
+                new NGramsPlusLang(
+                        "no",
+                        new int[]{
+                                0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
+                                0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
+                                0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
+                                0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
+                        }),
+
+                new NGramsPlusLang(
+                        "pt",
+                        new int[]{
+                                0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
+                                0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
+                                0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
+                                0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
+
+                        }),
+
+                new NGramsPlusLang(
+                        "sv",
+                        new int[]{
+                                0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
+                                0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
+                                0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
+                                0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
+                        }),
 
-    static class CharsetRecog_8859_1_no extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
-                0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
-                0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
-                0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
         };
 
-        public String getLanguage() {
-            return "no";
-        }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            String name = det.fC1Bytes ? "windows-1252" : "ISO-8859-1";
+            int bestConfidenceSoFar = -1;
+            String lang = null;
+            for (NGramsPlusLang ngl : ngrams_8859_1) {
+                int confidence = match(det, ngl.fNGrams, byteMap);
+                if (confidence > bestConfidenceSoFar) {
+                    bestConfidenceSoFar = confidence;
+                    lang = ngl.fLang;
+                }
+            }
+            return bestConfidenceSoFar <= 0 ? null : new CharsetMatch(det, this, bestConfidenceSoFar, name, lang);
         }
-    }
 
-    static class CharsetRecog_8859_1_pt extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
-                0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
-                0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
-                0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
-        };
 
-        public String getLanguage() {
-            return "pt";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public String getName() {
+            return "ISO-8859-1";
         }
     }
 
-    static class CharsetRecog_8859_1_sv extends CharsetRecog_8859_1 {
-        private static int[] ngrams = {
-                0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
-                0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
-                0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
-                0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
-        };
-
-        public String getLanguage() {
-            return "sv";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
 
-    abstract static class CharsetRecog_8859_2 extends CharsetRecog_sbcs {
+    static class CharsetRecog_8859_2 extends CharsetRecog_sbcs {
         protected static byte[] byteMap = {
                 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
                 (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
@@ -434,78 +458,61 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
                 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20,
         };
 
-        public String getName() {
-            return haveC1Bytes ? "windows-1250" : "ISO-8859-2";
-        }
-    }
-
-    static class CharsetRecog_8859_2_cs extends CharsetRecog_8859_2 {
-        private static int[] ngrams = {
-                0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
-                0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
-                0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
-                0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
-        };
-
-        public String getLanguage() {
-            return "cs";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_8859_2_hu extends CharsetRecog_8859_2 {
-        private static int[] ngrams = {
-                0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
-                0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
-                0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
-                0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
+        private static NGramsPlusLang[] ngrams_8859_2 = new NGramsPlusLang[]{
+                new NGramsPlusLang(
+                        "cs",
+                        new int[]{
+                                0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
+                                0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
+                                0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
+                                0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
+                        }),
+                new NGramsPlusLang(
+                        "hu",
+                        new int[]{
+                                0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
+                                0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
+                                0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
+                                0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
+                        }),
+                new NGramsPlusLang(
+                        "pl",
+                        new int[]{
+                                0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
+                                0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
+                                0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
+                                0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
+                        }),
+                new NGramsPlusLang(
+                        "ro",
+                        new int[]{
+                                0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
+                                0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
+                                0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
+                                0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
+                        })
         };
 
-        public String getLanguage() {
-            return "hu";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            String name = det.fC1Bytes ? "windows-1250" : "ISO-8859-2";
+            int bestConfidenceSoFar = -1;
+            String lang = null;
+            for (NGramsPlusLang ngl : ngrams_8859_2) {
+                int confidence = match(det, ngl.fNGrams, byteMap);
+                if (confidence > bestConfidenceSoFar) {
+                    bestConfidenceSoFar = confidence;
+                    lang = ngl.fLang;
+                }
+            }
+            return bestConfidenceSoFar <= 0 ? null : new CharsetMatch(det, this, bestConfidenceSoFar, name, lang);
         }
-    }
 
-    static class CharsetRecog_8859_2_pl extends CharsetRecog_8859_2 {
-        private static int[] ngrams = {
-                0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
-                0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
-                0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
-                0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
-        };
-
-        public String getLanguage() {
-            return "pl";
+        public String getName() {
+            return "ISO-8859-2";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
     }
 
-    static class CharsetRecog_8859_2_ro extends CharsetRecog_8859_2 {
-        private static int[] ngrams = {
-                0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
-                0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
-                0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
-                0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
-        };
-
-        public String getLanguage() {
-            return "ro";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
 
     abstract static class CharsetRecog_8859_5 extends CharsetRecog_sbcs {
         protected static byte[] byteMap = {
@@ -560,8 +567,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "ru";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -618,8 +626,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "ar";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -660,7 +669,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         };
 
         public String getName() {
-            return haveC1Bytes ? "windows-1253" : "ISO-8859-7";
+            return "ISO-8859-7";
         }
     }
 
@@ -676,8 +685,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "el";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            String name = det.fC1Bytes ? "windows-1253" : "ISO-8859-7";
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "el");
         }
     }
 
@@ -718,7 +729,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         };
 
         public String getName() {
-            return haveC1Bytes ? "windows-1255" : "ISO-8859-8";
+            return "ISO-8859-8";
         }
     }
 
@@ -731,15 +742,17 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         };
 
         public String getName() {
-            return haveC1Bytes ? "windows-1255" : /*"ISO-8859-8-I"*/ "ISO-8859-8";
+            return "ISO-8859-8-I";
         }
 
         public String getLanguage() {
             return "he";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8-I";
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he");
         }
     }
 
@@ -755,8 +768,11 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "he";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8";
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he");
+
         }
     }
 
@@ -797,7 +813,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         };
 
         public String getName() {
-            return haveC1Bytes ? "windows-1254" : "ISO-8859-9";
+            return "ISO-8859-9";
         }
     }
 
@@ -813,8 +829,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "tr";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            String name = det.fC1Bytes ? "windows-1254" : "ISO-8859-9";
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "tr");
         }
     }
 
@@ -869,65 +887,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "ru";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
-        }
-    }
-
-    static class CharsetRecog_IBM866_ru extends CharsetRecog_sbcs {
-        private static int[] ngrams = {
-                0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
-                0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
-                0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
-                0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
-        };
-
-        // bytemap converts cp866 chars to cp1251 chars, so ngrams are still unchanged
-        private static byte[] byteMap = {
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-                (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-                (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-                (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-                (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-                (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-                (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
-                (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-                (byte) 0xB8, (byte) 0xB8, (byte) 0xBA, (byte) 0xBA, (byte) 0xBF, (byte) 0xBF, (byte) 0xA2, (byte) 0xA2,
-                (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-        };
-
-        public String getName() {
-            return "IBM866";
-        }
-
-        public String getLanguage() {
-            return "ru";
-        }
-
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -982,8 +944,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "ar";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -1038,29 +1001,30 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "ru";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap);
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, ngrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
     abstract static class CharsetRecog_IBM424_he extends CharsetRecog_sbcs {
         protected static byte[] byteMap = {
 /*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
-/* 0- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 1- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 2- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 3- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 4- */    (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 5- */    (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 6- */    (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 7- */    (byte) 0x40, (byte) 0x71, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x00, (byte) 0x40, (byte) 0x40, 
-/* 8- */    (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* 9- */    (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* A- */    (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* B- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* C- */    (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* D- */    (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
-/* E- */    (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, 
+/* 0- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 1- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 2- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 3- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 4- */    (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 5- */    (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 6- */    (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 7- */    (byte) 0x40, (byte) 0x71, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x00, (byte) 0x40, (byte) 0x40,
+/* 8- */    (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 9- */    (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* A- */    (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* B- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* C- */    (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* D- */    (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* E- */    (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
 /* F- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
         };
 
@@ -1081,8 +1045,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "IBM424_rtl";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap, (byte) 0x40);
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, ngrams, byteMap, (byte) 0x40);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
@@ -1099,12 +1064,14 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "IBM424_ltr";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, ngrams, byteMap, (byte) 0x40);
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, ngrams, byteMap, (byte) 0x40);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
     }
 
     abstract static class CharsetRecog_IBM420_ar extends CharsetRecog_sbcs {
+
         protected static byte[] byteMap = {
 /*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
 /* 0- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
@@ -1124,85 +1091,12 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
 /* E- */    (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xEA, (byte) 0xEB, (byte) 0x40, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
 /* F- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x40,
         };
-        protected static byte[] unshapeMap = {
-/*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
-/* 0- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 1- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 2- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 3- */    (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 4- */    (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x42, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x47, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
-/* 5- */    (byte) 0x50, (byte) 0x49, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x56, (byte) 0x58, (byte) 0x58, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
-/* 6- */    (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x63, (byte) 0x65, (byte) 0x65, (byte) 0x67, (byte) 0x67, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-/* 7- */    (byte) 0x69, (byte) 0x71, (byte) 0x71, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x77, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D, (byte) 0x7E, (byte) 0x7F,
-/* 8- */    (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x80, (byte) 0x8B, (byte) 0x8B, (byte) 0x8D, (byte) 0x8D, (byte) 0x8F,
-/* 9- */    (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9E, (byte) 0x9E,
-/* A- */    (byte) 0x9E, (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x9E, (byte) 0xAB, (byte) 0xAB, (byte) 0xAD, (byte) 0xAD, (byte) 0xAF,
-/* B- */    (byte) 0xAF, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7, (byte) 0xB8, (byte) 0xB9, (byte) 0xB1, (byte) 0xBB, (byte) 0xBB, (byte) 0xBD, (byte) 0xBD, (byte) 0xBF,
-/* C- */    (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xBF, (byte) 0xCC, (byte) 0xBF, (byte) 0xCE, (byte) 0xCF,
-/* D- */    (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDA, (byte) 0xDC, (byte) 0xDC, (byte) 0xDC, (byte) 0xDF,
-/* E- */    (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-/* F- */    (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
-        };
-        //arabic shaping class, method shape/unshape
-        //protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
-        protected byte[] prev_fInputBytes = null;
+
 
         public String getLanguage() {
             return "ar";
         }
 
-        protected void matchInit(CharsetDetector det) {
-            prev_fInputBytes = det.fInputBytes.clone();
-            byte bb[] = unshape(det.fInputBytes);
-            det.setText(bb);
-        }
-
-        /*
-         * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
-         * because CharsetDetector is dealing with bytes not Unicode code points. We could
-         * convert the bytes to Unicode code points but that would leave us dependent
-         * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
-         * of JDK can produce different results and therefore is also avoided.
-         */
-        private byte[] unshape(byte[] inputBytes) {
-            byte resultByteArr[] = unshapeLamAlef(inputBytes);
-
-            for (int i = 0; i < inputBytes.length; i++) {
-                resultByteArr[i] = unshapeMap[resultByteArr[i] & 0xFF];
-            }
-            return resultByteArr;
-        }
-
-        private byte[] unshapeLamAlef(byte[] inputBytes) {
-            ByteBuffer resultBigBuffer = ByteBuffer.allocate(inputBytes.length * 2);
-            ByteBuffer resultBuffer;
-            byte unshapedLamAlef[] = {(byte) 0xb1, (byte) 0x56};
-
-            for (byte inputByte : inputBytes) {
-                if (isLamAlef(inputByte))
-                    resultBigBuffer.put(unshapedLamAlef);
-                else
-                    resultBigBuffer.put(inputByte);
-            }
-            resultBuffer = ByteBuffer.allocate(resultBigBuffer.position());
-            resultBuffer.put(resultBigBuffer.array(), 0, resultBigBuffer.position());
-            return resultBuffer.array();
-        }
-
-        private boolean isLamAlef(byte b) {
-            // Return true if byte is any of these:
-            //
-            //   {(byte)0xb2,(byte)0xb3,(byte)0xb4,(byte)0xb5,(byte)0xb7,(byte)0xb8}
-            // 
-            // NOTE: 0xb2 is -78; 0xb8 is -72:
-            return (b <= (byte) 0xb8) && (b >= (byte) 0xb2) && (b != (byte) 0xb6);
-        }
-
-        protected void matchFinish(CharsetDetector det) {
-            if (prev_fInputBytes != null)
-                det.setText(prev_fInputBytes);
-        }
-
     }
 
     static class CharsetRecog_IBM420_ar_rtl extends CharsetRecog_IBM420_ar {
@@ -1217,11 +1111,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "IBM420_rtl";
         }
 
-        public int match(CharsetDetector det) {
-            matchInit(det);
-            int result = match(det, ngrams, byteMap, (byte) 0x40);
-            matchFinish(det);
-            return result;
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = matchIBM420(det, ngrams, byteMap, (byte) 0x40);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
 
     }
@@ -1238,19 +1130,19 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
             return "IBM420_ltr";
         }
 
-        public int match(CharsetDetector det) {
-            matchInit(det);
-            int result = match(det, ngrams, byteMap, (byte) 0x40);
-            matchFinish(det);
-            return result;
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = matchIBM420(det, ngrams, byteMap, (byte) 0x40);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
         }
+
     }
 
     static abstract class CharsetRecog_EBCDIC_500 extends CharsetRecog_sbcs {
+
         // This maps EBCDIC 500 codepoints onto either space (not of interest), or a lower
         //  case ISO_8859_1 number/letter/accented-letter codepoint for ngram matching
         // Because we map to ISO_8859_1, we can re-use the ngrams from those detectors
-        // To avoid mis-detection, we skip many of the control characters in the 0x00-0x3f range 
+        // To avoid mis-detection, we skip many of the control characters in the 0x00-0x3f range
         protected static byte[] byteMap = {
 /* 0x00-0x07 */ (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00,
 /* 0x08-0x0f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
@@ -1285,69 +1177,106 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
 /* 0xf0-0xf7 */ (byte) '0', (byte) '1', (byte) '2', (byte) '3', (byte) '4', (byte) '5', (byte) '6', (byte) '7',
 /* 0xf8-0xff */ (byte) '8', (byte) '9', (byte) 0x20, (byte) 0xfb, (byte) 0xfc, (byte) 0xf9, (byte) 0xfa, (byte) 0x20,
         };
+        private final int langIndex;
+
+        protected CharsetRecog_EBCDIC_500(int langIndex) {
+            this.langIndex = langIndex;
+        }
+
+        /**
+         * @param lang language to find
+         * @return the index into CharsetRecog_8859_1.ngrams_8859_1 that matches his language;
+         * throws IllegalArgumentException if language can't be found
+         */
+        static int findLangIndex(String lang) {
+            for (int i = 0; i < CharsetRecog_8859_1.ngrams_8859_1.length; i++) {
+                NGramsPlusLang ngpl = CharsetRecog_8859_1.ngrams_8859_1[i];
+                if (ngpl.fLang.equals(lang)) {
+                    return i;
+                }
+            }
+            throw new IllegalArgumentException("can't find language: " + lang);
+        }
 
         public String getName() {
             return "IBM500";
         }
+
+        public CharsetMatch match(CharsetDetector det) {
+            int confidence = match(det, CharsetRecog_8859_1.ngrams_8859_1[getLangIndex()].fNGrams, byteMap);
+            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
+        }
+
+        int getLangIndex() {
+            return langIndex;
+        }
+
+
     }
 
+    //The EBCDIC codes were removed from ICU4js trunk as of at least July 26, 2016.
     static class CharsetRecog_EBCDIC_500_en extends CharsetRecog_EBCDIC_500 {
+
+        CharsetRecog_EBCDIC_500_en() {
+            super(findLangIndex("en"));
+        }
+
         public String getLanguage() {
             return "en";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, CharsetRecog_8859_1_en.ngrams, byteMap);
-        }
+
     }
 
     static class CharsetRecog_EBCDIC_500_de extends CharsetRecog_EBCDIC_500 {
+        CharsetRecog_EBCDIC_500_de() {
+            super(findLangIndex("de"));
+        }
+
         public String getLanguage() {
             return "de";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, CharsetRecog_8859_1_de.ngrams, byteMap);
-        }
     }
 
     static class CharsetRecog_EBCDIC_500_fr extends CharsetRecog_EBCDIC_500 {
-        public String getLanguage() {
-            return "fr";
+        CharsetRecog_EBCDIC_500_fr() {
+            super(findLangIndex("fr"));
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, CharsetRecog_8859_1_fr.ngrams, byteMap);
+        public String getLanguage() {
+            return "fr";
         }
     }
 
     static class CharsetRecog_EBCDIC_500_es extends CharsetRecog_EBCDIC_500 {
-        public String getLanguage() {
-            return "es";
+        CharsetRecog_EBCDIC_500_es() {
+            super(findLangIndex("es"));
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, CharsetRecog_8859_1_es.ngrams, byteMap);
+        public String getLanguage() {
+            return "es";
         }
     }
 
     static class CharsetRecog_EBCDIC_500_it extends CharsetRecog_EBCDIC_500 {
-        public String getLanguage() {
-            return "it";
+        CharsetRecog_EBCDIC_500_it() {
+            super(findLangIndex("it"));
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, CharsetRecog_8859_1_it.ngrams, byteMap);
+        public String getLanguage() {
+            return "it";
         }
     }
 
     static class CharsetRecog_EBCDIC_500_nl extends CharsetRecog_EBCDIC_500 {
+        CharsetRecog_EBCDIC_500_nl() {
+            super(findLangIndex("nl"));
+        }
+
         public String getLanguage() {
             return "nl";
         }
 
-        public int match(CharsetDetector det) {
-            return match(det, CharsetRecog_8859_1_nl.ngrams, byteMap);
-        }
     }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
index e1a0ff0..7834053 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
@@ -1,6 +1,8 @@
+// � 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
 /**
  * ******************************************************************************
- * Copyright (C) 2005, International Business Machines Corporation and         *
+ * Copyright (C) 2005-2012, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  * ******************************************************************************
  */
@@ -9,28 +11,28 @@ package org.apache.tika.parser.txt;
 /**
  * Abstract class for recognizing a single charset.
  * Part of the implementation of ICU's CharsetDetector.
- *
+ * <p>
  * Each specific charset that can be recognized will have an instance
  * of some subclass of this class.  All interaction between the overall
  * CharsetDetector and the stuff specific to an individual charset happens
  * via the interface provided here.
- *
- * Instances of CharsetDetector DO NOT have or maintain 
+ * <p>
+ * Instances of CharsetDetector DO NOT have or maintain
  * state pertaining to a specific match or detect operation.
  * The WILL be shared by multiple instances of CharsetDetector.
  * They encapsulate const charset-specific information.
- *
- * @internal
  */
 abstract class CharsetRecognizer {
     /**
      * Get the IANA name of this charset.
+     *
      * @return the charset name.
      */
     abstract String getName();
 
     /**
      * Get the ISO language code for this charset.
+     *
      * @return the language code, or <code>null</code> if the language cannot be determined.
      */
     public String getLanguage() {
@@ -39,16 +41,13 @@ abstract class CharsetRecognizer {
 
     /**
      * Test the match of this charset with the input text data
-     *      which is obtained via the CharsetDetector object.
+     * which is obtained via the CharsetDetector object.
      *
-     * @param det  The CharsetDetector, which contains the input text
-     *             to be checked for being in this charset.
-     * @return Two values packed into one int  (Damn java, anyhow)
-     *             <br/>
-     *             bits 0-7:  the match confidence, ranging from 0-100
-     *             <br/>
-     *             bits 8-15: The match reason, an enum-like value.
+     * @param det The CharsetDetector, which contains the input text
+     *            to be checked for being in this charset.
+     * @return A CharsetMatch object containing details of match
+     * with this charset, or null if there was no match.
      */
-    abstract int match(CharsetDetector det);
+    abstract CharsetMatch match(CharsetDetector det);
 
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
index ce792dc..58ba1ac 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
@@ -25,6 +25,14 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.utils.CharsetUtils;
 
+/**
+ * Tika wrapper around ICU4J's CharsetDetector.
+ * <p>
+ * NOTE: ICU4J's CharsetDetector and required classes were
+ * copied from ICU4J with two modifications:
+ * Apache Tika added the EBCDIC-500 family of detectors, and
+ * we increased the buffer to 12000 bytes.
+ */
 public class Icu4jEncodingDetector implements EncodingDetector {
 
     public Charset detect(InputStream input, Metadata metadata)