You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:15 UTC
[09/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index 77773e0..f9df9e0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -1,544 +1,544 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-
-
-/**
- * <code>CharsetDetector</code> provides a facility for detecting the
- * charset or encoding of character data in an unknown format.
- * The input data can either be from an input stream or an array of bytes.
- * The result of the detection operation is a list of possibly matching
- * charsets, or, for simple use, you can just ask for a Java Reader that
- * will will work over the input data.
- * <p/>
- * Character set detection is at best an imprecise operation. The detection
- * process will attempt to identify the charset that best matches the characteristics
- * of the byte data, but the process is partly statistical in nature, and
- * the results can not be guaranteed to always be correct.
- * <p/>
- * For best accuracy in charset detection, the input data should be primarily
- * in a single language, and a minimum of a few hundred bytes worth of plain text
- * in the language are needed. The detection process will attempt to
- * ignore html or xml style markup that could otherwise obscure the content.
- * <p/>
- * @stable ICU 3.4
- */
-public class CharsetDetector {
-
-// Question: Should we have getters corresponding to the setters for input text
-// and declared encoding?
-
-// A thought: If we were to create our own type of Java Reader, we could defer
-// figuring out an actual charset for data that starts out with too much English
-// only ASCII until the user actually read through to something that didn't look
-// like 7 bit English. If nothing else ever appeared, we would never need to
-// actually choose the "real" charset. All assuming that the application just
-// wants the data, and doesn't care about a char set name.
-
- private static final int kBufSize = 12000;
- private static final int MAX_CONFIDENCE = 100;
- private static String[] fCharsetNames;
- /*
- * List of recognizers for all charsets known to the implementation.
- */
- private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
- /*
- * The following items are accessed by individual CharsetRecongizers during
- * the recognition process
- *
- */
- byte[] fInputBytes = // The text to be checked. Markup will have been
- new byte[kBufSize]; // removed if appropriate.
- int fInputLen; // Length of the byte data in fInputText.
- short fByteStats[] = // byte frequency statistics for the input text.
- new short[256]; // Value is percent, not absolute.
- boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
- false;
- String fDeclaredEncoding;
- //
- // Stuff private to CharsetDetector
- //
- byte[] fRawInput; // Original, untouched input bytes.
- // If user gave us a byte array, this is it.
- // If user gave us a stream, it's read to a
- // buffer here.
- int fRawLength; // Length of data in fRawInput array.
- InputStream fInputStream; // User's input stream, or null if the user
- boolean fStripTags = // If true, setText() will strip tags from input text.
- false;
-
- /**
- * Constructor
- *
- * @stable ICU 3.4
- */
- public CharsetDetector() {
- }
-
- /**
- * Get the names of all char sets that can be recognized by the char set detector.
- *
- * @return an array of the names of all charsets that can be recognized
- * by the charset detector.
- *
- * @stable ICU 3.4
- */
- public static String[] getAllDetectableCharsets() {
- return fCharsetNames;
- }
-
- /*
- * Create the singleton instances of the CharsetRecognizer classes
- */
- private static ArrayList<CharsetRecognizer> createRecognizers() {
- ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-
- recognizers.add(new CharsetRecog_UTF8());
-
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
- // Create an array of all charset names, as a side effect.
- // Needed for the getAllDetectableCharsets() API.
- String[] charsetNames = new String[recognizers.size()];
- int out = 0;
-
- for (CharsetRecognizer recognizer : recognizers) {
- String name = recognizer.getName();
-
- if (out == 0 || !name.equals(charsetNames[out - 1])) {
- charsetNames[out++] = name;
- }
- }
-
- fCharsetNames = new String[out];
- System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-
- return recognizers;
- }
-
- /**
- * Set the declared encoding for charset detection.
- * The declared encoding of an input text is an encoding obtained
- * from an http header or xml declaration or similar source that
- * can be provided as additional information to the charset detector.
- * A match between a declared encoding and a possible detected encoding
- * will raise the quality of that detected encoding by a small delta,
- * and will also appear as a "reason" for the match.
- * <p/>
- * A declared encoding that is incompatible with the input data being
- * analyzed will not be added to the list of possible encodings.
- *
- * @param encoding The declared encoding
- *
- * @stable ICU 3.4
- */
- public CharsetDetector setDeclaredEncoding(String encoding) {
- setCanonicalDeclaredEncoding(encoding);
- return this;
- }
-
- /**
- * Set the input text (byte) data whose charset is to be detected.
- *
- * @param in the input text of unknown encoding
- *
- * @return This CharsetDetector
- *
- * @stable ICU 3.4
- */
- public CharsetDetector setText(byte[] in) {
- fRawInput = in;
- fRawLength = in.length;
-
- MungeInput();
-
- return this;
- }
- // Value is rounded up, so zero really means zero occurences.
-
- /**
- * Set the input text (byte) data whose charset is to be detected.
- * <p/>
- * The input stream that supplies the character data must have markSupported()
- * == true; the charset detection process will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
- *
- * @param in the input text of unknown encoding
- *
- * @return This CharsetDetector
- *
- * @stable ICU 3.4
- */
-
- public CharsetDetector setText(InputStream in) throws IOException {
- fInputStream = in;
- fInputStream.mark(kBufSize);
- fRawInput = new byte[kBufSize]; // Always make a new buffer because the
- // previous one may have come from the caller,
- // in which case we can't touch it.
- fRawLength = 0;
- int remainingLength = kBufSize;
- while (remainingLength > 0) {
- // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
- int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
- if (bytesRead <= 0) {
- break;
- }
- fRawLength += bytesRead;
- remainingLength -= bytesRead;
- }
- fInputStream.reset();
-
- MungeInput(); // Strip html markup, collect byte stats.
- return this;
- }
-
- /**
- * Return the charset that best matches the supplied input data.
- *
- * Note though, that because the detection
- * only looks at the start of the input data,
- * there is a possibility that the returned charset will fail to handle
- * the full set of input data.
- * <p/>
- * Raise an exception if
- * <ul>
- * <li>no charset appears to match the data.</li>
- * <li>no input text has been provided</li>
- * </ul>
- *
- * @return a CharsetMatch object representing the best matching charset, or
- * <code>null</code> if there are no matches.
- *
- * @stable ICU 3.4
- */
- public CharsetMatch detect() {
-// TODO: A better implementation would be to copy the detect loop from
-// detectAll(), and cut it short as soon as a match with a high confidence
-// is found. This is something to be done later, after things are otherwise
-// working.
- CharsetMatch matches[] = detectAll();
-
- if (matches == null || matches.length == 0) {
- return null;
- }
-
- return matches[0];
- }
-
- /**
- * Return an array of all charsets that appear to be plausible
- * matches with the input data. The array is ordered with the
- * best quality match first.
- * <p/>
- * Raise an exception if
- * <ul>
- * <li>no charsets appear to match the input data.</li>
- * <li>no input text has been provided</li>
- * </ul>
- *
- * @return An array of CharsetMatch objects representing possibly matching charsets.
- *
- * @stable ICU 3.4
- */
- public CharsetMatch[] detectAll() {
- CharsetRecognizer csr;
- int i;
- int detectResults;
- int confidence;
- ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
-
- // Iterate over all possible charsets, remember all that
- // give a match quality > 0.
- for (i = 0; i < fCSRecognizers.size(); i++) {
- csr = fCSRecognizers.get(i);
- detectResults = csr.match(this);
- confidence = detectResults & 0x000000ff;
- if (confidence > 0) {
- // Just to be safe, constrain
- confidence = Math.min(confidence, MAX_CONFIDENCE);
-
- // Apply charset hint.
- if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
- // Reduce lack of confidence (delta between "sure" and current) by 50%.
- confidence += (MAX_CONFIDENCE - confidence) / 2;
- }
-
- CharsetMatch m = new CharsetMatch(this, csr, confidence);
- matches.add(m);
- }
- }
-
- Collections.sort(matches); // CharsetMatch compares on confidence
- Collections.reverse(matches); // Put best match first.
- CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
- resultArray = matches.toArray(resultArray);
- return resultArray;
- }
-
- /**
- * Autodetect the charset of an inputStream, and return a Java Reader
- * to access the converted input data.
- * <p/>
- * This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
- * <p/>
- * For the input stream that supplies the character data, markSupported()
- * must be true; the charset detection will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
- *<p/>
- * Raise an exception if no charsets appear to match the input data.
- *
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
- * @stable ICU 3.4
- */
- public Reader getReader(InputStream in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
-
- try {
- setText(in);
-
- CharsetMatch match = detect();
-
- if (match == null) {
- return null;
- }
-
- return match.getReader();
- } catch (IOException e) {
- return null;
- }
- }
-
- /**
- * Autodetect the charset of an inputStream, and return a String
- * containing the converted input data.
- * <p/>
- * This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
- *<p/>
- * Raise an exception if no charsets appear to match the input data.
- *
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
- * @stable ICU 3.4
- */
- public String getString(byte[] in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
-
- try {
- setText(in);
-
- CharsetMatch match = detect();
-
- if (match == null) {
- return null;
- }
-
- return match.getString(-1);
- } catch (IOException e) {
- return null;
- }
- }
- // gave us a byte array.
-
- /**
- * Test whether or not input filtering is enabled.
- *
- * @return <code>true</code> if input text will be filtered.
- *
- * @see #enableInputFilter
- *
- * @stable ICU 3.4
- */
- public boolean inputFilterEnabled() {
- return fStripTags;
- }
-
- /**
- * Enable filtering of input text. If filtering is enabled,
- * text within angle brackets ("<" and ">") will be removed
- * before detection.
- *
- * @param filter <code>true</code> to enable input text filtering.
- *
- * @return The previous setting.
- *
- * @stable ICU 3.4
- */
- public boolean enableInputFilter(boolean filter) {
- boolean previous = fStripTags;
-
- fStripTags = filter;
-
- return previous;
- }
-
- /**
- * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
- *
- * @param encoding - name of character encoding
- */
- private void setCanonicalDeclaredEncoding(String encoding) {
- if ((encoding == null) || encoding.isEmpty()) {
- return;
- }
-
- Charset cs = Charset.forName(encoding);
- if (cs != null) {
- fDeclaredEncoding = cs.name();
- }
- }
-
- /*
- * MungeInput - after getting a set of raw input data to be analyzed, preprocess
- * it by removing what appears to be html markup.
- */
- private void MungeInput() {
- int srci = 0;
- int dsti = 0;
- byte b;
- boolean inMarkup = false;
- int openTags = 0;
- int badTags = 0;
-
- //
- // html / xml markup stripping.
- // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
- // discard everything within < brackets >
- // Count how many total '<' and illegal (nested) '<' occur, so we can make some
- // guess as to whether the input was actually marked up at all.
- if (fStripTags) {
- for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
- b = fRawInput[srci];
- if (b == (byte) '<') {
- if (inMarkup) {
- badTags++;
- }
- inMarkup = true;
- openTags++;
- }
-
- if (!inMarkup) {
- fInputBytes[dsti++] = b;
- }
-
- if (b == (byte) '>') {
- inMarkup = false;
- }
- }
-
- fInputLen = dsti;
- }
-
- //
- // If it looks like this input wasn't marked up, or if it looks like it's
- // essentially nothing but markup abandon the markup stripping.
- // Detection will have to work on the unstripped input.
- //
- if (openTags < 5 || openTags / 5 < badTags ||
- (fInputLen < 100 && fRawLength > 600)) {
- int limit = fRawLength;
-
- if (limit > kBufSize) {
- limit = kBufSize;
- }
-
- for (srci = 0; srci < limit; srci++) {
- fInputBytes[srci] = fRawInput[srci];
- }
- fInputLen = srci;
- }
-
- //
- // Tally up the byte occurence statistics.
- // These are available for use by the various detectors.
- //
- Arrays.fill(fByteStats, (short) 0);
- for (srci = 0; srci < fInputLen; srci++) {
- int val = fInputBytes[srci] & 0x00ff;
- fByteStats[val]++;
- }
-
- fC1Bytes = false;
- for (int i = 0x80; i <= 0x9F; i += 1) {
- if (fByteStats[i] != 0) {
- fC1Bytes = true;
- break;
- }
- }
- }
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2009, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+
+/**
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation. The detection
+ * process will attempt to identify the charset that best matches the characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain text
+ * in the language are needed. The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * @stable ICU 3.4
+ */
+public class CharsetDetector {
+
+// Question: Should we have getters corresponding to the setters for input text
+// and declared encoding?
+
+// A thought: If we were to create our own type of Java Reader, we could defer
+// figuring out an actual charset for data that starts out with too much English
+// only ASCII until the user actually read through to something that didn't look
+// like 7 bit English. If nothing else ever appeared, we would never need to
+// actually choose the "real" charset. All assuming that the application just
+// wants the data, and doesn't care about a char set name.
+
+ private static final int kBufSize = 12000;
+ private static final int MAX_CONFIDENCE = 100;
+ private static String[] fCharsetNames;
+ /*
+ * List of recognizers for all charsets known to the implementation.
+ */
+ private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+ /*
+ * The following items are accessed by individual CharsetRecongizers during
+ * the recognition process
+ *
+ */
+ byte[] fInputBytes = // The text to be checked. Markup will have been
+ new byte[kBufSize]; // removed if appropriate.
+ int fInputLen; // Length of the byte data in fInputText.
+ short fByteStats[] = // byte frequency statistics for the input text.
+ new short[256]; // Value is percent, not absolute.
+ boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
+ false;
+ String fDeclaredEncoding;
+ //
+ // Stuff private to CharsetDetector
+ //
+ byte[] fRawInput; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ // If user gave us a stream, it's read to a
+ // buffer here.
+ int fRawLength; // Length of data in fRawInput array.
+ InputStream fInputStream; // User's input stream, or null if the user
+ boolean fStripTags = // If true, setText() will strip tags from input text.
+ false;
+
+ /**
+ * Constructor
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector() {
+ }
+
+ /**
+ * Get the names of all char sets that can be recognized by the char set detector.
+ *
+ * @return an array of the names of all charsets that can be recognized
+ * by the charset detector.
+ *
+ * @stable ICU 3.4
+ */
+ public static String[] getAllDetectableCharsets() {
+ return fCharsetNames;
+ }
+
+ /*
+ * Create the singleton instances of the CharsetRecognizer classes
+ */
+ private static ArrayList<CharsetRecognizer> createRecognizers() {
+ ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
+
+ recognizers.add(new CharsetRecog_UTF8());
+
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
+
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
+
+ // Create an array of all charset names, as a side effect.
+ // Needed for the getAllDetectableCharsets() API.
+ String[] charsetNames = new String[recognizers.size()];
+ int out = 0;
+
+ for (CharsetRecognizer recognizer : recognizers) {
+ String name = recognizer.getName();
+
+ if (out == 0 || !name.equals(charsetNames[out - 1])) {
+ charsetNames[out++] = name;
+ }
+ }
+
+ fCharsetNames = new String[out];
+ System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
+
+ return recognizers;
+ }
+
+ /**
+ * Set the declared encoding for charset detection.
+ * The declared encoding of an input text is an encoding obtained
+ * from an http header or xml declaration or similar source that
+ * can be provided as additional information to the charset detector.
+ * A match between a declared encoding and a possible detected encoding
+ * will raise the quality of that detected encoding by a small delta,
+ * and will also appear as a "reason" for the match.
+ * <p/>
+ * A declared encoding that is incompatible with the input data being
+ * analyzed will not be added to the list of possible encodings.
+ *
+ * @param encoding The declared encoding
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector setDeclaredEncoding(String encoding) {
+ setCanonicalDeclaredEncoding(encoding);
+ return this;
+ }
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector setText(byte[] in) {
+ fRawInput = in;
+ fRawLength = in.length;
+
+ MungeInput();
+
+ return this;
+ }
+ // Value is rounded up, so zero really means zero occurences.
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ * <p/>
+ * The input stream that supplies the character data must have markSupported()
+ * == true; the charset detection process will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ *
+ * @stable ICU 3.4
+ */
+
+ public CharsetDetector setText(InputStream in) throws IOException {
+ fInputStream = in;
+ fInputStream.mark(kBufSize);
+ fRawInput = new byte[kBufSize]; // Always make a new buffer because the
+ // previous one may have come from the caller,
+ // in which case we can't touch it.
+ fRawLength = 0;
+ int remainingLength = kBufSize;
+ while (remainingLength > 0) {
+ // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
+ int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
+ if (bytesRead <= 0) {
+ break;
+ }
+ fRawLength += bytesRead;
+ remainingLength -= bytesRead;
+ }
+ fInputStream.reset();
+
+ MungeInput(); // Strip html markup, collect byte stats.
+ return this;
+ }
+
+ /**
+ * Return the charset that best matches the supplied input data.
+ *
+ * Note though, that because the detection
+ * only looks at the start of the input data,
+ * there is a possibility that the returned charset will fail to handle
+ * the full set of input data.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charset appears to match the data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return a CharsetMatch object representing the best matching charset, or
+ * <code>null</code> if there are no matches.
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetMatch detect() {
+// TODO: A better implementation would be to copy the detect loop from
+// detectAll(), and cut it short as soon as a match with a high confidence
+// is found. This is something to be done later, after things are otherwise
+// working.
+ CharsetMatch matches[] = detectAll();
+
+ if (matches == null || matches.length == 0) {
+ return null;
+ }
+
+ return matches[0];
+ }
+
+ /**
+ * Return an array of all charsets that appear to be plausible
+ * matches with the input data. The array is ordered with the
+ * best quality match first.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charsets appear to match the input data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return An array of CharsetMatch objects representing possibly matching charsets.
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetMatch[] detectAll() {
+ CharsetRecognizer csr;
+ int i;
+ int detectResults;
+ int confidence;
+ ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
+
+ // Iterate over all possible charsets, remember all that
+ // give a match quality > 0.
+ for (i = 0; i < fCSRecognizers.size(); i++) {
+ csr = fCSRecognizers.get(i);
+ detectResults = csr.match(this);
+ confidence = detectResults & 0x000000ff;
+ if (confidence > 0) {
+ // Just to be safe, constrain
+ confidence = Math.min(confidence, MAX_CONFIDENCE);
+
+ // Apply charset hint.
+ if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
+ // Reduce lack of confidence (delta between "sure" and current) by 50%.
+ confidence += (MAX_CONFIDENCE - confidence) / 2;
+ }
+
+ CharsetMatch m = new CharsetMatch(this, csr, confidence);
+ matches.add(m);
+ }
+ }
+
+ Collections.sort(matches); // CharsetMatch compares on confidence
+ Collections.reverse(matches); // Put best match first.
+ CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
+ resultArray = matches.toArray(resultArray);
+ return resultArray;
+ }
+
+ /**
+ * Autodetect the charset of an inputStream, and return a Java Reader
+ * to access the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+ * <p/>
+ * For the input stream that supplies the character data, markSupported()
+ * must be true; the charset detection will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ *
+ * @stable ICU 3.4
+ */
+ public Reader getReader(InputStream in, String declaredEncoding) {
+ setCanonicalDeclaredEncoding(declaredEncoding);
+
+ try {
+ setText(in);
+
+ CharsetMatch match = detect();
+
+ if (match == null) {
+ return null;
+ }
+
+ return match.getReader();
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Autodetect the charset of an inputStream, and return a String
+ * containing the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString(byte[] in, String declaredEncoding) {
+ setCanonicalDeclaredEncoding(declaredEncoding);
+
+ try {
+ setText(in);
+
+ CharsetMatch match = detect();
+
+ if (match == null) {
+ return null;
+ }
+
+ return match.getString(-1);
+ } catch (IOException e) {
+ return null;
+ }
+ }
+ // gave us a byte array.
+
+ /**
+ * Test whether or not input filtering is enabled.
+ *
+ * @return <code>true</code> if input text will be filtered.
+ *
+ * @see #enableInputFilter
+ *
+ * @stable ICU 3.4
+ */
+ public boolean inputFilterEnabled() {
+ return fStripTags;
+ }
+
+ /**
+ * Enable filtering of input text. If filtering is enabled,
+ * text within angle brackets ("<" and ">") will be removed
+ * before detection.
+ *
+ * @param filter <code>true</code> to enable input text filtering.
+ *
+ * @return The previous setting.
+ *
+ * @stable ICU 3.4
+ */
+ public boolean enableInputFilter(boolean filter) {
+ boolean previous = fStripTags;
+
+ fStripTags = filter;
+
+ return previous;
+ }
+
+ /**
+ * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
+ *
+ * @param encoding - name of character encoding
+ */
+ private void setCanonicalDeclaredEncoding(String encoding) {
+ if ((encoding == null) || encoding.isEmpty()) {
+ return;
+ }
+
+ Charset cs = Charset.forName(encoding);
+ if (cs != null) {
+ fDeclaredEncoding = cs.name();
+ }
+ }
+
    /*
     * MungeInput - after getting a set of raw input data to be analyzed, preprocess
     * it by removing what appears to be html markup. Fills fInputBytes/fInputLen
     * from fRawInput/fRawLength, then tallies per-byte statistics into fByteStats
     * and records whether any C1 control bytes are present in fC1Bytes.
     */
    private void MungeInput() {
        int srci = 0;
        int dsti = 0;
        byte b;
        boolean inMarkup = false;
        int openTags = 0;       // total '<' seen
        int badTags = 0;        // '<' seen while already inside a tag (illegal nesting)

        //
        // html / xml markup stripping.
        //  quick and dirty, not 100% accurate, but hopefully good enough, statistically.
        //  discard everything within < brackets >
        //  Count how many total '<' and illegal (nested) '<' occur, so we can make some
        //  guess as to whether the input was actually marked up at all.
        if (fStripTags) {
            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
                b = fRawInput[srci];
                if (b == (byte) '<') {
                    if (inMarkup) {
                        // A '<' inside an unclosed tag: evidence the input may not
                        // really be markup.
                        badTags++;
                    }
                    inMarkup = true;
                    openTags++;
                }

                if (!inMarkup) {
                    // Only bytes outside of tags reach the working buffer.
                    fInputBytes[dsti++] = b;
                }

                if (b == (byte) '>') {
                    inMarkup = false;
                }
            }

            fInputLen = dsti;
        }

        //
        // If it looks like this input wasn't marked up, or if it looks like it's
        //   essentially nothing but markup, abandon the markup stripping.
        //   Detection will have to work on the unstripped input.
        // (When fStripTags is false, openTags stays 0 and this branch always
        //  copies the raw input, capped at the working buffer size.)
        //
        if (openTags < 5 || openTags / 5 < badTags ||
                (fInputLen < 100 && fRawLength > 600)) {
            int limit = fRawLength;

            if (limit > kBufSize) {
                limit = kBufSize;
            }

            for (srci = 0; srci < limit; srci++) {
                fInputBytes[srci] = fRawInput[srci];
            }
            fInputLen = srci;
        }

        //
        // Tally up the byte occurrence statistics.
        //   These are available for use by the various detectors.
        //
        Arrays.fill(fByteStats, (short) 0);
        for (srci = 0; srci < fInputLen; srci++) {
            int val = fInputBytes[srci] & 0x00ff;
            fByteStats[val]++;
        }

        // Note whether any bytes fall in the C1 control range (0x80-0x9F);
        // available to the individual charset recognizers.
        fC1Bytes = false;
        for (int i = 0x80; i <= 0x9F; i += 1) {
            if (fByteStats[i] != 0) {
                fC1Bytes = true;
                break;
            }
        }
    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 9244cd9..22219ab 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -1,286 +1,286 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-
-
-/**
- * This class represents a charset that has been identified by a CharsetDetector
- * as a possible encoding for a set of input data. From an instance of this
- * class, you can ask for a confidence level in the charset identification,
- * or for Java Reader or String to access the original byte data in Unicode form.
- * <p/>
- * Instances of this class are created only by CharsetDetectors.
- * <p/>
- * Note: this class has a natural ordering that is inconsistent with equals.
- * The natural ordering is based on the match confidence value.
- *
- * @stable ICU 3.4
- */
-public class CharsetMatch implements Comparable<CharsetMatch> {
-
-
- /**
- * Bit flag indicating the match is based on the the encoding scheme.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int ENCODING_SCHEME = 1;
- /**
- * Bit flag indicating the match is based on the presence of a BOM.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int BOM = 2;
- /**
- * Bit flag indicating he match is based on the declared encoding.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int DECLARED_ENCODING = 4;
- /**
- * Bit flag indicating the match is based on language statistics.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int LANG_STATISTICS = 8;
- //
- // Private Data
- //
- private int fConfidence;
- private CharsetRecognizer fRecognizer;
- private byte[] fRawInput = null; // Original, untouched input bytes.
- // If user gave us a byte array, this is it.
- private int fRawLength; // Length of data in fRawInput array.
- private InputStream fInputStream = null; // User's input stream, or null if the user
-
- /*
- * Constructor. Implementation internal
- */
- CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
- fRecognizer = rec;
- fConfidence = conf;
-
- // The references to the original aplication input data must be copied out
- // of the charset recognizer to here, in case the application resets the
- // recognizer before using this CharsetMatch.
- if (det.fInputStream == null) {
- // We only want the existing input byte data if it came straight from the user,
- // not if is just the head of a stream.
- fRawInput = det.fRawInput;
- fRawLength = det.fRawLength;
- }
- fInputStream = det.fInputStream;
- }
-
- /**
- * Create a java.io.Reader for reading the Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- * <p/>
- * CAUTION: if the source of the byte data was an InputStream, a Reader
- * can be created for only one matching char set using this method. If more
- * than one charset needs to be tried, the caller will need to reset
- * the InputStream and create InputStreamReaders itself, based on the charset name.
- *
- * @return the Reader for the Unicode character data.
- *
- * @stable ICU 3.4
- */
- public Reader getReader() {
- InputStream inputStream = fInputStream;
-
- if (inputStream == null) {
- inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
- }
-
- try {
- inputStream.reset();
- return new InputStreamReader(inputStream, getName());
- } catch (IOException e) {
- return null;
- }
- }
-
- /**
- * Create a Java String from Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- *
- * @return a String created from the converted input data.
- *
- * @stable ICU 3.4
- */
- public String getString() throws java.io.IOException {
- return getString(-1);
-
- }
-
- /**
- * Create a Java String from Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- * The length of the returned string is limited to the specified size;
- * the string will be trunctated to this length if necessary. A limit value of
- * zero or less is ignored, and treated as no limit.
- *
- * @param maxLength The maximium length of the String to be created when the
- * source of the data is an input stream, or -1 for
- * unlimited length.
- * @return a String created from the converted input data.
- *
- * @stable ICU 3.4
- */
- public String getString(int maxLength) throws java.io.IOException {
- String result = null;
- if (fInputStream != null) {
- StringBuffer sb = new StringBuffer();
- char[] buffer = new char[1024];
- Reader reader = getReader();
- int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
- int bytesRead = 0;
-
- while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
- sb.append(buffer, 0, bytesRead);
- max -= bytesRead;
- }
-
- reader.close();
-
- return sb.toString();
- } else {
- result = new String(fRawInput, getName());
- }
- return result;
-
- }
-
- /**
- * Get an indication of the confidence in the charset detected.
- * Confidence values range from 0-100, with larger numbers indicating
- * a better match of the input data to the characteristics of the
- * charset.
- *
- * @return the confidence in the charset match
- *
- * @stable ICU 3.4
- */
- public int getConfidence() {
- return fConfidence;
- }
-
- /**
- * Return flags indicating what it was about the input data
- * that caused this charset to be considered as a possible match.
- * The result is a bitfield containing zero or more of the flags
- * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
- * A result of zero means no information is available.
- * <p>
- * Note: currently, this method always returns zero.
- * <p>
- *
- * @return the type of match found for this charset.
- *
- * @draft ICU 3.4
- * @provisional This API might change or be removed in a future release.
- */
- public int getMatchType() {
-// TODO: create a list of enum-like constants for common combinations of types of matches.
- return 0;
- }
-
- /**
- * Get the name of the detected charset.
- * The name will be one that can be used with other APIs on the
- * platform that accept charset names. It is the "Canonical name"
- * as defined by the class java.nio.charset.Charset; for
- * charsets that are registered with the IANA charset registry,
- * this is the MIME-preferred registerd name.
- *
- * @see java.nio.charset.Charset
- * @see java.io.InputStreamReader
- *
- * @return The name of the charset.
- *
- * @stable ICU 3.4
- */
- public String getName() {
- return fRecognizer.getName();
- }
-
- /**
- * Get the ISO code for the language of the detected charset.
- *
- * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
- *
- * @stable ICU 3.4
- */
- public String getLanguage() {
- return fRecognizer.getLanguage();
- }
-
- /**
- * Compare to other CharsetMatch objects.
- * Comparison is based on the match confidence value, which
- * allows CharsetDetector.detectAll() to order its results.
- *
- * @param o the CharsetMatch object to compare against.
- * @return a negative integer, zero, or a positive integer as the
- * confidence level of this CharsetMatch
- * is less than, equal to, or greater than that of
- * the argument.
- * @throws ClassCastException if the argument is not a CharsetMatch.
- * @stable ICU 3.4
- */
- public int compareTo(CharsetMatch other) {
- int compareResult = 0;
- if (this.fConfidence > other.fConfidence) {
- compareResult = 1;
- } else if (this.fConfidence < other.fConfidence) {
- compareResult = -1;
- }
- return compareResult;
- }
-
- /**
- * compare this CharsetMatch to another based on confidence value
- * @param o the CharsetMatch object to compare against
- * @return true if equal
- */
- public boolean equals(Object o) {
- if (o instanceof CharsetMatch) {
- CharsetMatch that = (CharsetMatch) o;
- return (this.fConfidence == that.fConfidence);
- }
-
- return false;
- }
-
- /**
- * generates a hashCode based on the confidence value
- * @return the hashCode
- */
- public int hashCode() {
- return fConfidence;
- }
- // gave us a byte array.
-
- public String toString() {
- String s = "Match of " + fRecognizer.getName();
- if (fRecognizer.getLanguage() != null) {
- s += " in " + fRecognizer.getLanguage();
- }
- s += " with confidence " + fConfidence;
- return s;
- }
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2007, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+
+/**
+ * This class represents a charset that has been identified by a CharsetDetector
+ * as a possible encoding for a set of input data. From an instance of this
+ * class, you can ask for a confidence level in the charset identification,
+ * or for Java Reader or String to access the original byte data in Unicode form.
+ * <p/>
+ * Instances of this class are created only by CharsetDetectors.
+ * <p/>
+ * Note: this class has a natural ordering that is inconsistent with equals.
+ * The natural ordering is based on the match confidence value.
+ *
+ * @stable ICU 3.4
+ */
+public class CharsetMatch implements Comparable<CharsetMatch> {
+
+
+ /**
+ * Bit flag indicating the match is based on the the encoding scheme.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int ENCODING_SCHEME = 1;
+ /**
+ * Bit flag indicating the match is based on the presence of a BOM.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int BOM = 2;
+ /**
+ * Bit flag indicating he match is based on the declared encoding.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int DECLARED_ENCODING = 4;
+ /**
+ * Bit flag indicating the match is based on language statistics.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int LANG_STATISTICS = 8;
+ //
+ // Private Data
+ //
+ private int fConfidence;
+ private CharsetRecognizer fRecognizer;
+ private byte[] fRawInput = null; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ private int fRawLength; // Length of data in fRawInput array.
+ private InputStream fInputStream = null; // User's input stream, or null if the user
+
+ /*
+ * Constructor. Implementation internal
+ */
+ CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
+ fRecognizer = rec;
+ fConfidence = conf;
+
+ // The references to the original aplication input data must be copied out
+ // of the charset recognizer to here, in case the application resets the
+ // recognizer before using this CharsetMatch.
+ if (det.fInputStream == null) {
+ // We only want the existing input byte data if it came straight from the user,
+ // not if is just the head of a stream.
+ fRawInput = det.fRawInput;
+ fRawLength = det.fRawLength;
+ }
+ fInputStream = det.fInputStream;
+ }
+
+ /**
+ * Create a java.io.Reader for reading the Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ * <p/>
+ * CAUTION: if the source of the byte data was an InputStream, a Reader
+ * can be created for only one matching char set using this method. If more
+ * than one charset needs to be tried, the caller will need to reset
+ * the InputStream and create InputStreamReaders itself, based on the charset name.
+ *
+ * @return the Reader for the Unicode character data.
+ *
+ * @stable ICU 3.4
+ */
+ public Reader getReader() {
+ InputStream inputStream = fInputStream;
+
+ if (inputStream == null) {
+ inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
+ }
+
+ try {
+ inputStream.reset();
+ return new InputStreamReader(inputStream, getName());
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Create a Java String from Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ *
+ * @return a String created from the converted input data.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString() throws java.io.IOException {
+ return getString(-1);
+
+ }
+
+ /**
+ * Create a Java String from Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ * The length of the returned string is limited to the specified size;
+ * the string will be trunctated to this length if necessary. A limit value of
+ * zero or less is ignored, and treated as no limit.
+ *
+ * @param maxLength The maximium length of the String to be created when the
+ * source of the data is an input stream, or -1 for
+ * unlimited length.
+ * @return a String created from the converted input data.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString(int maxLength) throws java.io.IOException {
+ String result = null;
+ if (fInputStream != null) {
+ StringBuffer sb = new StringBuffer();
+ char[] buffer = new char[1024];
+ Reader reader = getReader();
+ int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
+ int bytesRead = 0;
+
+ while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
+ sb.append(buffer, 0, bytesRead);
+ max -= bytesRead;
+ }
+
+ reader.close();
+
+ return sb.toString();
+ } else {
+ result = new String(fRawInput, getName());
+ }
+ return result;
+
+ }
+
+ /**
+ * Get an indication of the confidence in the charset detected.
+ * Confidence values range from 0-100, with larger numbers indicating
+ * a better match of the input data to the characteristics of the
+ * charset.
+ *
+ * @return the confidence in the charset match
+ *
+ * @stable ICU 3.4
+ */
+ public int getConfidence() {
+ return fConfidence;
+ }
+
+ /**
+ * Return flags indicating what it was about the input data
+ * that caused this charset to be considered as a possible match.
+ * The result is a bitfield containing zero or more of the flags
+ * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
+ * A result of zero means no information is available.
+ * <p>
+ * Note: currently, this method always returns zero.
+ * <p>
+ *
+ * @return the type of match found for this charset.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int getMatchType() {
+// TODO: create a list of enum-like constants for common combinations of types of matches.
+ return 0;
+ }
+
+ /**
+ * Get the name of the detected charset.
+ * The name will be one that can be used with other APIs on the
+ * platform that accept charset names. It is the "Canonical name"
+ * as defined by the class java.nio.charset.Charset; for
+ * charsets that are registered with the IANA charset registry,
+ * this is the MIME-preferred registerd name.
+ *
+ * @see java.nio.charset.Charset
+ * @see java.io.InputStreamReader
+ *
+ * @return The name of the charset.
+ *
+ * @stable ICU 3.4
+ */
+ public String getName() {
+ return fRecognizer.getName();
+ }
+
+ /**
+ * Get the ISO code for the language of the detected charset.
+ *
+ * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
+ *
+ * @stable ICU 3.4
+ */
+ public String getLanguage() {
+ return fRecognizer.getLanguage();
+ }
+
+ /**
+ * Compare to other CharsetMatch objects.
+ * Comparison is based on the match confidence value, which
+ * allows CharsetDetector.detectAll() to order its results.
+ *
+ * @param o the CharsetMatch object to compare against.
+ * @return a negative integer, zero, or a positive integer as the
+ * confidence level of this CharsetMatch
+ * is less than, equal to, or greater than that of
+ * the argument.
+ * @throws ClassCastException if the argument is not a CharsetMatch.
+ * @stable ICU 3.4
+ */
+ public int compareTo(CharsetMatch other) {
+ int compareResult = 0;
+ if (this.fConfidence > other.fConfidence) {
+ compareResult = 1;
+ } else if (this.fConfidence < other.fConfidence) {
+ compareResult = -1;
+ }
+ return compareResult;
+ }
+
+ /**
+ * compare this CharsetMatch to another based on confidence value
+ * @param o the CharsetMatch object to compare against
+ * @return true if equal
+ */
+ public boolean equals(Object o) {
+ if (o instanceof CharsetMatch) {
+ CharsetMatch that = (CharsetMatch) o;
+ return (this.fConfidence == that.fConfidence);
+ }
+
+ return false;
+ }
+
+ /**
+ * generates a hashCode based on the confidence value
+ * @return the hashCode
+ */
+ public int hashCode() {
+ return fConfidence;
+ }
+ // gave us a byte array.
+
+ public String toString() {
+ String s = "Match of " + fRecognizer.getName();
+ if (fRecognizer.getLanguage() != null) {
+ s += " in " + fRecognizer.getLanguage();
+ }
+ s += " with confidence " + fConfidence;
+ return s;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
index 16835d6..129c9a8 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
@@ -1,163 +1,163 @@
-/*
-*******************************************************************************
-* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
-* others. All Rights Reserved. *
-*******************************************************************************
-*/
-package org.apache.tika.parser.txt;
-
-/**
- * class CharsetRecog_2022 part of the ICU charset detection imlementation.
- * This is a superclass for the individual detectors for
- * each of the detectable members of the ISO 2022 family
- * of encodings.
- * <p/>
- * The separate classes are nested within this class.
- *
- * @internal
- */
-abstract class CharsetRecog_2022 extends CharsetRecognizer {
-
-
- /**
- * Matching function shared among the 2022 detectors JP, CN and KR
- * Counts up the number of legal an unrecognized escape sequences in
- * the sample of text, and computes a score based on the total number &
- * the proportion that fit the encoding.
- *
- * @param text the byte buffer containing text to analyse
- * @param textLen the size of the text in the byte.
- * @param escapeSequences the byte escape sequences to test for.
- * @return match quality, in the range of 0-100.
- */
- int match(byte[] text, int textLen, byte[][] escapeSequences) {
- int i, j;
- int escN;
- int hits = 0;
- int misses = 0;
- int shifts = 0;
- int quality;
- scanInput:
- for (i = 0; i < textLen; i++) {
- if (text[i] == 0x1b) {
- checkEscapes:
- for (escN = 0; escN < escapeSequences.length; escN++) {
- byte[] seq = escapeSequences[escN];
-
- if ((textLen - i) < seq.length) {
- continue checkEscapes;
- }
-
- for (j = 1; j < seq.length; j++) {
- if (seq[j] != text[i + j]) {
- continue checkEscapes;
- }
- }
-
- hits++;
- i += seq.length - 1;
- continue scanInput;
- }
-
- misses++;
- }
-
- if (text[i] == 0x0e || text[i] == 0x0f) {
- // Shift in/out
- shifts++;
- }
- }
-
- if (hits == 0) {
- return 0;
- }
-
- //
- // Initial quality is based on relative proportion of recongized vs.
- // unrecognized escape sequences.
- // All good: quality = 100;
- // half or less good: quality = 0;
- // linear inbetween.
- quality = (100 * hits - 100 * misses) / (hits + misses);
-
- // Back off quality if there were too few escape sequences seen.
- // Include shifts in this computation, so that KR does not get penalized
- // for having only a single Escape sequence, but many shifts.
- if (hits + shifts < 5) {
- quality -= (5 - (hits + shifts)) * 10;
- }
-
- if (quality < 0) {
- quality = 0;
- }
- return quality;
- }
-
-
- static class CharsetRecog_2022JP extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
- {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
- {0x1b, 0x24, 0x40}, // JIS C 6226-1978
- {0x1b, 0x24, 0x41}, // GB 2312-80
- {0x1b, 0x24, 0x42}, // JIS X 208-1983
- {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
- {0x1b, 0x28, 0x42}, // ASCII
- {0x1b, 0x28, 0x48}, // JIS-Roman
- {0x1b, 0x28, 0x49}, // Half-width katakana
- {0x1b, 0x28, 0x4a}, // JIS-Roman
- {0x1b, 0x2e, 0x41}, // ISO 8859-1
- {0x1b, 0x2e, 0x46} // ISO 8859-7
- };
-
- String getName() {
- return "ISO-2022-JP";
- }
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
- }
-
- static class CharsetRecog_2022KR extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x29, 0x43}
- };
-
- String getName() {
- return "ISO-2022-KR";
- }
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
-
- }
-
- static class CharsetRecog_2022CN extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
- {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
- {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
- {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
- {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
- {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
- {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
- {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
- {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
- {0x1b, 0x4e}, // SS2
- {0x1b, 0x4f}, // SS3
- };
-
- String getName() {
- return "ISO-2022-CN";
- }
-
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
- }
-
-}
-
+/*
+*******************************************************************************
+* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*/
+package org.apache.tika.parser.txt;
+
+/**
+ * class CharsetRecog_2022 part of the ICU charset detection imlementation.
+ * This is a superclass for the individual detectors for
+ * each of the detectable members of the ISO 2022 family
+ * of encodings.
+ * <p/>
+ * The separate classes are nested within this class.
+ *
+ * @internal
+ */
+abstract class CharsetRecog_2022 extends CharsetRecognizer {
+
+
+ /**
+ * Matching function shared among the 2022 detectors JP, CN and KR
+ * Counts up the number of legal an unrecognized escape sequences in
+ * the sample of text, and computes a score based on the total number &
+ * the proportion that fit the encoding.
+ *
+ * @param text the byte buffer containing text to analyse
+ * @param textLen the size of the text in the byte.
+ * @param escapeSequences the byte escape sequences to test for.
+ * @return match quality, in the range of 0-100.
+ */
+ int match(byte[] text, int textLen, byte[][] escapeSequences) {
+ int i, j;
+ int escN;
+ int hits = 0;
+ int misses = 0;
+ int shifts = 0;
+ int quality;
+ scanInput:
+ for (i = 0; i < textLen; i++) {
+ if (text[i] == 0x1b) {
+ checkEscapes:
+ for (escN = 0; escN < escapeSequences.length; escN++) {
+ byte[] seq = escapeSequences[escN];
+
+ if ((textLen - i) < seq.length) {
+ continue checkEscapes;
+ }
+
+ for (j = 1; j < seq.length; j++) {
+ if (seq[j] != text[i + j]) {
+ continue checkEscapes;
+ }
+ }
+
+ hits++;
+ i += seq.length - 1;
+ continue scanInput;
+ }
+
+ misses++;
+ }
+
+ if (text[i] == 0x0e || text[i] == 0x0f) {
+ // Shift in/out
+ shifts++;
+ }
+ }
+
+ if (hits == 0) {
+ return 0;
+ }
+
+ //
+ // Initial quality is based on relative proportion of recongized vs.
+ // unrecognized escape sequences.
+ // All good: quality = 100;
+ // half or less good: quality = 0;
+ // linear inbetween.
+ quality = (100 * hits - 100 * misses) / (hits + misses);
+
+ // Back off quality if there were too few escape sequences seen.
+ // Include shifts in this computation, so that KR does not get penalized
+ // for having only a single Escape sequence, but many shifts.
+ if (hits + shifts < 5) {
+ quality -= (5 - (hits + shifts)) * 10;
+ }
+
+ if (quality < 0) {
+ quality = 0;
+ }
+ return quality;
+ }
+
+
+ static class CharsetRecog_2022JP extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
+ {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
+ {0x1b, 0x24, 0x40}, // JIS C 6226-1978
+ {0x1b, 0x24, 0x41}, // GB 2312-80
+ {0x1b, 0x24, 0x42}, // JIS X 208-1983
+ {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
+ {0x1b, 0x28, 0x42}, // ASCII
+ {0x1b, 0x28, 0x48}, // JIS-Roman
+ {0x1b, 0x28, 0x49}, // Half-width katakana
+ {0x1b, 0x28, 0x4a}, // JIS-Roman
+ {0x1b, 0x2e, 0x41}, // ISO 8859-1
+ {0x1b, 0x2e, 0x46} // ISO 8859-7
+ };
+
+ String getName() {
+ return "ISO-2022-JP";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+ }
+
+ static class CharsetRecog_2022KR extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x29, 0x43}
+ };
+
+ String getName() {
+ return "ISO-2022-KR";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+
+ }
+
+ static class CharsetRecog_2022CN extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
+ {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
+ {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
+ {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
+ {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
+ {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
+ {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
+ {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
+ {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
+ {0x1b, 0x4e}, // SS2
+ {0x1b, 0x4f}, // SS3
+ };
+
+ String getName() {
+ return "ISO-2022-CN";
+ }
+
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+ }
+
+}
+
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
index ad69fa0..55a3957 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
@@ -1,99 +1,99 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-/**
- * Charset recognizer for UTF-8
- *
- * @internal
- */
-class CharsetRecog_UTF8 extends CharsetRecognizer {
-
- String getName() {
- return "UTF-8";
- }
-
- /* (non-Javadoc)
- * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
- */
- int match(CharsetDetector det) {
- boolean hasBOM = false;
- int numValid = 0;
- int numInvalid = 0;
- byte input[] = det.fRawInput;
- int i;
- int trailBytes = 0;
- int confidence;
-
- if (det.fRawLength >= 3 &&
- (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
- hasBOM = true;
- }
-
- // Scan for multi-byte sequences
- for (i = 0; i < det.fRawLength; i++) {
- int b = input[i];
- if ((b & 0x80) == 0) {
- continue; // ASCII
- }
-
- // Hi bit on char found. Figure out how long the sequence should be
- if ((b & 0x0e0) == 0x0c0) {
- trailBytes = 1;
- } else if ((b & 0x0f0) == 0x0e0) {
- trailBytes = 2;
- } else if ((b & 0x0f8) == 0xf0) {
- trailBytes = 3;
- } else {
- numInvalid++;
- if (numInvalid > 5) {
- break;
- }
- trailBytes = 0;
- }
-
- // Verify that we've got the right number of trail bytes in the sequence
- for (; ; ) {
- i++;
- if (i >= det.fRawLength) {
- break;
- }
- b = input[i];
- if ((b & 0xc0) != 0x080) {
- numInvalid++;
- break;
- }
- if (--trailBytes == 0) {
- numValid++;
- break;
- }
- }
-
- }
-
- // Cook up some sort of confidence score, based on presense of a BOM
- // and the existence of valid and/or invalid multi-byte sequences.
- confidence = 0;
- if (hasBOM && numInvalid == 0) {
- confidence = 100;
- } else if (hasBOM && numValid > numInvalid * 10) {
- confidence = 80;
- } else if (numValid > 3 && numInvalid == 0) {
- confidence = 100;
- } else if (numValid > 0 && numInvalid == 0) {
- confidence = 80;
- } else if (numValid == 0 && numInvalid == 0) {
- // Plain ASCII.
- confidence = 10;
- } else if (numValid > numInvalid * 10) {
- // Probably corruput utf-8 data. Valid sequences aren't likely by chance.
- confidence = 25;
- }
- return confidence;
- }
-
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Charset recognizer for UTF-8.
+ * <p>
+ * Confidence is derived from the presence of a UTF-8 byte-order mark and
+ * from counting well-formed versus malformed multi-byte sequences in the
+ * raw input bytes.
+ *
+ * @internal
+ */
+class CharsetRecog_UTF8 extends CharsetRecognizer {
+
+ String getName() {
+ return "UTF-8";
+ }
+
+ /* (non-Javadoc)
+ * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
+ */
+ int match(CharsetDetector det) {
+ boolean hasBOM = false;
+ int numValid = 0; // count of complete, well-formed multi-byte sequences
+ int numInvalid = 0; // count of malformed lead or trail bytes
+ byte input[] = det.fRawInput;
+ int i;
+ int trailBytes = 0;
+ int confidence;
+
+ // The UTF-8 byte-order mark is the three bytes EF BB BF.
+ if (det.fRawLength >= 3 &&
+ (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
+ hasBOM = true;
+ }
+
+ // Scan for multi-byte sequences
+ for (i = 0; i < det.fRawLength; i++) {
+ int b = input[i];
+ if ((b & 0x80) == 0) {
+ continue; // ASCII
+ }
+
+ // Hi bit on char found. Figure out how long the sequence should be
+ if ((b & 0x0e0) == 0x0c0) {
+ trailBytes = 1; // 110xxxxx: lead byte of a two-byte sequence
+ } else if ((b & 0x0f0) == 0x0e0) {
+ trailBytes = 2; // 1110xxxx: lead byte of a three-byte sequence
+ } else if ((b & 0x0f8) == 0xf0) {
+ trailBytes = 3; // 11110xxx: lead byte of a four-byte sequence
+ } else {
+ // High bit set but not a valid lead byte; bail out once the
+ // input looks hopeless (more than 5 errors).
+ numInvalid++;
+ if (numInvalid > 5) {
+ break;
+ }
+ trailBytes = 0;
+ }
+
+ // Verify that we've got the right number of trail bytes in the sequence
+ for (; ; ) {
+ i++;
+ if (i >= det.fRawLength) {
+ break;
+ }
+ b = input[i];
+ if ((b & 0xc0) != 0x080) {
+ // Expected a 10xxxxxx continuation byte and got something else.
+ numInvalid++;
+ break;
+ }
+ if (--trailBytes == 0) {
+ numValid++;
+ break;
+ }
+ }
+
+ }
+
+ // Cook up some sort of confidence score, based on presence of a BOM
+ // and the existence of valid and/or invalid multi-byte sequences.
+ confidence = 0;
+ if (hasBOM && numInvalid == 0) {
+ confidence = 100;
+ } else if (hasBOM && numValid > numInvalid * 10) {
+ confidence = 80;
+ } else if (numValid > 3 && numInvalid == 0) {
+ confidence = 100;
+ } else if (numValid > 0 && numInvalid == 0) {
+ confidence = 80;
+ } else if (numValid == 0 && numInvalid == 0) {
+ // Plain ASCII.
+ confidence = 10;
+ } else if (numValid > numInvalid * 10) {
+ // Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
+ confidence = 25;
+ }
+ return confidence;
+ }
+
+}