You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:15 UTC
[09/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index 77773e0..f9df9e0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -1,544 +1,544 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-
-
-/**
- * <code>CharsetDetector</code> provides a facility for detecting the
- * charset or encoding of character data in an unknown format.
- * The input data can either be from an input stream or an array of bytes.
- * The result of the detection operation is a list of possibly matching
- * charsets, or, for simple use, you can just ask for a Java Reader that
- * will will work over the input data.
- * <p/>
- * Character set detection is at best an imprecise operation. The detection
- * process will attempt to identify the charset that best matches the characteristics
- * of the byte data, but the process is partly statistical in nature, and
- * the results can not be guaranteed to always be correct.
- * <p/>
- * For best accuracy in charset detection, the input data should be primarily
- * in a single language, and a minimum of a few hundred bytes worth of plain text
- * in the language are needed. The detection process will attempt to
- * ignore html or xml style markup that could otherwise obscure the content.
- * <p/>
- * @stable ICU 3.4
- */
-public class CharsetDetector {
-
-// Question: Should we have getters corresponding to the setters for input text
-// and declared encoding?
-
-// A thought: If we were to create our own type of Java Reader, we could defer
-// figuring out an actual charset for data that starts out with too much English
-// only ASCII until the user actually read through to something that didn't look
-// like 7 bit English. If nothing else ever appeared, we would never need to
-// actually choose the "real" charset. All assuming that the application just
-// wants the data, and doesn't care about a char set name.
-
- private static final int kBufSize = 12000;
- private static final int MAX_CONFIDENCE = 100;
- private static String[] fCharsetNames;
- /*
- * List of recognizers for all charsets known to the implementation.
- */
- private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
- /*
- * The following items are accessed by individual CharsetRecongizers during
- * the recognition process
- *
- */
- byte[] fInputBytes = // The text to be checked. Markup will have been
- new byte[kBufSize]; // removed if appropriate.
- int fInputLen; // Length of the byte data in fInputText.
- short fByteStats[] = // byte frequency statistics for the input text.
- new short[256]; // Value is percent, not absolute.
- boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
- false;
- String fDeclaredEncoding;
- //
- // Stuff private to CharsetDetector
- //
- byte[] fRawInput; // Original, untouched input bytes.
- // If user gave us a byte array, this is it.
- // If user gave us a stream, it's read to a
- // buffer here.
- int fRawLength; // Length of data in fRawInput array.
- InputStream fInputStream; // User's input stream, or null if the user
- boolean fStripTags = // If true, setText() will strip tags from input text.
- false;
-
- /**
- * Constructor
- *
- * @stable ICU 3.4
- */
- public CharsetDetector() {
- }
-
- /**
- * Get the names of all char sets that can be recognized by the char set detector.
- *
- * @return an array of the names of all charsets that can be recognized
- * by the charset detector.
- *
- * @stable ICU 3.4
- */
- public static String[] getAllDetectableCharsets() {
- return fCharsetNames;
- }
-
- /*
- * Create the singleton instances of the CharsetRecognizer classes
- */
- private static ArrayList<CharsetRecognizer> createRecognizers() {
- ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-
- recognizers.add(new CharsetRecog_UTF8());
-
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
- // Create an array of all charset names, as a side effect.
- // Needed for the getAllDetectableCharsets() API.
- String[] charsetNames = new String[recognizers.size()];
- int out = 0;
-
- for (CharsetRecognizer recognizer : recognizers) {
- String name = recognizer.getName();
-
- if (out == 0 || !name.equals(charsetNames[out - 1])) {
- charsetNames[out++] = name;
- }
- }
-
- fCharsetNames = new String[out];
- System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-
- return recognizers;
- }
-
- /**
- * Set the declared encoding for charset detection.
- * The declared encoding of an input text is an encoding obtained
- * from an http header or xml declaration or similar source that
- * can be provided as additional information to the charset detector.
- * A match between a declared encoding and a possible detected encoding
- * will raise the quality of that detected encoding by a small delta,
- * and will also appear as a "reason" for the match.
- * <p/>
- * A declared encoding that is incompatible with the input data being
- * analyzed will not be added to the list of possible encodings.
- *
- * @param encoding The declared encoding
- *
- * @stable ICU 3.4
- */
- public CharsetDetector setDeclaredEncoding(String encoding) {
- setCanonicalDeclaredEncoding(encoding);
- return this;
- }
-
- /**
- * Set the input text (byte) data whose charset is to be detected.
- *
- * @param in the input text of unknown encoding
- *
- * @return This CharsetDetector
- *
- * @stable ICU 3.4
- */
- public CharsetDetector setText(byte[] in) {
- fRawInput = in;
- fRawLength = in.length;
-
- MungeInput();
-
- return this;
- }
- // Value is rounded up, so zero really means zero occurences.
-
- /**
- * Set the input text (byte) data whose charset is to be detected.
- * <p/>
- * The input stream that supplies the character data must have markSupported()
- * == true; the charset detection process will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
- *
- * @param in the input text of unknown encoding
- *
- * @return This CharsetDetector
- *
- * @stable ICU 3.4
- */
-
- public CharsetDetector setText(InputStream in) throws IOException {
- fInputStream = in;
- fInputStream.mark(kBufSize);
- fRawInput = new byte[kBufSize]; // Always make a new buffer because the
- // previous one may have come from the caller,
- // in which case we can't touch it.
- fRawLength = 0;
- int remainingLength = kBufSize;
- while (remainingLength > 0) {
- // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
- int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
- if (bytesRead <= 0) {
- break;
- }
- fRawLength += bytesRead;
- remainingLength -= bytesRead;
- }
- fInputStream.reset();
-
- MungeInput(); // Strip html markup, collect byte stats.
- return this;
- }
-
- /**
- * Return the charset that best matches the supplied input data.
- *
- * Note though, that because the detection
- * only looks at the start of the input data,
- * there is a possibility that the returned charset will fail to handle
- * the full set of input data.
- * <p/>
- * Raise an exception if
- * <ul>
- * <li>no charset appears to match the data.</li>
- * <li>no input text has been provided</li>
- * </ul>
- *
- * @return a CharsetMatch object representing the best matching charset, or
- * <code>null</code> if there are no matches.
- *
- * @stable ICU 3.4
- */
- public CharsetMatch detect() {
-// TODO: A better implementation would be to copy the detect loop from
-// detectAll(), and cut it short as soon as a match with a high confidence
-// is found. This is something to be done later, after things are otherwise
-// working.
- CharsetMatch matches[] = detectAll();
-
- if (matches == null || matches.length == 0) {
- return null;
- }
-
- return matches[0];
- }
-
- /**
- * Return an array of all charsets that appear to be plausible
- * matches with the input data. The array is ordered with the
- * best quality match first.
- * <p/>
- * Raise an exception if
- * <ul>
- * <li>no charsets appear to match the input data.</li>
- * <li>no input text has been provided</li>
- * </ul>
- *
- * @return An array of CharsetMatch objects representing possibly matching charsets.
- *
- * @stable ICU 3.4
- */
- public CharsetMatch[] detectAll() {
- CharsetRecognizer csr;
- int i;
- int detectResults;
- int confidence;
- ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
-
- // Iterate over all possible charsets, remember all that
- // give a match quality > 0.
- for (i = 0; i < fCSRecognizers.size(); i++) {
- csr = fCSRecognizers.get(i);
- detectResults = csr.match(this);
- confidence = detectResults & 0x000000ff;
- if (confidence > 0) {
- // Just to be safe, constrain
- confidence = Math.min(confidence, MAX_CONFIDENCE);
-
- // Apply charset hint.
- if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
- // Reduce lack of confidence (delta between "sure" and current) by 50%.
- confidence += (MAX_CONFIDENCE - confidence) / 2;
- }
-
- CharsetMatch m = new CharsetMatch(this, csr, confidence);
- matches.add(m);
- }
- }
-
- Collections.sort(matches); // CharsetMatch compares on confidence
- Collections.reverse(matches); // Put best match first.
- CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
- resultArray = matches.toArray(resultArray);
- return resultArray;
- }
-
- /**
- * Autodetect the charset of an inputStream, and return a Java Reader
- * to access the converted input data.
- * <p/>
- * This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
- * <p/>
- * For the input stream that supplies the character data, markSupported()
- * must be true; the charset detection will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
- *<p/>
- * Raise an exception if no charsets appear to match the input data.
- *
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
- * @stable ICU 3.4
- */
- public Reader getReader(InputStream in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
-
- try {
- setText(in);
-
- CharsetMatch match = detect();
-
- if (match == null) {
- return null;
- }
-
- return match.getReader();
- } catch (IOException e) {
- return null;
- }
- }
-
- /**
- * Autodetect the charset of an inputStream, and return a String
- * containing the converted input data.
- * <p/>
- * This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
- *<p/>
- * Raise an exception if no charsets appear to match the input data.
- *
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
- * @stable ICU 3.4
- */
- public String getString(byte[] in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
-
- try {
- setText(in);
-
- CharsetMatch match = detect();
-
- if (match == null) {
- return null;
- }
-
- return match.getString(-1);
- } catch (IOException e) {
- return null;
- }
- }
- // gave us a byte array.
-
- /**
- * Test whether or not input filtering is enabled.
- *
- * @return <code>true</code> if input text will be filtered.
- *
- * @see #enableInputFilter
- *
- * @stable ICU 3.4
- */
- public boolean inputFilterEnabled() {
- return fStripTags;
- }
-
- /**
- * Enable filtering of input text. If filtering is enabled,
- * text within angle brackets ("<" and ">") will be removed
- * before detection.
- *
- * @param filter <code>true</code> to enable input text filtering.
- *
- * @return The previous setting.
- *
- * @stable ICU 3.4
- */
- public boolean enableInputFilter(boolean filter) {
- boolean previous = fStripTags;
-
- fStripTags = filter;
-
- return previous;
- }
-
- /**
- * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
- *
- * @param encoding - name of character encoding
- */
- private void setCanonicalDeclaredEncoding(String encoding) {
- if ((encoding == null) || encoding.isEmpty()) {
- return;
- }
-
- Charset cs = Charset.forName(encoding);
- if (cs != null) {
- fDeclaredEncoding = cs.name();
- }
- }
-
- /*
- * MungeInput - after getting a set of raw input data to be analyzed, preprocess
- * it by removing what appears to be html markup.
- */
- private void MungeInput() {
- int srci = 0;
- int dsti = 0;
- byte b;
- boolean inMarkup = false;
- int openTags = 0;
- int badTags = 0;
-
- //
- // html / xml markup stripping.
- // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
- // discard everything within < brackets >
- // Count how many total '<' and illegal (nested) '<' occur, so we can make some
- // guess as to whether the input was actually marked up at all.
- if (fStripTags) {
- for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
- b = fRawInput[srci];
- if (b == (byte) '<') {
- if (inMarkup) {
- badTags++;
- }
- inMarkup = true;
- openTags++;
- }
-
- if (!inMarkup) {
- fInputBytes[dsti++] = b;
- }
-
- if (b == (byte) '>') {
- inMarkup = false;
- }
- }
-
- fInputLen = dsti;
- }
-
- //
- // If it looks like this input wasn't marked up, or if it looks like it's
- // essentially nothing but markup abandon the markup stripping.
- // Detection will have to work on the unstripped input.
- //
- if (openTags < 5 || openTags / 5 < badTags ||
- (fInputLen < 100 && fRawLength > 600)) {
- int limit = fRawLength;
-
- if (limit > kBufSize) {
- limit = kBufSize;
- }
-
- for (srci = 0; srci < limit; srci++) {
- fInputBytes[srci] = fRawInput[srci];
- }
- fInputLen = srci;
- }
-
- //
- // Tally up the byte occurence statistics.
- // These are available for use by the various detectors.
- //
- Arrays.fill(fByteStats, (short) 0);
- for (srci = 0; srci < fInputLen; srci++) {
- int val = fInputBytes[srci] & 0x00ff;
- fByteStats[val]++;
- }
-
- fC1Bytes = false;
- for (int i = 0x80; i <= 0x9F; i += 1) {
- if (fByteStats[i] != 0) {
- fC1Bytes = true;
- break;
- }
- }
- }
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2009, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+
+/**
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation. The detection
+ * process will attempt to identify the charset that best matches the characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain text
+ * in the language are needed. The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * @stable ICU 3.4
+ */
+public class CharsetDetector {
+
+// Question: Should we have getters corresponding to the setters for input text
+// and declared encoding?
+
+// A thought: If we were to create our own type of Java Reader, we could defer
+// figuring out an actual charset for data that starts out with too much English
+// only ASCII until the user actually read through to something that didn't look
+// like 7 bit English. If nothing else ever appeared, we would never need to
+// actually choose the "real" charset. All assuming that the application just
+// wants the data, and doesn't care about a char set name.
+
+ private static final int kBufSize = 12000;
+ private static final int MAX_CONFIDENCE = 100;
+ private static String[] fCharsetNames;
+ /*
+ * List of recognizers for all charsets known to the implementation.
+ */
+ private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+ /*
+ * The following items are accessed by individual CharsetRecongizers during
+ * the recognition process
+ *
+ */
+ byte[] fInputBytes = // The text to be checked. Markup will have been
+ new byte[kBufSize]; // removed if appropriate.
+ int fInputLen; // Length of the byte data in fInputText.
+ short fByteStats[] = // byte frequency statistics for the input text.
+ new short[256]; // Value is percent, not absolute.
+ boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
+ false;
+ String fDeclaredEncoding;
+ //
+ // Stuff private to CharsetDetector
+ //
+ byte[] fRawInput; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ // If user gave us a stream, it's read to a
+ // buffer here.
+ int fRawLength; // Length of data in fRawInput array.
+ InputStream fInputStream; // User's input stream, or null if the user
+ boolean fStripTags = // If true, setText() will strip tags from input text.
+ false;
+
+ /**
+ * Constructor
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector() {
+ }
+
+ /**
+ * Get the names of all char sets that can be recognized by the char set detector.
+ *
+ * @return an array of the names of all charsets that can be recognized
+ * by the charset detector.
+ *
+ * @stable ICU 3.4
+ */
+ public static String[] getAllDetectableCharsets() {
+ return fCharsetNames;
+ }
+
+ /*
+ * Create the singleton instances of the CharsetRecognizer classes
+ */
+ private static ArrayList<CharsetRecognizer> createRecognizers() {
+ ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
+
+ recognizers.add(new CharsetRecog_UTF8());
+
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
+ recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
+
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
+ recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
+ recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
+
+ recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
+
+ // Create an array of all charset names, as a side effect.
+ // Needed for the getAllDetectableCharsets() API.
+ String[] charsetNames = new String[recognizers.size()];
+ int out = 0;
+
+ for (CharsetRecognizer recognizer : recognizers) {
+ String name = recognizer.getName();
+
+ if (out == 0 || !name.equals(charsetNames[out - 1])) {
+ charsetNames[out++] = name;
+ }
+ }
+
+ fCharsetNames = new String[out];
+ System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
+
+ return recognizers;
+ }
+
+ /**
+ * Set the declared encoding for charset detection.
+ * The declared encoding of an input text is an encoding obtained
+ * from an http header or xml declaration or similar source that
+ * can be provided as additional information to the charset detector.
+ * A match between a declared encoding and a possible detected encoding
+ * will raise the quality of that detected encoding by a small delta,
+ * and will also appear as a "reason" for the match.
+ * <p/>
+ * A declared encoding that is incompatible with the input data being
+ * analyzed will not be added to the list of possible encodings.
+ *
+ * @param encoding The declared encoding
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector setDeclaredEncoding(String encoding) {
+ setCanonicalDeclaredEncoding(encoding);
+ return this;
+ }
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector setText(byte[] in) {
+ fRawInput = in;
+ fRawLength = in.length;
+
+ MungeInput();
+
+ return this;
+ }
+ // Value is rounded up, so zero really means zero occurences.
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ * <p/>
+ * The input stream that supplies the character data must have markSupported()
+ * == true; the charset detection process will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ *
+ * @stable ICU 3.4
+ */
+
+ public CharsetDetector setText(InputStream in) throws IOException {
+ fInputStream = in;
+ fInputStream.mark(kBufSize);
+ fRawInput = new byte[kBufSize]; // Always make a new buffer because the
+ // previous one may have come from the caller,
+ // in which case we can't touch it.
+ fRawLength = 0;
+ int remainingLength = kBufSize;
+ while (remainingLength > 0) {
+ // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
+ int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
+ if (bytesRead <= 0) {
+ break;
+ }
+ fRawLength += bytesRead;
+ remainingLength -= bytesRead;
+ }
+ fInputStream.reset();
+
+ MungeInput(); // Strip html markup, collect byte stats.
+ return this;
+ }
+
+ /**
+ * Return the charset that best matches the supplied input data.
+ *
+ * Note though, that because the detection
+ * only looks at the start of the input data,
+ * there is a possibility that the returned charset will fail to handle
+ * the full set of input data.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charset appears to match the data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return a CharsetMatch object representing the best matching charset, or
+ * <code>null</code> if there are no matches.
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetMatch detect() {
+// TODO: A better implementation would be to copy the detect loop from
+// detectAll(), and cut it short as soon as a match with a high confidence
+// is found. This is something to be done later, after things are otherwise
+// working.
+ CharsetMatch matches[] = detectAll();
+
+ if (matches == null || matches.length == 0) {
+ return null;
+ }
+
+ return matches[0];
+ }
+
+ /**
+ * Return an array of all charsets that appear to be plausible
+ * matches with the input data. The array is ordered with the
+ * best quality match first.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charsets appear to match the input data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return An array of CharsetMatch objects representing possibly matching charsets.
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetMatch[] detectAll() {
+ CharsetRecognizer csr;
+ int i;
+ int detectResults;
+ int confidence;
+ ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
+
+ // Iterate over all possible charsets, remember all that
+ // give a match quality > 0.
+ for (i = 0; i < fCSRecognizers.size(); i++) {
+ csr = fCSRecognizers.get(i);
+ detectResults = csr.match(this);
+ confidence = detectResults & 0x000000ff;
+ if (confidence > 0) {
+ // Just to be safe, constrain
+ confidence = Math.min(confidence, MAX_CONFIDENCE);
+
+ // Apply charset hint.
+ if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
+ // Reduce lack of confidence (delta between "sure" and current) by 50%.
+ confidence += (MAX_CONFIDENCE - confidence) / 2;
+ }
+
+ CharsetMatch m = new CharsetMatch(this, csr, confidence);
+ matches.add(m);
+ }
+ }
+
+ Collections.sort(matches); // CharsetMatch compares on confidence
+ Collections.reverse(matches); // Put best match first.
+ CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
+ resultArray = matches.toArray(resultArray);
+ return resultArray;
+ }
+
+ /**
+ * Autodetect the charset of an inputStream, and return a Java Reader
+ * to access the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+ * <p/>
+ * For the input stream that supplies the character data, markSupported()
+ * must be true; the charset detection will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ *
+ * @stable ICU 3.4
+ */
+ public Reader getReader(InputStream in, String declaredEncoding) {
+ setCanonicalDeclaredEncoding(declaredEncoding);
+
+ try {
+ setText(in);
+
+ CharsetMatch match = detect();
+
+ if (match == null) {
+ return null;
+ }
+
+ return match.getReader();
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Autodetect the charset of an inputStream, and return a String
+ * containing the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString(byte[] in, String declaredEncoding) {
+ setCanonicalDeclaredEncoding(declaredEncoding);
+
+ try {
+ setText(in);
+
+ CharsetMatch match = detect();
+
+ if (match == null) {
+ return null;
+ }
+
+ return match.getString(-1);
+ } catch (IOException e) {
+ return null;
+ }
+ }
+ // gave us a byte array.
+
+ /**
+ * Test whether or not input filtering is enabled.
+ *
+ * @return <code>true</code> if input text will be filtered.
+ *
+ * @see #enableInputFilter
+ *
+ * @stable ICU 3.4
+ */
+ public boolean inputFilterEnabled() {
+ return fStripTags;
+ }
+
+ /**
+ * Enable filtering of input text. If filtering is enabled,
+ * text within angle brackets ("<" and ">") will be removed
+ * before detection.
+ *
+ * @param filter <code>true</code> to enable input text filtering.
+ *
+ * @return The previous setting.
+ *
+ * @stable ICU 3.4
+ */
+ public boolean enableInputFilter(boolean filter) {
+ boolean previous = fStripTags;
+
+ fStripTags = filter;
+
+ return previous;
+ }
+
+ /**
+ * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
+ *
+ * @param encoding - name of character encoding
+ */
+ private void setCanonicalDeclaredEncoding(String encoding) {
+ if ((encoding == null) || encoding.isEmpty()) {
+ return;
+ }
+
+ Charset cs = Charset.forName(encoding);
+ if (cs != null) {
+ fDeclaredEncoding = cs.name();
+ }
+ }
+
    /*
     * MungeInput - after getting a set of raw input data to be analyzed, preprocess
     * it by removing what appears to be html markup. Fills fInputBytes/fInputLen
     * from fRawInput/fRawLength, then tallies per-byte statistics into fByteStats
     * and records whether any C1 control bytes are present in fC1Bytes.
     */
    private void MungeInput() {
        int srci = 0;
        int dsti = 0;
        byte b;
        boolean inMarkup = false;
        int openTags = 0;       // total '<' seen
        int badTags = 0;        // '<' seen while already inside a tag (illegal nesting)

        //
        // html / xml markup stripping.
        //  quick and dirty, not 100% accurate, but hopefully good enough, statistically.
        //  discard everything within < brackets >
        //  Count how many total '<' and illegal (nested) '<' occur, so we can make some
        //  guess as to whether the input was actually marked up at all.
        if (fStripTags) {
            for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
                b = fRawInput[srci];
                if (b == (byte) '<') {
                    if (inMarkup) {
                        // A '<' inside an unclosed tag: evidence the input may not
                        // really be markup.
                        badTags++;
                    }
                    inMarkup = true;
                    openTags++;
                }

                if (!inMarkup) {
                    // Only bytes outside of tags reach the working buffer.
                    fInputBytes[dsti++] = b;
                }

                if (b == (byte) '>') {
                    inMarkup = false;
                }
            }

            fInputLen = dsti;
        }

        //
        // If it looks like this input wasn't marked up, or if it looks like it's
        //   essentially nothing but markup, abandon the markup stripping.
        //   Detection will have to work on the unstripped input.
        // (When fStripTags is false, openTags stays 0 and this branch always
        //  copies the raw input, capped at the working buffer size.)
        //
        if (openTags < 5 || openTags / 5 < badTags ||
                (fInputLen < 100 && fRawLength > 600)) {
            int limit = fRawLength;

            if (limit > kBufSize) {
                limit = kBufSize;
            }

            for (srci = 0; srci < limit; srci++) {
                fInputBytes[srci] = fRawInput[srci];
            }
            fInputLen = srci;
        }

        //
        // Tally up the byte occurrence statistics.
        //   These are available for use by the various detectors.
        //
        Arrays.fill(fByteStats, (short) 0);
        for (srci = 0; srci < fInputLen; srci++) {
            int val = fInputBytes[srci] & 0x00ff;
            fByteStats[val]++;
        }

        // Note whether any bytes fall in the C1 control range (0x80-0x9F);
        // available to the individual charset recognizers.
        fC1Bytes = false;
        for (int i = 0x80; i <= 0x9F; i += 1) {
            if (fByteStats[i] != 0) {
                fC1Bytes = true;
                break;
            }
        }
    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 9244cd9..22219ab 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -1,286 +1,286 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-
-
-/**
- * This class represents a charset that has been identified by a CharsetDetector
- * as a possible encoding for a set of input data. From an instance of this
- * class, you can ask for a confidence level in the charset identification,
- * or for Java Reader or String to access the original byte data in Unicode form.
- * <p/>
- * Instances of this class are created only by CharsetDetectors.
- * <p/>
- * Note: this class has a natural ordering that is inconsistent with equals.
- * The natural ordering is based on the match confidence value.
- *
- * @stable ICU 3.4
- */
-public class CharsetMatch implements Comparable<CharsetMatch> {
-
-
- /**
- * Bit flag indicating the match is based on the the encoding scheme.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int ENCODING_SCHEME = 1;
- /**
- * Bit flag indicating the match is based on the presence of a BOM.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int BOM = 2;
- /**
- * Bit flag indicating he match is based on the declared encoding.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int DECLARED_ENCODING = 4;
- /**
- * Bit flag indicating the match is based on language statistics.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int LANG_STATISTICS = 8;
- //
- // Private Data
- //
- private int fConfidence;
- private CharsetRecognizer fRecognizer;
- private byte[] fRawInput = null; // Original, untouched input bytes.
- // If user gave us a byte array, this is it.
- private int fRawLength; // Length of data in fRawInput array.
- private InputStream fInputStream = null; // User's input stream, or null if the user
-
- /*
- * Constructor. Implementation internal
- */
- CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
- fRecognizer = rec;
- fConfidence = conf;
-
- // The references to the original aplication input data must be copied out
- // of the charset recognizer to here, in case the application resets the
- // recognizer before using this CharsetMatch.
- if (det.fInputStream == null) {
- // We only want the existing input byte data if it came straight from the user,
- // not if is just the head of a stream.
- fRawInput = det.fRawInput;
- fRawLength = det.fRawLength;
- }
- fInputStream = det.fInputStream;
- }
-
- /**
- * Create a java.io.Reader for reading the Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- * <p/>
- * CAUTION: if the source of the byte data was an InputStream, a Reader
- * can be created for only one matching char set using this method. If more
- * than one charset needs to be tried, the caller will need to reset
- * the InputStream and create InputStreamReaders itself, based on the charset name.
- *
- * @return the Reader for the Unicode character data.
- *
- * @stable ICU 3.4
- */
- public Reader getReader() {
- InputStream inputStream = fInputStream;
-
- if (inputStream == null) {
- inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
- }
-
- try {
- inputStream.reset();
- return new InputStreamReader(inputStream, getName());
- } catch (IOException e) {
- return null;
- }
- }
-
- /**
- * Create a Java String from Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- *
- * @return a String created from the converted input data.
- *
- * @stable ICU 3.4
- */
- public String getString() throws java.io.IOException {
- return getString(-1);
-
- }
-
- /**
- * Create a Java String from Unicode character data corresponding
- * to the original byte data supplied to the Charset detect operation.
- * The length of the returned string is limited to the specified size;
- * the string will be trunctated to this length if necessary. A limit value of
- * zero or less is ignored, and treated as no limit.
- *
- * @param maxLength The maximium length of the String to be created when the
- * source of the data is an input stream, or -1 for
- * unlimited length.
- * @return a String created from the converted input data.
- *
- * @stable ICU 3.4
- */
- public String getString(int maxLength) throws java.io.IOException {
- String result = null;
- if (fInputStream != null) {
- StringBuffer sb = new StringBuffer();
- char[] buffer = new char[1024];
- Reader reader = getReader();
- int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
- int bytesRead = 0;
-
- while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
- sb.append(buffer, 0, bytesRead);
- max -= bytesRead;
- }
-
- reader.close();
-
- return sb.toString();
- } else {
- result = new String(fRawInput, getName());
- }
- return result;
-
- }
-
- /**
- * Get an indication of the confidence in the charset detected.
- * Confidence values range from 0-100, with larger numbers indicating
- * a better match of the input data to the characteristics of the
- * charset.
- *
- * @return the confidence in the charset match
- *
- * @stable ICU 3.4
- */
- public int getConfidence() {
- return fConfidence;
- }
-
- /**
- * Return flags indicating what it was about the input data
- * that caused this charset to be considered as a possible match.
- * The result is a bitfield containing zero or more of the flags
- * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
- * A result of zero means no information is available.
- * <p>
- * Note: currently, this method always returns zero.
- * <p>
- *
- * @return the type of match found for this charset.
- *
- * @draft ICU 3.4
- * @provisional This API might change or be removed in a future release.
- */
- public int getMatchType() {
-// TODO: create a list of enum-like constants for common combinations of types of matches.
- return 0;
- }
-
- /**
- * Get the name of the detected charset.
- * The name will be one that can be used with other APIs on the
- * platform that accept charset names. It is the "Canonical name"
- * as defined by the class java.nio.charset.Charset; for
- * charsets that are registered with the IANA charset registry,
- * this is the MIME-preferred registerd name.
- *
- * @see java.nio.charset.Charset
- * @see java.io.InputStreamReader
- *
- * @return The name of the charset.
- *
- * @stable ICU 3.4
- */
- public String getName() {
- return fRecognizer.getName();
- }
-
- /**
- * Get the ISO code for the language of the detected charset.
- *
- * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
- *
- * @stable ICU 3.4
- */
- public String getLanguage() {
- return fRecognizer.getLanguage();
- }
-
- /**
- * Compare to other CharsetMatch objects.
- * Comparison is based on the match confidence value, which
- * allows CharsetDetector.detectAll() to order its results.
- *
- * @param o the CharsetMatch object to compare against.
- * @return a negative integer, zero, or a positive integer as the
- * confidence level of this CharsetMatch
- * is less than, equal to, or greater than that of
- * the argument.
- * @throws ClassCastException if the argument is not a CharsetMatch.
- * @stable ICU 3.4
- */
- public int compareTo(CharsetMatch other) {
- int compareResult = 0;
- if (this.fConfidence > other.fConfidence) {
- compareResult = 1;
- } else if (this.fConfidence < other.fConfidence) {
- compareResult = -1;
- }
- return compareResult;
- }
-
- /**
- * compare this CharsetMatch to another based on confidence value
- * @param o the CharsetMatch object to compare against
- * @return true if equal
- */
- public boolean equals(Object o) {
- if (o instanceof CharsetMatch) {
- CharsetMatch that = (CharsetMatch) o;
- return (this.fConfidence == that.fConfidence);
- }
-
- return false;
- }
-
- /**
- * generates a hashCode based on the confidence value
- * @return the hashCode
- */
- public int hashCode() {
- return fConfidence;
- }
- // gave us a byte array.
-
- public String toString() {
- String s = "Match of " + fRecognizer.getName();
- if (fRecognizer.getLanguage() != null) {
- s += " in " + fRecognizer.getLanguage();
- }
- s += " with confidence " + fConfidence;
- return s;
- }
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2007, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+
+/**
+ * This class represents a charset that has been identified by a CharsetDetector
+ * as a possible encoding for a set of input data. From an instance of this
+ * class, you can ask for a confidence level in the charset identification,
+ * or for Java Reader or String to access the original byte data in Unicode form.
+ * <p/>
+ * Instances of this class are created only by CharsetDetectors.
+ * <p/>
+ * Note: this class has a natural ordering that is inconsistent with equals.
+ * The natural ordering is based on the match confidence value.
+ *
+ * @stable ICU 3.4
+ */
+public class CharsetMatch implements Comparable<CharsetMatch> {
+
+
+ /**
+ * Bit flag indicating the match is based on the the encoding scheme.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int ENCODING_SCHEME = 1;
+ /**
+ * Bit flag indicating the match is based on the presence of a BOM.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int BOM = 2;
+ /**
+ * Bit flag indicating he match is based on the declared encoding.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int DECLARED_ENCODING = 4;
+ /**
+ * Bit flag indicating the match is based on language statistics.
+ *
+ * @see #getMatchType
+ * @stable ICU 3.4
+ */
+ static public final int LANG_STATISTICS = 8;
+ //
+ // Private Data
+ //
+ private int fConfidence;
+ private CharsetRecognizer fRecognizer;
+ private byte[] fRawInput = null; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ private int fRawLength; // Length of data in fRawInput array.
+ private InputStream fInputStream = null; // User's input stream, or null if the user
+
+ /*
+ * Constructor. Implementation internal
+ */
+ CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
+ fRecognizer = rec;
+ fConfidence = conf;
+
+ // The references to the original aplication input data must be copied out
+ // of the charset recognizer to here, in case the application resets the
+ // recognizer before using this CharsetMatch.
+ if (det.fInputStream == null) {
+ // We only want the existing input byte data if it came straight from the user,
+ // not if is just the head of a stream.
+ fRawInput = det.fRawInput;
+ fRawLength = det.fRawLength;
+ }
+ fInputStream = det.fInputStream;
+ }
+
+ /**
+ * Create a java.io.Reader for reading the Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ * <p/>
+ * CAUTION: if the source of the byte data was an InputStream, a Reader
+ * can be created for only one matching char set using this method. If more
+ * than one charset needs to be tried, the caller will need to reset
+ * the InputStream and create InputStreamReaders itself, based on the charset name.
+ *
+ * @return the Reader for the Unicode character data.
+ *
+ * @stable ICU 3.4
+ */
+ public Reader getReader() {
+ InputStream inputStream = fInputStream;
+
+ if (inputStream == null) {
+ inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
+ }
+
+ try {
+ inputStream.reset();
+ return new InputStreamReader(inputStream, getName());
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Create a Java String from Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ *
+ * @return a String created from the converted input data.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString() throws java.io.IOException {
+ return getString(-1);
+
+ }
+
+ /**
+ * Create a Java String from Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ * The length of the returned string is limited to the specified size;
+ * the string will be trunctated to this length if necessary. A limit value of
+ * zero or less is ignored, and treated as no limit.
+ *
+ * @param maxLength The maximium length of the String to be created when the
+ * source of the data is an input stream, or -1 for
+ * unlimited length.
+ * @return a String created from the converted input data.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString(int maxLength) throws java.io.IOException {
+ String result = null;
+ if (fInputStream != null) {
+ StringBuffer sb = new StringBuffer();
+ char[] buffer = new char[1024];
+ Reader reader = getReader();
+ int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
+ int bytesRead = 0;
+
+ while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
+ sb.append(buffer, 0, bytesRead);
+ max -= bytesRead;
+ }
+
+ reader.close();
+
+ return sb.toString();
+ } else {
+ result = new String(fRawInput, getName());
+ }
+ return result;
+
+ }
+
+ /**
+ * Get an indication of the confidence in the charset detected.
+ * Confidence values range from 0-100, with larger numbers indicating
+ * a better match of the input data to the characteristics of the
+ * charset.
+ *
+ * @return the confidence in the charset match
+ *
+ * @stable ICU 3.4
+ */
+ public int getConfidence() {
+ return fConfidence;
+ }
+
+ /**
+ * Return flags indicating what it was about the input data
+ * that caused this charset to be considered as a possible match.
+ * The result is a bitfield containing zero or more of the flags
+ * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
+ * A result of zero means no information is available.
+ * <p>
+ * Note: currently, this method always returns zero.
+ * <p>
+ *
+ * @return the type of match found for this charset.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int getMatchType() {
+// TODO: create a list of enum-like constants for common combinations of types of matches.
+ return 0;
+ }
+
+ /**
+ * Get the name of the detected charset.
+ * The name will be one that can be used with other APIs on the
+ * platform that accept charset names. It is the "Canonical name"
+ * as defined by the class java.nio.charset.Charset; for
+ * charsets that are registered with the IANA charset registry,
+ * this is the MIME-preferred registerd name.
+ *
+ * @see java.nio.charset.Charset
+ * @see java.io.InputStreamReader
+ *
+ * @return The name of the charset.
+ *
+ * @stable ICU 3.4
+ */
+ public String getName() {
+ return fRecognizer.getName();
+ }
+
+ /**
+ * Get the ISO code for the language of the detected charset.
+ *
+ * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
+ *
+ * @stable ICU 3.4
+ */
+ public String getLanguage() {
+ return fRecognizer.getLanguage();
+ }
+
+ /**
+ * Compare to other CharsetMatch objects.
+ * Comparison is based on the match confidence value, which
+ * allows CharsetDetector.detectAll() to order its results.
+ *
+ * @param o the CharsetMatch object to compare against.
+ * @return a negative integer, zero, or a positive integer as the
+ * confidence level of this CharsetMatch
+ * is less than, equal to, or greater than that of
+ * the argument.
+ * @throws ClassCastException if the argument is not a CharsetMatch.
+ * @stable ICU 3.4
+ */
+ public int compareTo(CharsetMatch other) {
+ int compareResult = 0;
+ if (this.fConfidence > other.fConfidence) {
+ compareResult = 1;
+ } else if (this.fConfidence < other.fConfidence) {
+ compareResult = -1;
+ }
+ return compareResult;
+ }
+
+ /**
+ * compare this CharsetMatch to another based on confidence value
+ * @param o the CharsetMatch object to compare against
+ * @return true if equal
+ */
+ public boolean equals(Object o) {
+ if (o instanceof CharsetMatch) {
+ CharsetMatch that = (CharsetMatch) o;
+ return (this.fConfidence == that.fConfidence);
+ }
+
+ return false;
+ }
+
+ /**
+ * generates a hashCode based on the confidence value
+ * @return the hashCode
+ */
+ public int hashCode() {
+ return fConfidence;
+ }
+ // gave us a byte array.
+
+ public String toString() {
+ String s = "Match of " + fRecognizer.getName();
+ if (fRecognizer.getLanguage() != null) {
+ s += " in " + fRecognizer.getLanguage();
+ }
+ s += " with confidence " + fConfidence;
+ return s;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
index 16835d6..129c9a8 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
@@ -1,163 +1,163 @@
-/*
-*******************************************************************************
-* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
-* others. All Rights Reserved. *
-*******************************************************************************
-*/
-package org.apache.tika.parser.txt;
-
-/**
- * class CharsetRecog_2022 part of the ICU charset detection imlementation.
- * This is a superclass for the individual detectors for
- * each of the detectable members of the ISO 2022 family
- * of encodings.
- * <p/>
- * The separate classes are nested within this class.
- *
- * @internal
- */
-abstract class CharsetRecog_2022 extends CharsetRecognizer {
-
-
- /**
- * Matching function shared among the 2022 detectors JP, CN and KR
- * Counts up the number of legal an unrecognized escape sequences in
- * the sample of text, and computes a score based on the total number &
- * the proportion that fit the encoding.
- *
- * @param text the byte buffer containing text to analyse
- * @param textLen the size of the text in the byte.
- * @param escapeSequences the byte escape sequences to test for.
- * @return match quality, in the range of 0-100.
- */
- int match(byte[] text, int textLen, byte[][] escapeSequences) {
- int i, j;
- int escN;
- int hits = 0;
- int misses = 0;
- int shifts = 0;
- int quality;
- scanInput:
- for (i = 0; i < textLen; i++) {
- if (text[i] == 0x1b) {
- checkEscapes:
- for (escN = 0; escN < escapeSequences.length; escN++) {
- byte[] seq = escapeSequences[escN];
-
- if ((textLen - i) < seq.length) {
- continue checkEscapes;
- }
-
- for (j = 1; j < seq.length; j++) {
- if (seq[j] != text[i + j]) {
- continue checkEscapes;
- }
- }
-
- hits++;
- i += seq.length - 1;
- continue scanInput;
- }
-
- misses++;
- }
-
- if (text[i] == 0x0e || text[i] == 0x0f) {
- // Shift in/out
- shifts++;
- }
- }
-
- if (hits == 0) {
- return 0;
- }
-
- //
- // Initial quality is based on relative proportion of recongized vs.
- // unrecognized escape sequences.
- // All good: quality = 100;
- // half or less good: quality = 0;
- // linear inbetween.
- quality = (100 * hits - 100 * misses) / (hits + misses);
-
- // Back off quality if there were too few escape sequences seen.
- // Include shifts in this computation, so that KR does not get penalized
- // for having only a single Escape sequence, but many shifts.
- if (hits + shifts < 5) {
- quality -= (5 - (hits + shifts)) * 10;
- }
-
- if (quality < 0) {
- quality = 0;
- }
- return quality;
- }
-
-
- static class CharsetRecog_2022JP extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
- {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
- {0x1b, 0x24, 0x40}, // JIS C 6226-1978
- {0x1b, 0x24, 0x41}, // GB 2312-80
- {0x1b, 0x24, 0x42}, // JIS X 208-1983
- {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
- {0x1b, 0x28, 0x42}, // ASCII
- {0x1b, 0x28, 0x48}, // JIS-Roman
- {0x1b, 0x28, 0x49}, // Half-width katakana
- {0x1b, 0x28, 0x4a}, // JIS-Roman
- {0x1b, 0x2e, 0x41}, // ISO 8859-1
- {0x1b, 0x2e, 0x46} // ISO 8859-7
- };
-
- String getName() {
- return "ISO-2022-JP";
- }
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
- }
-
- static class CharsetRecog_2022KR extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x29, 0x43}
- };
-
- String getName() {
- return "ISO-2022-KR";
- }
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
-
- }
-
- static class CharsetRecog_2022CN extends CharsetRecog_2022 {
- private byte[][] escapeSequences = {
- {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
- {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
- {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
- {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
- {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
- {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
- {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
- {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
- {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
- {0x1b, 0x4e}, // SS2
- {0x1b, 0x4f}, // SS3
- };
-
- String getName() {
- return "ISO-2022-CN";
- }
-
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
- }
- }
-
-}
-
+/*
+*******************************************************************************
+* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*/
+package org.apache.tika.parser.txt;
+
+/**
+ * class CharsetRecog_2022 part of the ICU charset detection imlementation.
+ * This is a superclass for the individual detectors for
+ * each of the detectable members of the ISO 2022 family
+ * of encodings.
+ * <p/>
+ * The separate classes are nested within this class.
+ *
+ * @internal
+ */
+abstract class CharsetRecog_2022 extends CharsetRecognizer {
+
+
+ /**
+ * Matching function shared among the 2022 detectors JP, CN and KR
+ * Counts up the number of legal an unrecognized escape sequences in
+ * the sample of text, and computes a score based on the total number &
+ * the proportion that fit the encoding.
+ *
+ * @param text the byte buffer containing text to analyse
+ * @param textLen the size of the text in the byte.
+ * @param escapeSequences the byte escape sequences to test for.
+ * @return match quality, in the range of 0-100.
+ */
+ int match(byte[] text, int textLen, byte[][] escapeSequences) {
+ int i, j;
+ int escN;
+ int hits = 0;
+ int misses = 0;
+ int shifts = 0;
+ int quality;
+ scanInput:
+ for (i = 0; i < textLen; i++) {
+ if (text[i] == 0x1b) {
+ checkEscapes:
+ for (escN = 0; escN < escapeSequences.length; escN++) {
+ byte[] seq = escapeSequences[escN];
+
+ if ((textLen - i) < seq.length) {
+ continue checkEscapes;
+ }
+
+ for (j = 1; j < seq.length; j++) {
+ if (seq[j] != text[i + j]) {
+ continue checkEscapes;
+ }
+ }
+
+ hits++;
+ i += seq.length - 1;
+ continue scanInput;
+ }
+
+ misses++;
+ }
+
+ if (text[i] == 0x0e || text[i] == 0x0f) {
+ // Shift in/out
+ shifts++;
+ }
+ }
+
+ if (hits == 0) {
+ return 0;
+ }
+
+ //
+ // Initial quality is based on relative proportion of recongized vs.
+ // unrecognized escape sequences.
+ // All good: quality = 100;
+ // half or less good: quality = 0;
+ // linear inbetween.
+ quality = (100 * hits - 100 * misses) / (hits + misses);
+
+ // Back off quality if there were too few escape sequences seen.
+ // Include shifts in this computation, so that KR does not get penalized
+ // for having only a single Escape sequence, but many shifts.
+ if (hits + shifts < 5) {
+ quality -= (5 - (hits + shifts)) * 10;
+ }
+
+ if (quality < 0) {
+ quality = 0;
+ }
+ return quality;
+ }
+
+
+ static class CharsetRecog_2022JP extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
+ {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990
+ {0x1b, 0x24, 0x40}, // JIS C 6226-1978
+ {0x1b, 0x24, 0x41}, // GB 2312-80
+ {0x1b, 0x24, 0x42}, // JIS X 208-1983
+ {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997
+ {0x1b, 0x28, 0x42}, // ASCII
+ {0x1b, 0x28, 0x48}, // JIS-Roman
+ {0x1b, 0x28, 0x49}, // Half-width katakana
+ {0x1b, 0x28, 0x4a}, // JIS-Roman
+ {0x1b, 0x2e, 0x41}, // ISO 8859-1
+ {0x1b, 0x2e, 0x46} // ISO 8859-7
+ };
+
+ String getName() {
+ return "ISO-2022-JP";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+ }
+
+ static class CharsetRecog_2022KR extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x29, 0x43}
+ };
+
+ String getName() {
+ return "ISO-2022-KR";
+ }
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+
+ }
+
+ static class CharsetRecog_2022CN extends CharsetRecog_2022 {
+ private byte[][] escapeSequences = {
+ {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
+ {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
+ {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
+ {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
+ {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
+ {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
+ {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
+ {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
+ {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
+ {0x1b, 0x4e}, // SS2
+ {0x1b, 0x4f}, // SS3
+ };
+
+ String getName() {
+ return "ISO-2022-CN";
+ }
+
+
+ int match(CharsetDetector det) {
+ return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ }
+ }
+
+}
+
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
index ad69fa0..55a3957 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
@@ -1,99 +1,99 @@
-/**
- * ******************************************************************************
- * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- * ******************************************************************************
- */
-package org.apache.tika.parser.txt;
-
-/**
- * Charset recognizer for UTF-8
- *
- * @internal
- */
-class CharsetRecog_UTF8 extends CharsetRecognizer {
-
- String getName() {
- return "UTF-8";
- }
-
- /* (non-Javadoc)
- * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
- */
- int match(CharsetDetector det) {
- boolean hasBOM = false;
- int numValid = 0;
- int numInvalid = 0;
- byte input[] = det.fRawInput;
- int i;
- int trailBytes = 0;
- int confidence;
-
- if (det.fRawLength >= 3 &&
- (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
- hasBOM = true;
- }
-
- // Scan for multi-byte sequences
- for (i = 0; i < det.fRawLength; i++) {
- int b = input[i];
- if ((b & 0x80) == 0) {
- continue; // ASCII
- }
-
- // Hi bit on char found. Figure out how long the sequence should be
- if ((b & 0x0e0) == 0x0c0) {
- trailBytes = 1;
- } else if ((b & 0x0f0) == 0x0e0) {
- trailBytes = 2;
- } else if ((b & 0x0f8) == 0xf0) {
- trailBytes = 3;
- } else {
- numInvalid++;
- if (numInvalid > 5) {
- break;
- }
- trailBytes = 0;
- }
-
- // Verify that we've got the right number of trail bytes in the sequence
- for (; ; ) {
- i++;
- if (i >= det.fRawLength) {
- break;
- }
- b = input[i];
- if ((b & 0xc0) != 0x080) {
- numInvalid++;
- break;
- }
- if (--trailBytes == 0) {
- numValid++;
- break;
- }
- }
-
- }
-
- // Cook up some sort of confidence score, based on presense of a BOM
- // and the existence of valid and/or invalid multi-byte sequences.
- confidence = 0;
- if (hasBOM && numInvalid == 0) {
- confidence = 100;
- } else if (hasBOM && numValid > numInvalid * 10) {
- confidence = 80;
- } else if (numValid > 3 && numInvalid == 0) {
- confidence = 100;
- } else if (numValid > 0 && numInvalid == 0) {
- confidence = 80;
- } else if (numValid == 0 && numInvalid == 0) {
- // Plain ASCII.
- confidence = 10;
- } else if (numValid > numInvalid * 10) {
- // Probably corruput utf-8 data. Valid sequences aren't likely by chance.
- confidence = 25;
- }
- return confidence;
- }
-
-}
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+/**
+ * Charset recognizer for UTF-8.
+ * <p>
+ * Confidence is derived from the presence of a UTF-8 byte-order mark and
+ * from counting well-formed versus malformed multi-byte sequences in the
+ * raw input bytes.
+ *
+ * @internal
+ */
+class CharsetRecog_UTF8 extends CharsetRecognizer {
+
+ String getName() {
+ return "UTF-8";
+ }
+
+ /* (non-Javadoc)
+ * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
+ */
+ int match(CharsetDetector det) {
+ boolean hasBOM = false;
+ int numValid = 0; // count of complete, well-formed multi-byte sequences
+ int numInvalid = 0; // count of malformed lead or trail bytes
+ byte input[] = det.fRawInput;
+ int i;
+ int trailBytes = 0;
+ int confidence;
+
+ // The UTF-8 byte-order mark is the three bytes EF BB BF.
+ if (det.fRawLength >= 3 &&
+ (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
+ hasBOM = true;
+ }
+
+ // Scan for multi-byte sequences
+ for (i = 0; i < det.fRawLength; i++) {
+ int b = input[i];
+ if ((b & 0x80) == 0) {
+ continue; // ASCII
+ }
+
+ // Hi bit on char found. Figure out how long the sequence should be
+ if ((b & 0x0e0) == 0x0c0) {
+ trailBytes = 1; // 110xxxxx: lead byte of a two-byte sequence
+ } else if ((b & 0x0f0) == 0x0e0) {
+ trailBytes = 2; // 1110xxxx: lead byte of a three-byte sequence
+ } else if ((b & 0x0f8) == 0xf0) {
+ trailBytes = 3; // 11110xxx: lead byte of a four-byte sequence
+ } else {
+ // High bit set but not a valid lead byte; bail out once the
+ // input looks hopeless (more than 5 errors).
+ numInvalid++;
+ if (numInvalid > 5) {
+ break;
+ }
+ trailBytes = 0;
+ }
+
+ // Verify that we've got the right number of trail bytes in the sequence
+ for (; ; ) {
+ i++;
+ if (i >= det.fRawLength) {
+ break;
+ }
+ b = input[i];
+ if ((b & 0xc0) != 0x080) {
+ // Expected a 10xxxxxx continuation byte and got something else.
+ numInvalid++;
+ break;
+ }
+ if (--trailBytes == 0) {
+ numValid++;
+ break;
+ }
+ }
+
+ }
+
+ // Cook up some sort of confidence score, based on presence of a BOM
+ // and the existence of valid and/or invalid multi-byte sequences.
+ confidence = 0;
+ if (hasBOM && numInvalid == 0) {
+ confidence = 100;
+ } else if (hasBOM && numValid > numInvalid * 10) {
+ confidence = 80;
+ } else if (numValid > 3 && numInvalid == 0) {
+ confidence = 100;
+ } else if (numValid > 0 && numInvalid == 0) {
+ confidence = 80;
+ } else if (numValid == 0 && numInvalid == 0) {
+ // Plain ASCII.
+ confidence = 10;
+ } else if (numValid > numInvalid * 10) {
+ // Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
+ confidence = 25;
+ }
+ return confidence;
+ }
+
+}