You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/27 00:41:17 UTC
[1/3] tika git commit: TIKA-2041,
upgrade ICU4j's charset detector to avoid multithreading bug.
Repository: tika
Updated Branches:
refs/heads/2.x f89887d2f -> 9f6c71fa6
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 3adaeee..5e8bddc 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -33,15 +33,29 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
+import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletionService;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
import java.util.regex.Pattern;
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -1128,4 +1142,102 @@ public class HtmlParserTest extends TikaTest {
XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata());
assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", r.xml);
}
+
+ @Test
+ public void testMultiThreadingEncodingDetection() throws Exception {
+ List<EncodingDetector> detectors = new ArrayList<>();
+ ServiceLoader loader =
+ new ServiceLoader(AutoDetectReader.class.getClassLoader());
+ detectors.addAll(loader.loadServiceProviders(EncodingDetector.class));
+ for (EncodingDetector detector : detectors) {
+ testDetector(detector);
+ }
+ }
+
+ private void testDetector(EncodingDetector detector) throws Exception {
+ Map<String, String> encodings = new ConcurrentHashMap<>();
+ List<String> tmpPaths = new ArrayList<>();
+ for (String p : new String[] {
+ "testHTML.html",
+ "testHTML_utf8.html",
+ "russian.cp866.txt",
+ "resume.html",
+ "test.html"
+
+ }) {
+ String testDocPath = "/test-documents/"+p;
+ String encoding = getEncoding(detector, testDocPath);
+ encodings.put(testDocPath, encoding);
+ //add 5 copies to add more chances for failure
+ for (int i = 0; i < 5; i++) {
+ tmpPaths.add(testDocPath);
+ }
+ }
+
+ Collections.shuffle(tmpPaths);
+
+ ArrayBlockingQueue<String> paths = new ArrayBlockingQueue<>(tmpPaths.size());
+ paths.addAll(tmpPaths);
+ int numThreads = paths.size()+1;
+ ExecutorService ex = Executors.newFixedThreadPool(numThreads);
+ CompletionService<String> completionService =
+ new ExecutorCompletionService<>(ex);
+
+ for (int i = 0; i < numThreads; i++) {
+ completionService.submit(new EncodingDetectorRunner(paths, encodings, detector));
+ }
+ int completed = 0;
+ while (completed < numThreads) {
+ Future<String> future = completionService.take();
+
+ if (future.isDone() &&
+ //will trigger ExecutionException if an IOException
+ //was thrown during call
+ EncodingDetectorRunner.DONE.equals(future.get())) {
+ completed++;
+ }
+ }
+ }
+
+ private class EncodingDetectorRunner implements Callable<String> {
+
+ final static String DONE = "done";
+ private final ArrayBlockingQueue<String> paths;
+ private final Map<String, String> encodings;
+ private final EncodingDetector detector;
+ private EncodingDetectorRunner(ArrayBlockingQueue<String> paths,
+ Map<String, String> encodings, EncodingDetector detector) {
+ this.paths = paths;
+ this.encodings = encodings;
+ this.detector = detector;
+ }
+
+ @Override
+ public String call() throws IOException {
+ for (int i = 0; i < encodings.size(); i++) {
+ String p = paths.poll();
+ if (p == null) {
+ return DONE;
+ }
+ String detectedEncoding = getEncoding(detector, p);
+ String trueEncoding = encodings.get(p);
+ assertEquals( "detector class="+detector.getClass() + " : file=",
+ trueEncoding, detectedEncoding);
+
+ }
+ return DONE;
+ }
+ }
+
+ String getEncoding(EncodingDetector detector, String p) throws IOException {
+ try (InputStream is = TikaInputStream.get(getClass().getResourceAsStream(p))) {
+ Charset charset = detector.detect(is, new Metadata());
+ if (charset == null) {
+ return "NULL";
+ } else {
+ return charset.toString();
+ }
+ }
+ }
+
}
[3/3] tika git commit: TIKA-2041,
upgrade ICU4j's charset detector to avoid multithreading bug.
Posted by ta...@apache.org.
TIKA-2041, upgrade ICU4j's charset detector to avoid multithreading bug.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9f6c71fa
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9f6c71fa
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9f6c71fa
Branch: refs/heads/2.x
Commit: 9f6c71fa69eaae558aff85cfa0dce72bca08fd4e
Parents: f89887d
Author: tballison <ta...@mitre.org>
Authored: Tue Jul 26 20:41:10 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jul 26 20:41:10 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 3 +
.../apache/tika/parser/txt/CharsetDetector.java | 437 ++++-----
.../apache/tika/parser/txt/CharsetMatch.java | 170 ++--
.../tika/parser/txt/CharsetRecog_2022.java | 28 +-
.../tika/parser/txt/CharsetRecog_UTF8.java | 24 +-
.../tika/parser/txt/CharsetRecog_Unicode.java | 99 +-
.../tika/parser/txt/CharsetRecog_mbcs.java | 44 +-
.../tika/parser/txt/CharsetRecog_sbcs.java | 903 +++++++++----------
.../tika/parser/txt/CharsetRecognizer.java | 31 +-
.../tika/parser/txt/Icu4jEncodingDetector.java | 8 +
.../apache/tika/parser/html/HtmlParserTest.java | 112 +++
11 files changed, 964 insertions(+), 895 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e5b5050..abfbdec 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
Release 1.14 - ???
+ * Upgrade ICU4J charset detection components to fix multithreading
+ bug (TIKA-2041).
+
* Upgrade to Jackcess 2.1.4 (TIKA-2039).
* Maintain more significant digits in cells of "General" format
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index f9df9e0..1ee7f28 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/**
* ******************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
+ * Copyright (C) 2005-2016, International Business Machines Corporation and *
* others. All Rights Reserved. *
* ******************************************************************************
*/
@@ -9,30 +11,37 @@ package org.apache.tika.parser.txt;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
-import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.List;
/**
+ * NOTE: This was copied from ICU4J with two modifications:
+ * Apache Tika added the EBCDIC-500 family of detectors, and
+ * we increased the buffer to 12000 bytes.
+ *
+ * <p>
+ *
* <code>CharsetDetector</code> provides a facility for detecting the
* charset or encoding of character data in an unknown format.
* The input data can either be from an input stream or an array of bytes.
* The result of the detection operation is a list of possibly matching
* charsets, or, for simple use, you can just ask for a Java Reader that
* will will work over the input data.
- * <p/>
+ * <p>
* Character set detection is at best an imprecise operation. The detection
* process will attempt to identify the charset that best matches the characteristics
* of the byte data, but the process is partly statistical in nature, and
* the results can not be guaranteed to always be correct.
- * <p/>
+ * <p>
* For best accuracy in charset detection, the input data should be primarily
* in a single language, and a minimum of a few hundred bytes worth of plain text
* in the language are needed. The detection process will attempt to
* ignore html or xml style markup that could otherwise obscure the content.
- * <p/>
+ * <p>
+ *
* @stable ICU 3.4
*/
public class CharsetDetector {
@@ -47,13 +56,58 @@ public class CharsetDetector {
// actually choose the "real" charset. All assuming that the application just
// wants the data, and doesn't care about a char set name.
- private static final int kBufSize = 12000;
- private static final int MAX_CONFIDENCE = 100;
- private static String[] fCharsetNames;
+ private static final int kBufSize = 12000;//legacy value; more recent value is 8000
/*
* List of recognizers for all charsets known to the implementation.
*/
- private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+ private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;
+
+ static {
+ List<CSRecognizerInfo> list = new ArrayList<>();
+
+ list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));
+
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));
+
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));
+
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it(), true));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl(), true));
+
+ // IBM 420/424 recognizers are disabled by default
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
+ list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));
+
+ ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
+ }
+
/*
* The following items are accessed by individual CharsetRecongizers during
* the recognition process
@@ -61,26 +115,27 @@ public class CharsetDetector {
*/
byte[] fInputBytes = // The text to be checked. Markup will have been
new byte[kBufSize]; // removed if appropriate.
- int fInputLen; // Length of the byte data in fInputText.
+ int fInputLen; // Length of the byte data in fInputBytes.
short fByteStats[] = // byte frequency statistics for the input text.
new short[256]; // Value is percent, not absolute.
boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
false;
String fDeclaredEncoding;
- //
- // Stuff private to CharsetDetector
- //
byte[] fRawInput; // Original, untouched input bytes.
// If user gave us a byte array, this is it.
// If user gave us a stream, it's read to a
// buffer here.
int fRawLength; // Length of data in fRawInput array.
InputStream fInputStream; // User's input stream, or null if the user
- boolean fStripTags = // If true, setText() will strip tags from input text.
+ //
+ // Stuff private to CharsetDetector
+ //
+ private boolean fStripTags = // If true, setText() will strip tags from input text.
false;
+ private boolean[] fEnabledRecognizers; // If not null, active set of charset recognizers had
/**
- * Constructor
+ * Constructor
*
* @stable ICU 3.4
*/
@@ -88,149 +143,73 @@ public class CharsetDetector {
}
/**
- * Get the names of all char sets that can be recognized by the char set detector.
- *
- * @return an array of the names of all charsets that can be recognized
- * by the charset detector.
+ * Get the names of all charsets supported by <code>CharsetDetector</code> class.
+ * <p>
+ * <b>Note:</b> Multiple different charset encodings in a same family may use
+ * a single shared name in this implementation. For example, this method returns
+ * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
+ * (Windows Latin 1). However, actual detection result could be "windows-1252"
+ * when the input data matches Latin 1 code points with any points only available
+ * in "windows-1252".
*
+ * @return an array of the names of all charsets supported by
+ * <code>CharsetDetector</code> class.
* @stable ICU 3.4
*/
public static String[] getAllDetectableCharsets() {
- return fCharsetNames;
- }
-
- /*
- * Create the singleton instances of the CharsetRecognizer classes
- */
- private static ArrayList<CharsetRecognizer> createRecognizers() {
- ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
-
- recognizers.add(new CharsetRecog_UTF8());
-
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
- recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
-
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
- recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
- recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
-
- recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
-
- // Create an array of all charset names, as a side effect.
- // Needed for the getAllDetectableCharsets() API.
- String[] charsetNames = new String[recognizers.size()];
- int out = 0;
-
- for (CharsetRecognizer recognizer : recognizers) {
- String name = recognizer.getName();
-
- if (out == 0 || !name.equals(charsetNames[out - 1])) {
- charsetNames[out++] = name;
- }
+ String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
+ for (int i = 0; i < allCharsetNames.length; i++) {
+ allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
}
-
- fCharsetNames = new String[out];
- System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
-
- return recognizers;
+ return allCharsetNames;
}
/**
* Set the declared encoding for charset detection.
- * The declared encoding of an input text is an encoding obtained
- * from an http header or xml declaration or similar source that
- * can be provided as additional information to the charset detector.
- * A match between a declared encoding and a possible detected encoding
- * will raise the quality of that detected encoding by a small delta,
- * and will also appear as a "reason" for the match.
- * <p/>
+ * The declared encoding of an input text is an encoding obtained
+ * from an http header or xml declaration or similar source that
+ * can be provided as additional information to the charset detector.
+ * A match between a declared encoding and a possible detected encoding
+ * will raise the quality of that detected encoding by a small delta,
+ * and will also appear as a "reason" for the match.
+ * <p>
* A declared encoding that is incompatible with the input data being
* analyzed will not be added to the list of possible encodings.
*
- * @param encoding The declared encoding
- *
+ * @param encoding The declared encoding
* @stable ICU 3.4
*/
public CharsetDetector setDeclaredEncoding(String encoding) {
- setCanonicalDeclaredEncoding(encoding);
+ fDeclaredEncoding = encoding;
return this;
}
+ // Value is rounded up, so zero really means zero occurences.
/**
* Set the input text (byte) data whose charset is to be detected.
*
* @param in the input text of unknown encoding
- *
* @return This CharsetDetector
- *
* @stable ICU 3.4
*/
public CharsetDetector setText(byte[] in) {
fRawInput = in;
fRawLength = in.length;
- MungeInput();
-
return this;
}
- // Value is rounded up, so zero really means zero occurences.
/**
* Set the input text (byte) data whose charset is to be detected.
- * <p/>
- * The input stream that supplies the character data must have markSupported()
- * == true; the charset detection process will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
+ * <p>
+ * The input stream that supplies the character data must have markSupported()
+ * == true; the charset detection process will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
*
* @param in the input text of unknown encoding
- *
* @return This CharsetDetector
- *
* @stable ICU 3.4
*/
@@ -259,21 +238,20 @@ public class CharsetDetector {
/**
* Return the charset that best matches the supplied input data.
- *
+ * <p>
* Note though, that because the detection
* only looks at the start of the input data,
* there is a possibility that the returned charset will fail to handle
* the full set of input data.
- * <p/>
+ * <p>
* Raise an exception if
- * <ul>
- * <li>no charset appears to match the data.</li>
- * <li>no input text has been provided</li>
- * </ul>
+ * <ul>
+ * <li>no charset appears to match the data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
*
* @return a CharsetMatch object representing the best matching charset, or
- * <code>null</code> if there are no matches.
- *
+ * <code>null</code> if there are no matches.
* @stable ICU 3.4
*/
public CharsetMatch detect() {
@@ -291,48 +269,36 @@ public class CharsetDetector {
}
/**
- * Return an array of all charsets that appear to be plausible
- * matches with the input data. The array is ordered with the
- * best quality match first.
- * <p/>
+ * Return an array of all charsets that appear to be plausible
+ * matches with the input data. The array is ordered with the
+ * best quality match first.
+ * <p>
* Raise an exception if
- * <ul>
- * <li>no charsets appear to match the input data.</li>
- * <li>no input text has been provided</li>
- * </ul>
+ * <ul>
+ * <li>no charsets appear to match the input data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
*
* @return An array of CharsetMatch objects representing possibly matching charsets.
- *
* @stable ICU 3.4
*/
public CharsetMatch[] detectAll() {
- CharsetRecognizer csr;
- int i;
- int detectResults;
- int confidence;
- ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
+ ArrayList<CharsetMatch> matches = new ArrayList<>();
+
+ MungeInput(); // Strip html markup, collect byte stats.
// Iterate over all possible charsets, remember all that
// give a match quality > 0.
- for (i = 0; i < fCSRecognizers.size(); i++) {
- csr = fCSRecognizers.get(i);
- detectResults = csr.match(this);
- confidence = detectResults & 0x000000ff;
- if (confidence > 0) {
- // Just to be safe, constrain
- confidence = Math.min(confidence, MAX_CONFIDENCE);
-
- // Apply charset hint.
- if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
- // Reduce lack of confidence (delta between "sure" and current) by 50%.
- confidence += (MAX_CONFIDENCE - confidence) / 2;
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+ boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
+ if (active) {
+ CharsetMatch m = rcinfo.recognizer.match(this);
+ if (m != null) {
+ matches.add(m);
}
-
- CharsetMatch m = new CharsetMatch(this, csr, confidence);
- matches.add(m);
}
}
-
Collections.sort(matches); // CharsetMatch compares on confidence
Collections.reverse(matches); // Put best match first.
CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
@@ -343,27 +309,25 @@ public class CharsetDetector {
/**
* Autodetect the charset of an inputStream, and return a Java Reader
* to access the converted input data.
- * <p/>
+ * <p>
* This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
- * <p/>
- * For the input stream that supplies the character data, markSupported()
- * must be true; the charset detection will read a small amount of data,
- * then return the stream to its original position via
- * the InputStream.reset() operation. The exact amount that will
- * be read depends on the characteristics of the data itself.
- *<p/>
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+ * <p>
+ * For the input stream that supplies the character data, markSupported()
+ * must be true; the charset detection will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ * <p>
* Raise an exception if no charsets appear to match the input data.
*
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
+ * @param in The source of the byte data in the unknown charset.
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
* @stable ICU 3.4
*/
public Reader getReader(InputStream in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
+ fDeclaredEncoding = declaredEncoding;
try {
setText(in);
@@ -379,25 +343,24 @@ public class CharsetDetector {
return null;
}
}
+ // gave us a byte array.
/**
* Autodetect the charset of an inputStream, and return a String
* containing the converted input data.
- * <p/>
+ * <p>
* This is a convenience method that is equivalent to
- * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
- *<p/>
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+ * <p>
* Raise an exception if no charsets appear to match the input data.
*
- * @param in The source of the byte data in the unknown charset.
- *
- * @param declaredEncoding A declared encoding for the data, if available,
- * or null or an empty string if none is available.
- *
+ * @param in The source of the byte data in the unknown charset.
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
* @stable ICU 3.4
*/
public String getString(byte[] in, String declaredEncoding) {
- setCanonicalDeclaredEncoding(declaredEncoding);
+ fDeclaredEncoding = declaredEncoding;
try {
setText(in);
@@ -413,30 +376,27 @@ public class CharsetDetector {
return null;
}
}
- // gave us a byte array.
/**
* Test whether or not input filtering is enabled.
*
* @return <code>true</code> if input text will be filtered.
- *
- * @see #enableInputFilter
- *
* @stable ICU 3.4
+ * @see #enableInputFilter
*/
public boolean inputFilterEnabled() {
return fStripTags;
}
+ // been changed from the default. The array index is
+ // corresponding to ALL_RECOGNIZER. See setDetectableCharset().
/**
* Enable filtering of input text. If filtering is enabled,
- * text within angle brackets ("<" and ">") will be removed
+ * text within angle brackets ("<" and ">") will be removed
* before detection.
*
* @param filter <code>true</code> to enable input text filtering.
- *
* @return The previous setting.
- *
* @stable ICU 3.4
*/
public boolean enableInputFilter(boolean filter) {
@@ -447,22 +407,6 @@ public class CharsetDetector {
return previous;
}
- /**
- * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
- *
- * @param encoding - name of character encoding
- */
- private void setCanonicalDeclaredEncoding(String encoding) {
- if ((encoding == null) || encoding.isEmpty()) {
- return;
- }
-
- Charset cs = Charset.forName(encoding);
- if (cs != null) {
- fDeclaredEncoding = cs.name();
- }
- }
-
/*
* MungeInput - after getting a set of raw input data to be analyzed, preprocess
* it by removing what appears to be html markup.
@@ -541,4 +485,83 @@ public class CharsetDetector {
}
}
}
-}
+
+ /**
+ * Get the names of charsets that can be recognized by this CharsetDetector instance.
+ *
+ * @return an array of the names of charsets that can be recognized by this CharsetDetector
+ * instance.
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ public String[] getDetectableCharsets() {
+ List<String> csnames = new ArrayList<>(ALL_CS_RECOGNIZERS.size());
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+ boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
+ if (active) {
+ csnames.add(rcinfo.recognizer.getName());
+ }
+ }
+ return csnames.toArray(new String[csnames.size()]);
+ }
+
+ /**
+ * Enable or disable individual charset encoding.
+ * A name of charset encoding must be included in the names returned by
+ * {@link #getAllDetectableCharsets()}.
+ *
+ * @param encoding the name of charset encoding.
+ * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
+ * charset encoding.
+ * @return A reference to this <code>CharsetDetector</code>.
+ * @throws IllegalArgumentException when the name of charset encoding is
+ * not supported.
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
+ int modIdx = -1;
+ boolean isDefaultVal = false;
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
+ if (csrinfo.recognizer.getName().equals(encoding)) {
+ modIdx = i;
+ isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
+ break;
+ }
+ }
+ if (modIdx < 0) {
+ // No matching encoding found
+ throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
+ }
+
+ if (fEnabledRecognizers == null && !isDefaultVal) {
+ // Create an array storing the non default setting
+ fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
+
+ // Initialize the array with default info
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
+ }
+ }
+
+ if (fEnabledRecognizers != null) {
+ fEnabledRecognizers[modIdx] = enabled;
+ }
+
+ return this;
+ }
+
+ private static class CSRecognizerInfo {
+ CharsetRecognizer recognizer;
+ boolean isDefaultEnabled;
+
+ CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
+ this.recognizer = recognizer;
+ this.isDefaultEnabled = isDefaultEnabled;
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index 22219ab..40a10ce 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/**
* ******************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and *
+ * Copyright (C) 2005-2016, International Business Machines Corporation and *
* others. All Rights Reserved. *
* ******************************************************************************
*/
@@ -18,63 +20,56 @@ import java.io.Reader;
* as a possible encoding for a set of input data. From an instance of this
* class, you can ask for a confidence level in the charset identification,
* or for Java Reader or String to access the original byte data in Unicode form.
- * <p/>
+ * <p>
* Instances of this class are created only by CharsetDetectors.
- * <p/>
+ * <p>
* Note: this class has a natural ordering that is inconsistent with equals.
- * The natural ordering is based on the match confidence value.
+ * The natural ordering is based on the match confidence value.
*
* @stable ICU 3.4
*/
public class CharsetMatch implements Comparable<CharsetMatch> {
- /**
- * Bit flag indicating the match is based on the the encoding scheme.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int ENCODING_SCHEME = 1;
- /**
- * Bit flag indicating the match is based on the presence of a BOM.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int BOM = 2;
- /**
- * Bit flag indicating he match is based on the declared encoding.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int DECLARED_ENCODING = 4;
- /**
- * Bit flag indicating the match is based on language statistics.
- *
- * @see #getMatchType
- * @stable ICU 3.4
- */
- static public final int LANG_STATISTICS = 8;
//
// Private Data
//
private int fConfidence;
- private CharsetRecognizer fRecognizer;
private byte[] fRawInput = null; // Original, untouched input bytes.
// If user gave us a byte array, this is it.
private int fRawLength; // Length of data in fRawInput array.
private InputStream fInputStream = null; // User's input stream, or null if the user
+ private String fCharsetName; // The name of the charset this CharsetMatch
+ // represents. Filled in by the recognizer.
+ private String fLang; // The language, if one was determined by
/*
* Constructor. Implementation internal
*/
CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
- fRecognizer = rec;
fConfidence = conf;
- // The references to the original aplication input data must be copied out
+ // The references to the original application input data must be copied out
+ // of the charset recognizer to here, in case the application resets the
+ // recognizer before using this CharsetMatch.
+ if (det.fInputStream == null) {
+ // We only want the existing input byte data if it came straight from the user,
+ // not if it is just the head of a stream.
+ fRawInput = det.fRawInput;
+ fRawLength = det.fRawLength;
+ }
+ fInputStream = det.fInputStream;
+ fCharsetName = rec.getName();
+ fLang = rec.getLanguage();
+ }
+
+ /*
+ * Constructor. Implementation internal
+ */
+ CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
+ fConfidence = conf;
+
+ // The references to the original application input data must be copied out
// of the charset recognizer to here, in case the application resets the
// recognizer before using this CharsetMatch.
if (det.fInputStream == null) {
@@ -84,19 +79,20 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
fRawLength = det.fRawLength;
}
fInputStream = det.fInputStream;
+ fCharsetName = csName;
+ fLang = lang;
}
/**
* Create a java.io.Reader for reading the Unicode character data corresponding
* to the original byte data supplied to the Charset detect operation.
- * <p/>
+ * <p>
* CAUTION: if the source of the byte data was an InputStream, a Reader
* can be created for only one matching char set using this method. If more
* than one charset needs to be tried, the caller will need to reset
* the InputStream and create InputStreamReaders itself, based on the charset name.
*
* @return the Reader for the Unicode character data.
- *
* @stable ICU 3.4
*/
public Reader getReader() {
@@ -119,10 +115,9 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
* to the original byte data supplied to the Charset detect operation.
*
* @return a String created from the converted input data.
- *
* @stable ICU 3.4
*/
- public String getString() throws java.io.IOException {
+ public String getString() throws IOException {
return getString(-1);
}
@@ -138,13 +133,12 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
* source of the data is an input stream, or -1 for
* unlimited length.
* @return a String created from the converted input data.
- *
* @stable ICU 3.4
*/
- public String getString(int maxLength) throws java.io.IOException {
+ public String getString(int maxLength) throws IOException {
String result = null;
if (fInputStream != null) {
- StringBuffer sb = new StringBuffer();
+ StringBuilder sb = new StringBuilder();
char[] buffer = new char[1024];
Reader reader = getReader();
int max = maxLength < 0 ? Integer.MAX_VALUE : maxLength;
@@ -159,7 +153,17 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
return sb.toString();
} else {
- result = new String(fRawInput, getName());
+ String name = getName();
+ /*
+ * getName() may return a name with a suffix '_rtl' or '_ltr'. Such a name cannot
+ * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or '_ltr'
+ * should be stripped off before creating the string.
+ */
+ int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
+ if (startSuffix > 0) {
+ name = name.substring(0, startSuffix);
+ }
+ result = new String(fRawInput, name);
}
return result;
@@ -172,7 +176,6 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
* charset.
*
* @return the confidence in the charset match
- *
* @stable ICU 3.4
*/
public int getConfidence() {
@@ -180,26 +183,6 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
}
/**
- * Return flags indicating what it was about the input data
- * that caused this charset to be considered as a possible match.
- * The result is a bitfield containing zero or more of the flags
- * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
- * A result of zero means no information is available.
- * <p>
- * Note: currently, this method always returns zero.
- * <p>
- *
- * @return the type of match found for this charset.
- *
- * @draft ICU 3.4
- * @provisional This API might change or be removed in a future release.
- */
- public int getMatchType() {
-// TODO: create a list of enum-like constants for common combinations of types of matches.
- return 0;
- }
-
- /**
* Get the name of the detected charset.
* The name will be one that can be used with other APIs on the
* platform that accept charset names. It is the "Canonical name"
@@ -207,40 +190,38 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
* charsets that are registered with the IANA charset registry,
* this is the MIME-preferred registered name.
*
- * @see java.nio.charset.Charset
- * @see java.io.InputStreamReader
- *
* @return The name of the charset.
- *
* @stable ICU 3.4
+ * @see java.nio.charset.Charset
+ * @see InputStreamReader
*/
public String getName() {
- return fRecognizer.getName();
+ return fCharsetName;
}
+ // gave us a byte array.
/**
* Get the ISO code for the language of the detected charset.
*
* @return The ISO code for the language or <code>null</code> if the language cannot be determined.
- *
* @stable ICU 3.4
*/
public String getLanguage() {
- return fRecognizer.getLanguage();
+ return fLang;
}
/**
* Compare to other CharsetMatch objects.
* Comparison is based on the match confidence value, which
- * allows CharsetDetector.detectAll() to order its results.
+ * allows CharsetDetector.detectAll() to order its results.
*
- * @param o the CharsetMatch object to compare against.
+ * @param other the CharsetMatch object to compare against.
* @return a negative integer, zero, or a positive integer as the
- * confidence level of this CharsetMatch
- * is less than, equal to, or greater than that of
- * the argument.
+ * confidence level of this CharsetMatch
+ * is less than, equal to, or greater than that of
+ * the argument.
* @throws ClassCastException if the argument is not a CharsetMatch.
- * @stable ICU 3.4
+ * @stable ICU 4.4
*/
public int compareTo(CharsetMatch other) {
int compareResult = 0;
@@ -251,36 +232,5 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
}
return compareResult;
}
-
- /**
- * compare this CharsetMatch to another based on confidence value
- * @param o the CharsetMatch object to compare against
- * @return true if equal
- */
- public boolean equals(Object o) {
- if (o instanceof CharsetMatch) {
- CharsetMatch that = (CharsetMatch) o;
- return (this.fConfidence == that.fConfidence);
- }
-
- return false;
- }
-
- /**
- * generates a hashCode based on the confidence value
- * @return the hashCode
- */
- public int hashCode() {
- return fConfidence;
- }
- // gave us a byte array.
-
- public String toString() {
- String s = "Match of " + fRecognizer.getName();
- if (fRecognizer.getLanguage() != null) {
- s += " in " + fRecognizer.getLanguage();
- }
- s += " with confidence " + fConfidence;
- return s;
- }
-}
+ // the recognizer during the detect operation.
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
index 129c9a8..d4805be 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
-* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
+* Copyright (C) 2005 - 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@@ -11,10 +13,8 @@ package org.apache.tika.parser.txt;
* This is a superclass for the individual detectors for
* each of the detectable members of the ISO 2022 family
* of encodings.
- * <p/>
+ * <p>
* The separate classes are nested within this class.
- *
- * @internal
*/
abstract class CharsetRecog_2022 extends CharsetRecognizer {
@@ -74,7 +74,7 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
//
// Initial quality is based on relative proportion of recognized vs.
- // unrecognized escape sequences.
+ // unrecognized escape sequences.
// All good: quality = 100;
// half or less good: quality = 0;
// linear inbetween.
@@ -114,8 +114,9 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
return "ISO-2022-JP";
}
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -128,10 +129,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
return "ISO-2022-KR";
}
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
-
}
static class CharsetRecog_2022CN extends CharsetRecog_2022 {
@@ -153,11 +154,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
return "ISO-2022-CN";
}
-
- int match(CharsetDetector det) {
- return match(det.fInputBytes, det.fInputLen, escapeSequences);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
}
-
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
index 55a3957..a5100bc 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/**
* ******************************************************************************
- * Copyright (C) 2005 - 2007, International Business Machines Corporation and *
+ * Copyright (C) 2005 - 2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
* ******************************************************************************
*/
@@ -8,8 +10,6 @@ package org.apache.tika.parser.txt;
/**
* Charset recognizer for UTF-8
- *
- * @internal
*/
class CharsetRecog_UTF8 extends CharsetRecognizer {
@@ -20,7 +20,7 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
/* (non-Javadoc)
* @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
*/
- int match(CharsetDetector det) {
+ CharsetMatch match(CharsetDetector det) {
boolean hasBOM = false;
int numValid = 0;
int numInvalid = 0;
@@ -50,10 +50,7 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
trailBytes = 3;
} else {
numInvalid++;
- if (numInvalid > 5) {
- break;
- }
- trailBytes = 0;
+ continue;
}
// Verify that we've got the right number of trail bytes in the sequence
@@ -72,7 +69,6 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
break;
}
}
-
}
// Cook up some sort of confidence score, based on presence of a BOM
@@ -87,13 +83,15 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
} else if (numValid > 0 && numInvalid == 0) {
confidence = 80;
} else if (numValid == 0 && numInvalid == 0) {
- // Plain ASCII.
- confidence = 10;
+ // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
+ // accepts ASCII with confidence = 10.
+ // TODO: add plain ASCII as an explicitly detected type.
+ confidence = 15;
} else if (numValid > numInvalid * 10) {
// Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
confidence = 25;
}
- return confidence;
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
index be6455f..a92acc1 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
@@ -1,20 +1,44 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
- * Copyright (C) 1996-2007, International Business Machines Corporation and *
+ * Copyright (C) 1996-2013, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*/
+
package org.apache.tika.parser.txt;
/**
* This class matches UTF-16 and UTF-32, both big- and little-endian. The
* BOM will be used if it is present.
- *
- * @internal
*/
abstract class CharsetRecog_Unicode extends CharsetRecognizer {
+ static int codeUnit16FromBytes(byte hi, byte lo) { // Combines two bytes into one 16-bit code unit in the range 0..0xFFFF; 'hi' supplies the upper eight bits.
+ return ((hi & 0xff) << 8) | (lo & 0xff); // masking with 0xff keeps negative byte values from sign-extending
+ }
+
+ // UTF-16 confidence calculation. Very simple minded, but better than nothing.
+ // Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
+ // and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
+ // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
+ // NULs should be rare in actual text.
+ static int adjustConfidence(int codeUnit, int confidence) { // Returns 'confidence' adjusted for one code unit and clamped to 0..100.
+ if (codeUnit == 0) { // a NUL code unit lowers the score by 10
+ confidence -= 10;
+ } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { // 0x20..0xff or line feed raises the score by 10
+ confidence += 10;
+ } // any other code unit leaves the score unchanged
+ if (confidence < 0) { // clamp the result to the range 0..100
+ confidence = 0;
+ } else if (confidence > 100) {
+ confidence = 100;
+ }
+ return confidence;
+ }
+
/* (non-Javadoc)
* @see com.ibm.icu.text.CharsetRecognizer#getName()
*/
@@ -23,22 +47,36 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
/* (non-Javadoc)
* @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
*/
- abstract int match(CharsetDetector det);
+ abstract CharsetMatch match(CharsetDetector det);
static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode {
String getName() {
return "UTF-16BE";
}
- int match(CharsetDetector det) {
+ CharsetMatch match(CharsetDetector det) {
byte[] input = det.fRawInput;
-
- if (input.length >= 2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
- return 100;
+ int confidence = 10;
+
+ int bytesToCheck = Math.min(input.length, 30);
+ for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) {
+ int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
+ if (charIndex == 0 && codeUnit == 0xFEFF) {
+ confidence = 100;
+ break;
+ }
+ confidence = adjustConfidence(codeUnit, confidence);
+ if (confidence == 0 || confidence == 100) {
+ break;
+ }
}
-
- // TODO: Do some statistics to check for unsigned UTF-16BE
- return 0;
+ if (bytesToCheck < 4 && confidence < 100) {
+ confidence = 0;
+ }
+ if (confidence > 0) {
+ return new CharsetMatch(det, this, confidence);
+ }
+ return null;
}
}
@@ -47,20 +85,29 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
return "UTF-16LE";
}
- int match(CharsetDetector det) {
+ CharsetMatch match(CharsetDetector det) {
byte[] input = det.fRawInput;
-
- if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE)) {
- // An LE BOM is present.
- if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
- // It is probably UTF-32 LE, not UTF-16
- return 0;
+ int confidence = 10;
+
+ int bytesToCheck = Math.min(input.length, 30);
+ for (int charIndex = 0; charIndex < bytesToCheck - 1; charIndex += 2) {
+ int codeUnit = codeUnit16FromBytes(input[charIndex + 1], input[charIndex]);
+ if (charIndex == 0 && codeUnit == 0xFEFF) {
+ confidence = 100;
+ break;
+ }
+ confidence = adjustConfidence(codeUnit, confidence);
+ if (confidence == 0 || confidence == 100) {
+ break;
}
- return 100;
}
-
- // TODO: Do some statistics to check for unsigned UTF-16LE
- return 0;
+ if (bytesToCheck < 4 && confidence < 100) {
+ confidence = 0;
+ }
+ if (confidence > 0) {
+ return new CharsetMatch(det, this, confidence);
+ }
+ return null;
}
}
@@ -69,7 +116,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
abstract String getName();
- int match(CharsetDetector det) {
+ CharsetMatch match(CharsetDetector det) {
byte[] input = det.fRawInput;
int limit = (det.fRawLength / 4) * 4;
int numValid = 0;
@@ -78,7 +125,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
int confidence = 0;
if (limit == 0) {
- return 0;
+ return null;
}
if (getChar(input, 0) == 0x0000FEFF) {
hasBOM = true;
@@ -110,7 +157,7 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
confidence = 25;
}
- return confidence;
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -136,4 +183,4 @@ abstract class CharsetRecog_Unicode extends CharsetRecognizer {
return "UTF-32LE";
}
}
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
index 35d2b4f..3c38cd0 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/*
****************************************************************************
- * Copyright (C) 2005-2008, International Business Machines Corporation and *
+ * Copyright (C) 2005-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
****************************************************************************
*
@@ -20,8 +22,6 @@ import java.util.Arrays;
* CharsetDetector class and kept in the global list of available
* encodings to be checked. The specific encoding being recognized
* is determined by subclass.
- *
- * @internal
*/
abstract class CharsetRecog_mbcs extends CharsetRecognizer {
@@ -46,7 +46,8 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
* bits 8-15: The match reason, an enum-like value.
*/
int match(CharsetDetector det, int[] commonChars) {
- int singleByteCharCount = 0;
+ @SuppressWarnings("unused")
+ int singleByteCharCount = 0; //TODO Do we really need this?
int doubleByteCharCount = 0;
int commonCharCount = 0;
int badCharCount = 0;
@@ -132,7 +133,7 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
* Get the next character (however many bytes it is) from the input data
* Subclasses for specific charset encodings must implement this function
* to get characters according to the rules of their encoding scheme.
- * <p/>
+ * <p>
* This function is not a method of class iteratedChar only because
* that would require a lot of extra derived classes, which is awkward.
*
@@ -156,14 +157,12 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
//
static class iteratedChar {
int charValue = 0; // 1-4 bytes from the raw input data
- int index = 0;
int nextIndex = 0;
boolean error = false;
boolean done = false;
void reset() {
charValue = 0;
- index = -1;
nextIndex = 0;
error = false;
done = false;
@@ -195,7 +194,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
it.error = false;
int firstByte;
firstByte = it.charValue = it.nextByte(det);
@@ -219,8 +217,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return true;
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
String getName() {
@@ -255,7 +254,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
it.error = false;
int firstByte;
firstByte = it.charValue = it.nextByte(det);
@@ -282,8 +280,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return true;
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
String getName() {
@@ -311,7 +310,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
* packed into an int.
*/
boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
it.error = false;
int firstByte = 0;
int secondByte = 0;
@@ -392,8 +390,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return "EUC-JP";
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
public String getLanguage() {
@@ -425,8 +424,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return "EUC-KR";
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
public String getLanguage() {
@@ -462,7 +462,6 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
* packed into an int.
*/
boolean nextChar(iteratedChar it, CharsetDetector det) {
- it.index = it.nextIndex;
it.error = false;
int firstByte = 0;
int secondByte = 0;
@@ -519,8 +518,9 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
return "GB18030";
}
- int match(CharsetDetector det) {
- return match(det, commonChars);
+ CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, commonChars);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
public String getLanguage() {
@@ -529,4 +529,4 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
}
-}
+}
\ No newline at end of file
[2/3] tika git commit: TIKA-2041,
upgrade ICU4j's charset detector to avoid multithreading bug.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
index 87f831b..32824be 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
@@ -1,57 +1,37 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/*
****************************************************************************
- * Copyright (C) 2005-2009, International Business Machines Corporation and *
+ * Copyright (C) 2005-2013, International Business Machines Corporation and *
* others. All Rights Reserved. *
************************************************************************** *
*
*/
-package org.apache.tika.parser.txt;
-import java.nio.ByteBuffer;
+package org.apache.tika.parser.txt;
/**
* This class recognizes single-byte encodings. Because the encoding scheme is so
* simple, language statistics are used to do the matching.
- * <p/>
- * The Recognizer works by first mapping from bytes in the encoding under test
- * into that Recognizer's ngram space. Normally this means performing a
- * lowercase, and excluding codepoints that don't correspond to numbers of
- * letters. (Accented letters may or may not be ignored or normalised, depending
- * on the needs of the ngrams)
- * Then, ngram analysis is run against the transformed text, and a confidence
- * is calculated.
- * <p/>
- * For many of our Recognizers, we have one ngram set per language in each
- * encoding, and do a simultanious language+charset detection.
- * <p/>
- * When adding new Recognizers, the easiest way is to byte map to an existing
- * encoding for which we have ngrams, excluding non text, and re-use the ngrams.
- *
- * @internal
*/
abstract class CharsetRecog_sbcs extends CharsetRecognizer {
- protected boolean haveC1Bytes = false;
-
/* (non-Javadoc)
* @see com.ibm.icu.text.CharsetRecognizer#getName()
*/
abstract String getName();
- /* (non-Javadoc)
- * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
- */
- abstract int match(CharsetDetector det);
-
int match(CharsetDetector det, int[] ngrams, byte[] byteMap) {
return match(det, ngrams, byteMap, (byte) 0x20);
}
int match(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar) {
NGramParser parser = new NGramParser(ngrams, byteMap);
+ return parser.parse(det, spaceChar);
+ }
- haveC1Bytes = det.fC1Bytes;
-
+ int matchIBM420(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar) {
+ NGramParser_IBM420 parser = new NGramParser_IBM420(ngrams, byteMap);
return parser.parse(det, spaceChar);
}
@@ -59,17 +39,14 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
// private static final int N_GRAM_SIZE = 3;
private static final int N_GRAM_MASK = 0xFFFFFF;
- private int byteIndex = 0;
+ protected int byteIndex = 0;
+ protected byte[] byteMap;
+ protected byte spaceChar;
private int ngram = 0;
-
private int[] ngramList;
- private byte[] byteMap;
-
private int ngramCount;
private int hitCount;
- private byte spaceChar;
-
public NGramParser(int[] theNgramList, byte[] theByteMap) {
ngramList = theNgramList;
byteMap = theByteMap;
@@ -129,7 +106,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
}
- private void addByte(int b) {
+ protected void addByte(int b) {
ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;
lookup(ngram);
}
@@ -142,14 +119,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return det.fInputBytes[byteIndex++] & 0xFF;
}
- public int parse(CharsetDetector det) {
- return parse(det, (byte) 0x20);
- }
-
- public int parse(CharsetDetector det, byte spaceCh) {
+ protected void parseCharacters(CharsetDetector det) {
int b;
boolean ignoreSpace = false;
- this.spaceChar = spaceCh;
while ((b = nextByte(det)) >= 0) {
byte mb = byteMap[b];
@@ -161,13 +133,21 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
}
ignoreSpace = (mb == spaceChar);
- } else if (mb == 0 && b != 0) {
- // Indicates an invalid character in the charset
- // Bump the ngram count up a bit to indicate uncertainty
- ngramCount += 4;
}
}
+ }
+
+ public int parse(CharsetDetector det) {
+ return parse(det, (byte) 0x20);
+ }
+
+ public int parse(CharsetDetector det, byte spaceCh) {
+
+ this.spaceChar = spaceCh;
+
+ parseCharacters(det);
+
// TODO: Is this OK? The buffer could have ended in the middle of a word...
addByte(spaceChar);
@@ -187,218 +167,262 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
}
}
- abstract static class CharsetRecog_8859_1 extends CharsetRecog_sbcs {
- protected static byte[] byteMap = {
-/* 0x00-0x07 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x08-0x0f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x10-0x17 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x18-0x1f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x20-0x27 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
-/* 0x28-0x2f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x30-0x37 */ (byte) 0x30, (byte) 0x31, (byte) 0x32, (byte) 0x33, (byte) 0x34, (byte) 0x35, (byte) 0x36, (byte) 0x37,
-/* 0x38-0x3f */ (byte) 0x38, (byte) 0x39, (byte) 0x40, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x40-0x47 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-/* 0x48-0x4f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-/* 0x50-0x57 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-/* 0x58-0x0f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x60-0x67 */ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
-/* 0x68-0x6f */ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-/* 0x70-0x77 */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
-/* 0x78-0x7f */ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x80-0x87 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x88-0x8f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x90-0x97 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0x98-0x9f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0xa0-0xa7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0xa8-0xaf */ (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0xb0-0xb7 */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
-/* 0xb8-0xbf */ (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
-/* 0xc0-0xc7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-/* 0xc8-0xcf */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-/* 0xd0-0xd7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
-/* 0xd8-0xdf */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF,
-/* 0xe0-0xe7 */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
-/* 0xe8-0xef */ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-/* 0xf0-0xf7 */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
-/* 0xf8-0xff */ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
+ static class NGramParser_IBM420 extends NGramParser {
+ protected static byte[] unshapeMap = {
+/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
+/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 4- */ (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x42, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x47, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
+/* 5- */ (byte) 0x50, (byte) 0x49, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x56, (byte) 0x58, (byte) 0x58, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
+/* 6- */ (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x63, (byte) 0x65, (byte) 0x65, (byte) 0x67, (byte) 0x67, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
+/* 7- */ (byte) 0x69, (byte) 0x71, (byte) 0x71, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x77, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D, (byte) 0x7E, (byte) 0x7F,
+/* 8- */ (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x80, (byte) 0x8B, (byte) 0x8B, (byte) 0x8D, (byte) 0x8D, (byte) 0x8F,
+/* 9- */ (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9E, (byte) 0x9E,
+/* A- */ (byte) 0x9E, (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x9E, (byte) 0xAB, (byte) 0xAB, (byte) 0xAD, (byte) 0xAD, (byte) 0xAF,
+/* B- */ (byte) 0xAF, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7, (byte) 0xB8, (byte) 0xB9, (byte) 0xB1, (byte) 0xBB, (byte) 0xBB, (byte) 0xBD, (byte) 0xBD, (byte) 0xBF,
+/* C- */ (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xBF, (byte) 0xCC, (byte) 0xBF, (byte) 0xCE, (byte) 0xCF,
+/* D- */ (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDA, (byte) 0xDC, (byte) 0xDC, (byte) 0xDC, (byte) 0xDF,
+/* E- */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
+/* F- */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
};
+ private byte alef = 0x00;
- public String getName() {
- return haveC1Bytes ? "windows-1252" : "ISO-8859-1";
- }
- }
-
- static class CharsetRecog_8859_1_da extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
- 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
- 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
- 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
- };
- public String getLanguage() {
- return "da";
+ public NGramParser_IBM420(int[] theNgramList, byte[] theByteMap) {
+ super(theNgramList, theByteMap);
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ private byte isLamAlef(byte b) {
+ if (b == (byte) 0xb2 || b == (byte) 0xb3) {
+ return (byte) 0x47;
+ } else if (b == (byte) 0xb4 || b == (byte) 0xb5) {
+ return (byte) 0x49;
+ } else if (b == (byte) 0xb8 || b == (byte) 0xb9) {
+ return (byte) 0x56;
+ } else
+ return (byte) 0x00;
}
- }
-
- static class CharsetRecog_8859_1_de extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
- 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
- 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
- 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
- };
- public String getLanguage() {
- return "de";
- }
+ /*
+ * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
+ * because CharsetDetector is dealing with bytes not Unicode code points. We could
+ * convert the bytes to Unicode code points but that would leave us dependent
+ * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
+ * of JDK can produce different results and therefore is also avoided.
+ */
+ private int nextByte(CharsetDetector det) {
+ if (byteIndex >= det.fInputLen || det.fInputBytes[byteIndex] == 0) {
+ return -1;
+ }
+ int next;
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
+ alef = isLamAlef(det.fInputBytes[byteIndex]);
+ if (alef != (byte) 0x00)
+ next = 0xB1 & 0xFF;
+ else
+ next = unshapeMap[det.fInputBytes[byteIndex] & 0xFF] & 0xFF;
- static class CharsetRecog_8859_1_en extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
- 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
- 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
- 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
- };
+ byteIndex++;
- public String getLanguage() {
- return "en";
+ return next;
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
+ protected void parseCharacters(CharsetDetector det) {
+ int b;
+ boolean ignoreSpace = false;
- static class CharsetRecog_8859_1_es extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
- 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
- 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
- 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
- };
+ while ((b = nextByte(det)) >= 0) {
+ byte mb = byteMap[b];
- public String getLanguage() {
- return "es";
- }
+ // TODO: 0x20 might not be a space in all character sets...
+ if (mb != 0) {
+ if (!(mb == spaceChar && ignoreSpace)) {
+ addByte(mb);
+ }
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
+ ignoreSpace = (mb == spaceChar);
+ }
+ if (alef != (byte) 0x00) {
+ mb = byteMap[alef & 0xFF];
- static class CharsetRecog_8859_1_fr extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
- 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
- 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
- 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
- };
+ // TODO: 0x20 might not be a space in all character sets...
+ if (mb != 0) {
+ if (!(mb == spaceChar && ignoreSpace)) {
+ addByte(mb);
+ }
- public String getLanguage() {
- return "fr";
- }
+ ignoreSpace = (mb == spaceChar);
+ }
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ }
+ }
}
}
- static class CharsetRecog_8859_1_it extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
- 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
- 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
- 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
- };
-
- public String getLanguage() {
- return "it";
- }
+ static class NGramsPlusLang {
+ int[] fNGrams;
+ String fLang;
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ NGramsPlusLang(String la, int[] ng) {
+ fLang = la;
+ fNGrams = ng;
}
}
- static class CharsetRecog_8859_1_nl extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
- 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
- 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
- 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
+ static class CharsetRecog_8859_1 extends CharsetRecog_sbcs {
+ protected static byte[] byteMap = {
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
+ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
+ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
+ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
+ (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
+ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
+ (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20,
+ (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
+ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
+ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
+ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
+ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF,
+ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
+ (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
+ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20,
+ (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
};
- public String getLanguage() {
- return "nl";
- }
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
+ private static NGramsPlusLang[] ngrams_8859_1 = new NGramsPlusLang[]{
+ new NGramsPlusLang(
+ "da",
+ new int[]{
+ 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
+ 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
+ 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
+ 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
+ }),
+ new NGramsPlusLang(
+ "de",
+ new int[]{
+ 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
+ 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
+ 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
+ 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
+ }),
+ new NGramsPlusLang(
+ "en",
+ new int[]{
+ 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
+ 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
+ 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
+ 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
+ }),
+
+ new NGramsPlusLang(
+ "es",
+ new int[]{
+ 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
+ 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
+ 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
+ 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
+ }),
+
+ new NGramsPlusLang(
+ "fr",
+ new int[]{
+ 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
+ 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
+ 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
+ 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
+ }),
+
+ new NGramsPlusLang(
+ "it",
+ new int[]{
+ 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
+ 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
+ 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
+ 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
+ }),
+
+ new NGramsPlusLang(
+ "nl",
+ new int[]{
+ 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
+ 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
+ 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
+ 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
+ }),
+
+ new NGramsPlusLang(
+ "no",
+ new int[]{
+ 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
+ 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
+ 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
+ 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
+ }),
+
+ new NGramsPlusLang(
+ "pt",
+ new int[]{
+ 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
+ 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
+ 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
+ 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
+
+ }),
+
+ new NGramsPlusLang(
+ "sv",
+ new int[]{
+ 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
+ 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
+ 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
+ 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
+ }),
- static class CharsetRecog_8859_1_no extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
- 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
- 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
- 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
};
- public String getLanguage() {
- return "no";
- }
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ String name = det.fC1Bytes ? "windows-1252" : "ISO-8859-1";
+ int bestConfidenceSoFar = -1;
+ String lang = null;
+ for (NGramsPlusLang ngl : ngrams_8859_1) {
+ int confidence = match(det, ngl.fNGrams, byteMap);
+ if (confidence > bestConfidenceSoFar) {
+ bestConfidenceSoFar = confidence;
+ lang = ngl.fLang;
+ }
+ }
+ return bestConfidenceSoFar <= 0 ? null : new CharsetMatch(det, this, bestConfidenceSoFar, name, lang);
}
- }
- static class CharsetRecog_8859_1_pt extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
- 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
- 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
- 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
- };
- public String getLanguage() {
- return "pt";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public String getName() {
+ return "ISO-8859-1";
}
}
- static class CharsetRecog_8859_1_sv extends CharsetRecog_8859_1 {
- private static int[] ngrams = {
- 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
- 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
- 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
- 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
- };
-
- public String getLanguage() {
- return "sv";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
- abstract static class CharsetRecog_8859_2 extends CharsetRecog_sbcs {
+ static class CharsetRecog_8859_2 extends CharsetRecog_sbcs {
protected static byte[] byteMap = {
(byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
(byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
@@ -434,78 +458,61 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
(byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20,
};
- public String getName() {
- return haveC1Bytes ? "windows-1250" : "ISO-8859-2";
- }
- }
-
- static class CharsetRecog_8859_2_cs extends CharsetRecog_8859_2 {
- private static int[] ngrams = {
- 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
- 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
- 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
- 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
- };
-
- public String getLanguage() {
- return "cs";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_8859_2_hu extends CharsetRecog_8859_2 {
- private static int[] ngrams = {
- 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
- 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
- 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
- 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
+ private static NGramsPlusLang[] ngrams_8859_2 = new NGramsPlusLang[]{
+ new NGramsPlusLang(
+ "cs",
+ new int[]{
+ 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
+ 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
+ 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
+ 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
+ }),
+ new NGramsPlusLang(
+ "hu",
+ new int[]{
+ 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
+ 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
+ 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
+ 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
+ }),
+ new NGramsPlusLang(
+ "pl",
+ new int[]{
+ 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
+ 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
+ 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
+ 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
+ }),
+ new NGramsPlusLang(
+ "ro",
+ new int[]{
+ 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
+ 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
+ 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
+ 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
+ })
};
- public String getLanguage() {
- return "hu";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ String name = det.fC1Bytes ? "windows-1250" : "ISO-8859-2";
+ int bestConfidenceSoFar = -1;
+ String lang = null;
+ for (NGramsPlusLang ngl : ngrams_8859_2) {
+ int confidence = match(det, ngl.fNGrams, byteMap);
+ if (confidence > bestConfidenceSoFar) {
+ bestConfidenceSoFar = confidence;
+ lang = ngl.fLang;
+ }
+ }
+ return bestConfidenceSoFar <= 0 ? null : new CharsetMatch(det, this, bestConfidenceSoFar, name, lang);
}
- }
- static class CharsetRecog_8859_2_pl extends CharsetRecog_8859_2 {
- private static int[] ngrams = {
- 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
- 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
- 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
- 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
- };
-
- public String getLanguage() {
- return "pl";
+ public String getName() {
+ return "ISO-8859-2";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
}
- static class CharsetRecog_8859_2_ro extends CharsetRecog_8859_2 {
- private static int[] ngrams = {
- 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
- 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
- 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
- 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
- };
-
- public String getLanguage() {
- return "ro";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
abstract static class CharsetRecog_8859_5 extends CharsetRecog_sbcs {
protected static byte[] byteMap = {
@@ -560,8 +567,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "ru";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -618,8 +626,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "ar";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -660,7 +669,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
};
public String getName() {
- return haveC1Bytes ? "windows-1253" : "ISO-8859-7";
+ return "ISO-8859-7";
}
}
@@ -676,8 +685,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "el";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ String name = det.fC1Bytes ? "windows-1253" : "ISO-8859-7";
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "el");
}
}
@@ -718,7 +729,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
};
public String getName() {
- return haveC1Bytes ? "windows-1255" : "ISO-8859-8";
+ return "ISO-8859-8";
}
}
@@ -731,15 +742,17 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
};
public String getName() {
- return haveC1Bytes ? "windows-1255" : /*"ISO-8859-8-I"*/ "ISO-8859-8";
+ return "ISO-8859-8-I";
}
public String getLanguage() {
return "he";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8-I";
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he");
}
}
@@ -755,8 +768,11 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "he";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8";
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he");
+
}
}
@@ -797,7 +813,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
};
public String getName() {
- return haveC1Bytes ? "windows-1254" : "ISO-8859-9";
+ return "ISO-8859-9";
}
}
@@ -813,8 +829,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "tr";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ String name = det.fC1Bytes ? "windows-1254" : "ISO-8859-9";
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "tr");
}
}
@@ -869,65 +887,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "ru";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
- }
- }
-
- static class CharsetRecog_IBM866_ru extends CharsetRecog_sbcs {
- private static int[] ngrams = {
- 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
- 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
- 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
- 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
- };
-
- // bytemap converts cp866 chars to cp1251 chars, so ngrams are still unchanged
- private static byte[] byteMap = {
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
- (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
- (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
- (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
- (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
- (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- (byte) 0xB8, (byte) 0xB8, (byte) 0xBA, (byte) 0xBA, (byte) 0xBF, (byte) 0xBF, (byte) 0xA2, (byte) 0xA2,
- (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
- };
-
- public String getName() {
- return "IBM866";
- }
-
- public String getLanguage() {
- return "ru";
- }
-
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -982,8 +944,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "ar";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -1038,29 +1001,30 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "ru";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap);
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, ngrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
abstract static class CharsetRecog_IBM424_he extends CharsetRecog_sbcs {
protected static byte[] byteMap = {
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
-/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 4- */ (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 5- */ (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 6- */ (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 7- */ (byte) 0x40, (byte) 0x71, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x00, (byte) 0x40, (byte) 0x40,
-/* 8- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 9- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* A- */ (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* B- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* C- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* D- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 4- */ (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 5- */ (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 6- */ (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 7- */ (byte) 0x40, (byte) 0x71, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x00, (byte) 0x40, (byte) 0x40,
+/* 8- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* 9- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* A- */ (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* B- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* C- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* D- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
+/* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
/* F- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
};
@@ -1081,8 +1045,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "IBM424_rtl";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap, (byte) 0x40);
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, ngrams, byteMap, (byte) 0x40);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -1099,12 +1064,14 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "IBM424_ltr";
}
- public int match(CharsetDetector det) {
- return match(det, ngrams, byteMap, (byte) 0x40);
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, ngrams, byteMap, (byte) 0x40);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
abstract static class CharsetRecog_IBM420_ar extends CharsetRecog_sbcs {
+
protected static byte[] byteMap = {
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
@@ -1124,85 +1091,12 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
/* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xEA, (byte) 0xEB, (byte) 0x40, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
/* F- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x40,
};
- protected static byte[] unshapeMap = {
-/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
-/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40,
-/* 4- */ (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x42, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x47, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
-/* 5- */ (byte) 0x50, (byte) 0x49, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x56, (byte) 0x58, (byte) 0x58, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
-/* 6- */ (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x63, (byte) 0x65, (byte) 0x65, (byte) 0x67, (byte) 0x67, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
-/* 7- */ (byte) 0x69, (byte) 0x71, (byte) 0x71, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x77, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D, (byte) 0x7E, (byte) 0x7F,
-/* 8- */ (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x80, (byte) 0x8B, (byte) 0x8B, (byte) 0x8D, (byte) 0x8D, (byte) 0x8F,
-/* 9- */ (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9E, (byte) 0x9E,
-/* A- */ (byte) 0x9E, (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x9E, (byte) 0xAB, (byte) 0xAB, (byte) 0xAD, (byte) 0xAD, (byte) 0xAF,
-/* B- */ (byte) 0xAF, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7, (byte) 0xB8, (byte) 0xB9, (byte) 0xB1, (byte) 0xBB, (byte) 0xBB, (byte) 0xBD, (byte) 0xBD, (byte) 0xBF,
-/* C- */ (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xBF, (byte) 0xCC, (byte) 0xBF, (byte) 0xCE, (byte) 0xCF,
-/* D- */ (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDA, (byte) 0xDC, (byte) 0xDC, (byte) 0xDC, (byte) 0xDF,
-/* E- */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
-/* F- */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF,
- };
- //arabic shaping class, method shape/unshape
- //protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
- protected byte[] prev_fInputBytes = null;
+
public String getLanguage() {
return "ar";
}
- protected void matchInit(CharsetDetector det) {
- prev_fInputBytes = det.fInputBytes.clone();
- byte bb[] = unshape(det.fInputBytes);
- det.setText(bb);
- }
-
- /*
- * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
- * because CharsetDetector is dealing with bytes not Unicode code points. We could
- * convert the bytes to Unicode code points but that would leave us dependent
- * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
- * of JDK can produce different results and therefore is also avoided.
- */
- private byte[] unshape(byte[] inputBytes) {
- byte resultByteArr[] = unshapeLamAlef(inputBytes);
-
- for (int i = 0; i < inputBytes.length; i++) {
- resultByteArr[i] = unshapeMap[resultByteArr[i] & 0xFF];
- }
- return resultByteArr;
- }
-
- private byte[] unshapeLamAlef(byte[] inputBytes) {
- ByteBuffer resultBigBuffer = ByteBuffer.allocate(inputBytes.length * 2);
- ByteBuffer resultBuffer;
- byte unshapedLamAlef[] = {(byte) 0xb1, (byte) 0x56};
-
- for (byte inputByte : inputBytes) {
- if (isLamAlef(inputByte))
- resultBigBuffer.put(unshapedLamAlef);
- else
- resultBigBuffer.put(inputByte);
- }
- resultBuffer = ByteBuffer.allocate(resultBigBuffer.position());
- resultBuffer.put(resultBigBuffer.array(), 0, resultBigBuffer.position());
- return resultBuffer.array();
- }
-
- private boolean isLamAlef(byte b) {
- // Return true if byte is any of these:
- //
- // {(byte)0xb2,(byte)0xb3,(byte)0xb4,(byte)0xb5,(byte)0xb7,(byte)0xb8}
- //
- // NOTE: 0xb2 is -78; 0xb8 is -72:
- return (b <= (byte) 0xb8) && (b >= (byte) 0xb2) && (b != (byte) 0xb6);
- }
-
- protected void matchFinish(CharsetDetector det) {
- if (prev_fInputBytes != null)
- det.setText(prev_fInputBytes);
- }
-
}
static class CharsetRecog_IBM420_ar_rtl extends CharsetRecog_IBM420_ar {
@@ -1217,11 +1111,9 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "IBM420_rtl";
}
- public int match(CharsetDetector det) {
- matchInit(det);
- int result = match(det, ngrams, byteMap, (byte) 0x40);
- matchFinish(det);
- return result;
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = matchIBM420(det, ngrams, byteMap, (byte) 0x40);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
}
@@ -1238,19 +1130,19 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
return "IBM420_ltr";
}
- public int match(CharsetDetector det) {
- matchInit(det);
- int result = match(det, ngrams, byteMap, (byte) 0x40);
- matchFinish(det);
- return result;
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = matchIBM420(det, ngrams, byteMap, (byte) 0x40);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
}
+
}
static abstract class CharsetRecog_EBCDIC_500 extends CharsetRecog_sbcs {
+
// This maps EBCDIC 500 codepoints onto either space (not of interest), or a lower
// case ISO_8859_1 number/letter/accented-letter codepoint for ngram matching
// Because we map to ISO_8859_1, we can re-use the ngrams from those detectors
- // To avoid mis-detection, we skip many of the control characters in the 0x00-0x3f range
+ // To avoid mis-detection, we skip many of the control characters in the 0x00-0x3f range
protected static byte[] byteMap = {
/* 0x00-0x07 */ (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00,
/* 0x08-0x0f */ (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20,
@@ -1285,69 +1177,106 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
/* 0xf0-0xf7 */ (byte) '0', (byte) '1', (byte) '2', (byte) '3', (byte) '4', (byte) '5', (byte) '6', (byte) '7',
/* 0xf8-0xff */ (byte) '8', (byte) '9', (byte) 0x20, (byte) 0xfb, (byte) 0xfc, (byte) 0xf9, (byte) 0xfa, (byte) 0x20,
};
+ private final int langIndex;
+
+ protected CharsetRecog_EBCDIC_500(int langIndex) {
+ this.langIndex = langIndex;
+ }
+
+ /**
+ * @param lang language to find
+ * @return the index into CharsetRecog_8859_1.ngrams_8859_1 that matches this language;
+ * throws IllegalArgumentException if language can't be found
+ */
+ static int findLangIndex(String lang) {
+ for (int i = 0; i < CharsetRecog_8859_1.ngrams_8859_1.length; i++) {
+ NGramsPlusLang ngpl = CharsetRecog_8859_1.ngrams_8859_1[i];
+ if (ngpl.fLang.equals(lang)) {
+ return i;
+ }
+ }
+ throw new IllegalArgumentException("can't find language: " + lang);
+ }
public String getName() {
return "IBM500";
}
+
+ public CharsetMatch match(CharsetDetector det) {
+ int confidence = match(det, CharsetRecog_8859_1.ngrams_8859_1[getLangIndex()].fNGrams, byteMap);
+ return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
+ }
+
+ int getLangIndex() {
+ return langIndex;
+ }
+
+
}
+ //The EBCDIC codes were removed from ICU4J's trunk as of at least July 26, 2016.
static class CharsetRecog_EBCDIC_500_en extends CharsetRecog_EBCDIC_500 {
+
+ CharsetRecog_EBCDIC_500_en() {
+ super(findLangIndex("en"));
+ }
+
public String getLanguage() {
return "en";
}
- public int match(CharsetDetector det) {
- return match(det, CharsetRecog_8859_1_en.ngrams, byteMap);
- }
+
}
static class CharsetRecog_EBCDIC_500_de extends CharsetRecog_EBCDIC_500 {
+ CharsetRecog_EBCDIC_500_de() {
+ super(findLangIndex("de"));
+ }
+
public String getLanguage() {
return "de";
}
- public int match(CharsetDetector det) {
- return match(det, CharsetRecog_8859_1_de.ngrams, byteMap);
- }
}
static class CharsetRecog_EBCDIC_500_fr extends CharsetRecog_EBCDIC_500 {
- public String getLanguage() {
- return "fr";
+ CharsetRecog_EBCDIC_500_fr() {
+ super(findLangIndex("fr"));
}
- public int match(CharsetDetector det) {
- return match(det, CharsetRecog_8859_1_fr.ngrams, byteMap);
+ public String getLanguage() {
+ return "fr";
}
}
static class CharsetRecog_EBCDIC_500_es extends CharsetRecog_EBCDIC_500 {
- public String getLanguage() {
- return "es";
+ CharsetRecog_EBCDIC_500_es() {
+ super(findLangIndex("es"));
}
- public int match(CharsetDetector det) {
- return match(det, CharsetRecog_8859_1_es.ngrams, byteMap);
+ public String getLanguage() {
+ return "es";
}
}
static class CharsetRecog_EBCDIC_500_it extends CharsetRecog_EBCDIC_500 {
- public String getLanguage() {
- return "it";
+ CharsetRecog_EBCDIC_500_it() {
+ super(findLangIndex("it"));
}
- public int match(CharsetDetector det) {
- return match(det, CharsetRecog_8859_1_it.ngrams, byteMap);
+ public String getLanguage() {
+ return "it";
}
}
static class CharsetRecog_EBCDIC_500_nl extends CharsetRecog_EBCDIC_500 {
+ CharsetRecog_EBCDIC_500_nl() {
+ super(findLangIndex("nl"));
+ }
+
public String getLanguage() {
return "nl";
}
- public int match(CharsetDetector det) {
- return match(det, CharsetRecog_8859_1_nl.ngrams, byteMap);
- }
}
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
index e1a0ff0..7834053 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
/**
* ******************************************************************************
- * Copyright (C) 2005, International Business Machines Corporation and *
+ * Copyright (C) 2005-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
* ******************************************************************************
*/
@@ -9,28 +11,28 @@ package org.apache.tika.parser.txt;
/**
* Abstract class for recognizing a single charset.
* Part of the implementation of ICU's CharsetDetector.
- *
+ * <p>
* Each specific charset that can be recognized will have an instance
* of some subclass of this class. All interaction between the overall
* CharsetDetector and the stuff specific to an individual charset happens
* via the interface provided here.
- *
- * Instances of CharsetDetector DO NOT have or maintain
+ * <p>
+ * Instances of CharsetDetector DO NOT have or maintain
* state pertaining to a specific match or detect operation.
* The WILL be shared by multiple instances of CharsetDetector.
* They encapsulate const charset-specific information.
- *
- * @internal
*/
abstract class CharsetRecognizer {
/**
* Get the IANA name of this charset.
+ *
* @return the charset name.
*/
abstract String getName();
/**
* Get the ISO language code for this charset.
+ *
* @return the language code, or <code>null</code> if the language cannot be determined.
*/
public String getLanguage() {
@@ -39,16 +41,13 @@ abstract class CharsetRecognizer {
/**
* Test the match of this charset with the input text data
- * which is obtained via the CharsetDetector object.
+ * which is obtained via the CharsetDetector object.
*
- * @param det The CharsetDetector, which contains the input text
- * to be checked for being in this charset.
- * @return Two values packed into one int (Damn java, anyhow)
- * <br/>
- * bits 0-7: the match confidence, ranging from 0-100
- * <br/>
- * bits 8-15: The match reason, an enum-like value.
+ * @param det The CharsetDetector, which contains the input text
+ * to be checked for being in this charset.
+ * @return A CharsetMatch object containing details of match
+ * with this charset, or null if there was no match.
*/
- abstract int match(CharsetDetector det);
+ abstract CharsetMatch match(CharsetDetector det);
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/9f6c71fa/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
index ce792dc..58ba1ac 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
@@ -25,6 +25,14 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.CharsetUtils;
+/**
+ * Tika wrapper around ICU4J's CharsetDetector.
+ * <p>
+ * NOTE: ICU4J's CharsetDetector and required classes were
+ * copied from ICU4J with two modifications:
+ * Apache Tika added the EBCDIC-500 family of detectors, and
+ * we increased the buffer to 12000 bytes.
+ */
public class Icu4jEncodingDetector implements EncodingDetector {
public Charset detect(InputStream input, Metadata metadata)