You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/12 17:34:11 UTC
(tika) branch main updated: TIKA-4196 -- add a bom EncodingDetector (#1590)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7c758c31e TIKA-4196 -- add a bom EncodingDetector (#1590)
7c758c31e is described below
commit 7c758c31e6e3f52b4c5f8ad2ac8169dc0f8b310a
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Feb 12 12:34:06 2024 -0500
TIKA-4196 -- add a bom EncodingDetector (#1590)
---
.../org/apache/tika/parser/txt/BOMDetector.java | 93 ++++++++++++++++++++++
.../apache/tika/parser/txt/BOMDetectorTest.java | 91 +++++++++++++++++++++
2 files changed, 184 insertions(+)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java
new file mode 100644
index 000000000..c96bfda5d
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/BOMDetector.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.IOUtils;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * {@link EncodingDetector} that reports a charset only when the stream starts with a
+ * UTF-8, UTF-16 or UTF-32 byte order mark; it returns {@code null} in every other case.
+ */
+public class BOMDetector implements EncodingDetector {
+
+    //order matters -- the UTF-32LE BOM (FF FE 00 00) begins with the UTF-16LE BOM (FF FE),
+    //so the 32-bit marks have to be tried before the 16-bit ones
+    private static final ByteOrderMark[] BOMS = new ByteOrderMark[] {
+            ByteOrderMark.UTF_8,
+            ByteOrderMark.UTF_32BE,
+            ByteOrderMark.UTF_32LE,
+            ByteOrderMark.UTF_16BE,
+            ByteOrderMark.UTF_16LE
+    };
+    //parallel to BOMS; an entry stays null when the JVM does not support that charset
+    private static final Charset[] CHARSETS = new Charset[BOMS.length];
+
+    //length in bytes of the shortest (UTF-16) and the longest (UTF-32) mark in BOMS
+    private static final int MIN_BYTES = 2;
+    private static final int MAX_BYTES = 4;
+
+    static {
+        for (int i = 0; i < BOMS.length; i++) {
+            try {
+                CHARSETS[i] = Charset.forName(BOMS[i].getCharsetName());
+            } catch (UnsupportedCharsetException ignored) {
+                //deliberately ignored: the slot stays null, so detect() returns null
+                //("unknown") when this BOM is seen on a JVM without the charset
+            }
+        }
+    }
+
+    /**
+     * Peeks at the first {@code MAX_BYTES} bytes of the stream and compares them with the
+     * known byte order marks. The stream is marked before reading and reset before returning.
+     *
+     * @param input stream that supports mark/reset, or {@code null} if none is available
+     * @param metadata not consulted by this detector
+     * @return the charset named by the leading BOM, or {@code null} if the stream is
+     *         {@code null}, too short, or does not start with a known BOM
+     * @throws IOException if the stream cannot be read or reset
+     */
+    @Override
+    public Charset detect(InputStream input, Metadata metadata) throws IOException {
+        //the EncodingDetector contract allows a missing stream; nothing to inspect then
+        if (input == null) {
+            return null;
+        }
+        byte[] bytes = new byte[MAX_BYTES];
+        int numRead;
+        input.mark(MAX_BYTES);
+        try {
+            numRead = IOUtils.read(input, bytes);
+        } finally {
+            //always rewind, even if the read failed, so the caller sees the full stream
+            input.reset();
+        }
+        if (numRead < MIN_BYTES) {
+            return null;
+        }
+        for (int i = 0; i < BOMS.length; i++) {
+            if (startsWith(BOMS[i], bytes, numRead)) {
+                return CHARSETS[i];
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Tests whether the valid prefix of the buffer begins with the complete BOM.
+     *
+     * @param bom the byte order mark to look for
+     * @param bytes buffer holding the bytes read from the stream
+     * @param numRead number of leading bytes in {@code bytes} that were actually read
+     * @return {@code true} if at least the BOM's length was read and every BOM byte matches
+     */
+    private static boolean startsWith(ByteOrderMark bom, byte[] bytes, int numRead) {
+        byte[] bomBytes = bom.getBytes();
+        if (numRead < bomBytes.length) {
+            return false;
+        }
+        for (int i = 0; i < bomBytes.length; i++) {
+            if (bomBytes[i] != bytes[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java
new file mode 100644
index 000000000..b008607dc
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/BOMDetectorTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.BOMInputStream;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+public class BOMDetectorTest extends TikaTest {
+ @Test
+ public void testBasic() throws Exception {
+ EncodingDetector detector = new BOMDetector();
+ for (ByteOrderMark bom : new ByteOrderMark[]{
+ ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
+ ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE
+ }) {
+ UnsynchronizedByteArrayOutputStream bos = createStream(bom);
+ try (BOMInputStream bomInputStream =
+ new BOMInputStream(new UnsynchronizedByteArrayInputStream(bos.toByteArray()),
+ ByteOrderMark.UTF_8, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE,
+ ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE)) {
+ assertEquals(bom, bomInputStream.getBOM());
+ }
+ try (UnsynchronizedByteArrayInputStream is =
+ new UnsynchronizedByteArrayInputStream(bos.toByteArray())) {
+ assertEquals(Charset.forName(bom.getCharsetName()), detector.detect(is, new Metadata()));
+ int cnt = 0;
+ int c = is.read();
+ while (c > -1) {
+ cnt++;
+ c = is.read();
+ }
+ assertEquals(100 + bom.getBytes().length, cnt);
+ }
+ }
+ }
+
+ @Test
+ public void testShort() throws Exception {
+ EncodingDetector detector = new BOMDetector();
+ for (ByteOrderMark bom : new ByteOrderMark[] {
+ ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
+ ByteOrderMark.UTF_32LE
+ }) {
+ byte[] bytes = new byte[3];
+ System.arraycopy(bom.getBytes(), 0, bytes, 0, 1);
+ bytes[1] = (byte)32;
+ bytes[2] = (byte)32;
+ try (InputStream is = new UnsynchronizedByteArrayInputStream(bytes)) {
+ assertNull(detector.detect(is, new Metadata()));
+ }
+ }
+ }
+
+ private UnsynchronizedByteArrayOutputStream createStream(ByteOrderMark bom) throws IOException {
+ UnsynchronizedByteArrayOutputStream bos = new UnsynchronizedByteArrayOutputStream();
+ IOUtils.write(bom.getBytes(), bos);
+ for (int i = 0; i < 100; i++) {
+ bos.write(' ');
+ }
+ return bos;
+ }
+}