You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/11 18:15:13 UTC
[tika] branch master updated: TIKA-1568 -- statically cache
encoding detector in AutoDetectReader when default initializer is used.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 7175b90 TIKA-1568 -- statically cache encoding detector in AutoDetectReader when default initializer is used.
7175b90 is described below
commit 7175b9027fb4c8566822c57b428363bc35893d74
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Jul 11 13:51:19 2019 -0400
TIKA-1568 -- statically cache encoding detector in AutoDetectReader
when default initializer is used.
---
.../org/apache/tika/detect/AutoDetectReader.java | 11 ++-
.../tika/parser/AutoDetectReaderParserTest.java | 102 +++++++++++++++++++++
2 files changed, 111 insertions(+), 2 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index 1945a52..44dce8e 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
import java.util.Collections;
import java.util.List;
@@ -44,6 +45,13 @@ public class AutoDetectReader extends BufferedReader {
private static final ServiceLoader DEFAULT_LOADER =
new ServiceLoader(AutoDetectReader.class.getClassLoader());
+ private static EncodingDetector DEFAULT_DETECTOR; // NOTE(review): only assigned in the static block in the visible hunks; could likely be final -- confirm
+ // Composite of the EncodingDetector providers returned by DEFAULT_LOADER, built once in the static initializer.
+ static {
+ DEFAULT_DETECTOR = new CompositeEncodingDetector(
+ DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class));
+ }
+
private static Charset detect(
InputStream input, Metadata metadata,
List<EncodingDetector> detectors, LoadErrorHandler handler)
@@ -125,14 +133,13 @@ public class AutoDetectReader extends BufferedReader {
public AutoDetectReader(InputStream stream, Metadata metadata)
throws IOException, TikaException {
- this(stream, metadata, DEFAULT_LOADER);
+ this(stream, metadata, DEFAULT_DETECTOR);
}
public AutoDetectReader(InputStream stream)
throws IOException, TikaException {
this(stream, new Metadata());
}
-
private static InputStream getBuffered(InputStream stream) {
if (stream.markSupported()) {
return stream;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectReaderParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectReaderParserTest.java
new file mode 100644
index 0000000..a6695ab
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectReaderParserTest.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.MultiThreadedTikaTest;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class AutoDetectReaderParserTest extends MultiThreadedTikaTest {
+ // Runs AutoDetectingReaderParser through testMultiThreaded with 10 threads and 10 iterations,
+ // one fresh ParseContext per thread, restricted to files whose names end in .txt or .html.
+ @Test
+ public void testMulti() throws Exception {
+ Parser p = new AutoDetectingReaderParser();
+ int numThreads = 10;
+ int numIterations = 10;
+ ParseContext[] contexts = new ParseContext[numThreads];
+ for (int i = 0; i < numThreads; i++) {
+ contexts[i] = new ParseContext();
+ }
+ FileFilter fileFilter = new FileFilter() {
+ @Override
+ public boolean accept(File pathname) {
+ if (pathname.getName().endsWith(".txt") ||
+ pathname.getName().endsWith(".html")) {
+ return true;
+ }
+ return false;
+ }
+ };
+ testMultiThreaded(p, contexts, numThreads, numIterations, fileFilter);
+ }
+
+ // This parser mimics creating a new AutoDetectReader without supplying
+ // a detector: parse() uses the AutoDetectReader(InputStream) constructor.
+ public static class AutoDetectingReaderParser implements Parser {
+ // Reports text/html and text/plain as the supported media types (unmodifiable set).
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+ MediaType.text("html"),
+ MediaType.text("plain"))));
+ }
+ // Wraps the stream in an AutoDetectReader, stores a content type carrying the reader's charset, and emits the text as one <p> element.
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ try (AutoDetectReader reader = new AutoDetectReader(stream)) {
+ Charset charset = reader.getCharset();
+ MediaType type = new MediaType(
+ MediaType.parse("text/plhtml"), charset); // NOTE(review): "text/plhtml" looks like a typo (plain/html?) -- confirm intended subtype
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ // Copy the reader's characters into the <p> element in chunks of up to 4096 chars until read() returns -1.
+ xhtml.startElement("p");
+ char[] buffer = new char[4096];
+ int n = reader.read(buffer);
+ while (n != -1) {
+ xhtml.characters(buffer, 0, n);
+ n = reader.read(buffer);
+ }
+ xhtml.endElement("p");
+
+ xhtml.endDocument();
+
+ }
+ }
+
+
+ }
+}