You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/11 18:15:13 UTC

[tika] branch master updated: TIKA-1568 -- statically cache encoding detector in AutoDetectReader when default initializer is used.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 7175b90  TIKA-1568 -- statically cache encoding detector in AutoDetectReader when default initializer is used.
7175b90 is described below

commit 7175b9027fb4c8566822c57b428363bc35893d74
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Jul 11 13:51:19 2019 -0400

    TIKA-1568 -- statically cache encoding detector in AutoDetectReader
    when default initializer is used.
---
 .../org/apache/tika/detect/AutoDetectReader.java   |  11 ++-
 .../tika/parser/AutoDetectReaderParserTest.java    | 102 +++++++++++++++++++++
 2 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index 1945a52..44dce8e 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
 import java.util.Collections;
 import java.util.List;
 
@@ -44,6 +45,13 @@ public class AutoDetectReader extends BufferedReader {
     private static final ServiceLoader DEFAULT_LOADER =
             new ServiceLoader(AutoDetectReader.class.getClassLoader());
 
+    private static EncodingDetector DEFAULT_DETECTOR; // class-wide detector; assigned in the static initializer below
+
+    static { // runs when the class is initialized, so the detector is built once and then reused
+        DEFAULT_DETECTOR = new CompositeEncodingDetector(
+                DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class)); // wraps the EncodingDetector providers returned by DEFAULT_LOADER
+    }
+
     private static Charset detect(
             InputStream input, Metadata metadata,
             List<EncodingDetector> detectors, LoadErrorHandler handler)
@@ -125,14 +133,13 @@ public class AutoDetectReader extends BufferedReader {
 
     public AutoDetectReader(InputStream stream, Metadata metadata)
             throws IOException, TikaException {
-        this(stream, metadata, DEFAULT_LOADER);
+        this(stream, metadata, DEFAULT_DETECTOR); // delegate with the statically cached detector instead of DEFAULT_LOADER
     }
 
     public AutoDetectReader(InputStream stream)
             throws IOException, TikaException {
         this(stream, new Metadata());
     }
-
     private static InputStream getBuffered(InputStream stream) {
         if (stream.markSupported()) {
             return stream;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectReaderParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectReaderParserTest.java
new file mode 100644
index 0000000..a6695ab
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectReaderParserTest.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.MultiThreadedTikaTest;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class AutoDetectReaderParserTest extends MultiThreadedTikaTest {
+    // Drives the nested AutoDetectingReaderParser from several threads at once; that parser
+    // builds an AutoDetectReader from the stream alone, without passing an encoding detector.
+    @Test
+    public void testMulti() throws Exception {
+        Parser p = new AutoDetectingReaderParser(); // parser under test, defined below
+        int numThreads = 10;
+        int numIterations = 10;
+        ParseContext[] contexts = new ParseContext[numThreads]; // one ParseContext per thread
+        for (int i = 0; i < numThreads; i++) {
+            contexts[i] = new ParseContext();
+        }
+        FileFilter fileFilter = new FileFilter() { // accepts only names ending in ".txt" or ".html"
+            @Override
+            public boolean accept(File pathname) {
+                if (pathname.getName().endsWith(".txt") ||
+                pathname.getName().endsWith(".html")) {
+                    return true;
+                }
+                return false;
+            }
+        };
+        testMultiThreaded(p, contexts, numThreads, numIterations, fileFilter); // presumably inherited from MultiThreadedTikaTest -- confirm
+    }
+
+    // Parser that constructs an AutoDetectReader from the stream alone,
+    // i.e. without supplying an encoding detector.
+    public static class AutoDetectingReaderParser implements Parser {
+
+        @Override
+        public Set<MediaType> getSupportedTypes(ParseContext context) {
+            return Collections.unmodifiableSet(new HashSet<>(Arrays.asList( // read-only set of the two text types below
+                            MediaType.text("html"),
+                            MediaType.text("plain"))));
+        }
+
+        @Override
+        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+            try (AutoDetectReader reader = new AutoDetectReader(stream)) { // no detector passed; reader is closed when the block exits
+                Charset charset = reader.getCharset(); // charset reported by the reader
+                MediaType type = new MediaType(
+                        MediaType.parse("text/plhtml"), charset); // NOTE(review): "text/plhtml" looks like a typo or placeholder -- confirm
+                metadata.set(Metadata.CONTENT_TYPE, type.toString()); // record type plus charset as the content type
+                XHTMLContentHandler xhtml =
+                        new XHTMLContentHandler(handler, metadata);
+                xhtml.startDocument();
+
+                xhtml.startElement("p"); // all text is emitted inside a single "p" element
+                char[] buffer = new char[4096]; // copy in chunks of up to 4096 chars
+                int n = reader.read(buffer);
+                while (n != -1) { // read returns -1 at end of stream
+                    xhtml.characters(buffer, 0, n);
+                    n = reader.read(buffer);
+                }
+                xhtml.endElement("p");
+
+                xhtml.endDocument();
+
+            }
+        }
+
+
+    }
+}