You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2021/08/17 15:20:57 UTC

[tika] branch main updated (faf7d9d -> 11f807f)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from faf7d9d  clean up dependencies and update CHANGES.txt
     new 7077e9b  TIKA-3527 -- add a simple UrlFetcher
     new 11f807f  TIKA-3516 -- strip _rtl _ltr in CharsetMatch and enable turning off specific encodings.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |  4 ++
 .../apache/tika/pipes/fetcher/url/UrlFetcher.java  | 52 ++++++++++++++++++++++
 .../org/apache/tika/parser/txt/CharsetMatch.java   | 32 ++++++++-----
 .../tika/parser/txt/Icu4jEncodingDetector.java     | 16 ++++++-
 .../tika/parser/txt/CharsetDetectorTest.java       | 17 +++++++
 .../test-configs/tika-config-ignore-charset.xml}   | 14 +++---
 .../resources/test-documents/testIgnoreCharset.txt |  4 ++
 .../resources/config/tika-config-url-fetcher.xml   |  4 +-
 8 files changed, 123 insertions(+), 20 deletions(-)
 create mode 100644 tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java
 copy tika-parsers/tika-parsers-standard/{tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-parameterize-encoding-detector.xml => tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml} (76%)
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testIgnoreCharset.txt

[tika] 02/02: TIKA-3516 -- strip _rtl _ltr in CharsetMatch and enable turning off specific encodings.

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 11f807fd61242de9d7f963fde3ae79210b9ad750
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 17 11:20:38 2021 -0400

    TIKA-3516 -- strip _rtl _ltr in CharsetMatch and enable turning off specific encodings.
---
 .../org/apache/tika/parser/txt/CharsetMatch.java   | 32 ++++++++++++++--------
 .../tika/parser/txt/Icu4jEncodingDetector.java     | 16 ++++++++++-
 .../tika/parser/txt/CharsetDetectorTest.java       | 17 ++++++++++++
 .../test-configs/tika-config-ignore-charset.xml    | 32 ++++++++++++++++++++++
 .../resources/test-documents/testIgnoreCharset.txt |  4 +++
 5 files changed, 89 insertions(+), 12 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
index b4a5493..faab4a5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
@@ -153,17 +153,7 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
 
             return sb.toString();
         } else {
-            String name = getName();
-            /*
-             * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
-             * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr'
-             * should be stripped off before creating the string.
-             */
-            int startSuffix =
-                    !name.contains("_rtl") ? name.indexOf("_ltr") : name.indexOf("_rtl");
-            if (startSuffix > 0) {
-                name = name.substring(0, startSuffix);
-            }
+            String name = getNormalizedName();
             result = new String(fRawInput, name);
         }
         return result;
@@ -171,6 +161,26 @@ public class CharsetMatch implements Comparable<CharsetMatch> {
     }
 
     /**
+     * Strips the _rtl / _ltr suffix off of the charset name so that it can be used as a charset.
+     *
+     * @return getName() truncated at "_rtl" (else "_ltr") when that marker is found after index 0
+     */
+    public String getNormalizedName() {
+        String name = getName();
+        /*
+         * getName() may return a name with a suffix '_rtl' or '_ltr'. This cannot
+         * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or '_ltr'
+         * should be stripped off before the name is used to look up a charset.
+         */
+        int startSuffix =
+                !name.contains("_rtl") ? name.indexOf("_ltr") : name.indexOf("_rtl");
+        if (startSuffix > 0) {
+            name = name.substring(0, startSuffix);
+        }
+        return name;
+    }
+
+    /**
      * Get an indication of the confidence in the charset detected.
      * Confidence values range from 0-100, with larger numbers indicating
      * a better match of the input data to the characteristics of the
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
index 49f2ef9..ce9ee9f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
@@ -19,6 +19,9 @@ package org.apache.tika.parser.txt;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
 import org.apache.tika.config.Field;
 import org.apache.tika.detect.EncodingDetector;
@@ -34,6 +37,8 @@ public class Icu4jEncodingDetector implements EncodingDetector {
     @Field
     private int markLimit = CharsetDetector.DEFAULT_MARK_LIMIT;
 
+    private Set<String> ignoreCharsets = new HashSet<>();
+
     public Charset detect(InputStream input, Metadata metadata) throws IOException {
         if (input == null) {
             return null;
@@ -68,7 +73,11 @@ public class Icu4jEncodingDetector implements EncodingDetector {
 
         for (CharsetMatch match : detector.detectAll()) {
             try {
-                return CharsetUtils.forName(match.getName());
+                String n = match.getNormalizedName();
+                if (ignoreCharsets.contains(n)) {
+                    return null;
+                }
+                return CharsetUtils.forName(match.getNormalizedName());
             } catch (IllegalArgumentException e) {
                 // ignore
             }
@@ -111,4 +120,9 @@ public class Icu4jEncodingDetector implements EncodingDetector {
     public void setMarkLimit(int markLimit) {
         this.markLimit = markLimit;
     }
+
+    @Field
+    public void setIgnoreCharsets(List<String> charsetsToIgnore) {
+        this.ignoreCharsets.addAll(charsetsToIgnore);
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index 74f9b68..d41e349 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -29,6 +29,10 @@ import java.nio.file.Files;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
 
 public class CharsetDetectorTest extends TikaTest {
 
@@ -125,4 +129,17 @@ public class CharsetDetectorTest extends TikaTest {
         detector.setText(sb.toString().getBytes("UTF-8"));
         assertEquals("UTF-8", detector.detect().getName());
     }
+
+    @Test
+    public void testIgnoreCharset() throws Exception {
+        //TIKA-3516, TIKA-3525, TIKA-1236: config ignores IBM424; the text must still be extracted
+        TikaConfig tikaConfig = new TikaConfig(
+                getResourceAsStream("/test-configs/tika-config-ignore-charset.xml"));
+
+        Metadata m = new Metadata();
+
+        m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
+        assertContains("ACTIVE AGE", getXML("testIgnoreCharset.txt",
+                new AutoDetectParser(tikaConfig), m).xml);
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
new file mode 100644
index 0000000..0b61f20
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser"/>
+    </parsers>
+    <encodingDetectors>
+        <encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
+            <params>
+                <param name="ignoreCharsets" type="list">
+                    <string>IBM424</string>
+                </param>
+            </params>
+        </encodingDetector>
+        <encodingDetector class="org.apache.tika.parser.txt.UniversalEncodingDetector"/>
+    </encodingDetectors>
+</properties>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testIgnoreCharset.txt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testIgnoreCharset.txt
new file mode 100644
index 0000000..4673e04
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/testIgnoreCharset.txt
@@ -0,0 +1,4 @@
+
+ACTIVE AGE
+
+BALM

[tika] 01/02: TIKA-3527 -- add a simple UrlFetcher

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7077e9b822adb798efa260f587ab0a2babcfb746
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 17 09:18:53 2021 -0400

    TIKA-3527 -- add a simple UrlFetcher
---
 CHANGES.txt                                        |  4 ++
 .../apache/tika/pipes/fetcher/url/UrlFetcher.java  | 52 ++++++++++++++++++++++
 .../resources/config/tika-config-url-fetcher.xml   |  4 +-
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4edc510..414253c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -11,6 +11,10 @@ Release 2.1.0 - ???
    * Change the default rendering strategy for PDFs from NO_TEXT to ALL (TIKA-3520).
 
    Other changes:
+
+   * Add a simple UrlFetcher in tika-core as a basic alternative
+     to tika-fetcher-http (TIKA-3527).
+
    * Add tika-pipes support for Google Cloud Storage (TIKA-3524).
 
    * Fix markup ordering errors in xhtml output for ODT files (TIKA-2242).
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java
new file mode 100644
index 0000000..ec4954f
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetcher.url;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Locale;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.pipes.fetcher.AbstractFetcher;
+
+/**
+ * Simple fetcher for URLs. This simply calls {@link TikaInputStream#get(URL, Metadata)}.
+ * Keys that start with "file:" (any case) are rejected; use the FileSystemFetcher for files.
+ * NOTE(review): that guard tests the raw, untrimmed key -- confirm java.net.URL parses no laxer.
+ * For more advanced control (passwords, timeouts, proxies, etc), use the tika-fetcher-http module.
+ */
+public class UrlFetcher extends AbstractFetcher {
+
+    @Override
+    public InputStream fetch(String fetchKey, Metadata metadata) throws IOException, TikaException {
+        if (fetchKey.contains("\u0000")) {
+            throw new IllegalArgumentException("URL must not contain \u0000. " +
+                    "Please review the life decisions that led you to requesting " +
+                    "a URL with this character in it.");
+        }
+        if (fetchKey.toLowerCase(Locale.US).startsWith("file:")) {
+            throw new IllegalArgumentException(
+                    "The UrlFetcher does not fetch from file shares; " +
+                    "please use the FileSystemFetcher");
+        }
+        return TikaInputStream.get(new URL(fetchKey), metadata);
+    }
+
+}
diff --git a/tika-server/tika-server-standard/src/test/resources/config/tika-config-url-fetcher.xml b/tika-server/tika-server-standard/src/test/resources/config/tika-config-url-fetcher.xml
index d3aaff0..d8a4321 100644
--- a/tika-server/tika-server-standard/src/test/resources/config/tika-config-url-fetcher.xml
+++ b/tika-server/tika-server-standard/src/test/resources/config/tika-config-url-fetcher.xml
@@ -22,9 +22,9 @@
         <parser class="org.apache.tika.parser.DefaultParser"/>
     </parsers>
     <fetchers>
-        <fetcher class="org.apache.tika.pipes.fetcher.SimpleUrlFetcher">
+        <fetcher class="org.apache.tika.pipes.fetcher.url.UrlFetcher">
             <params>
-                <param name="name" type="string">url</param>
+                <name>url</name>
             </params>
         </fetcher>
     </fetchers>