You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/06/21 20:19:48 UTC

[tika] branch branch_1x updated (9a56aa4 -> c6f7b45)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 9a56aa4  TIKA-2660 -- enable building with Java 10 -- revert tika-dl until full fix is available.
     new b4cdfcf  TIKA-2677 -- fix multithreaded updating/access to MediaTypeRegistry, via Yuriy Koval
     new df9ed82  TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar
     new c6f7b45  TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar -- fix illegal getBytes()...mea culpa...

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/tika/mime/MediaTypeRegistry.java    |   5 +-
 .../main/java/org/apache/tika/mime/MimeTypes.java  |   5 +-
 .../org/apache/tika/mime/MimeTypesReaderTest.java  |  22 ++++
 .../tika/parser/html/HtmlEncodingDetector.java     |   8 +-
 .../tika/parser/html/HtmlEncodingDetectorTest.java | 142 +++++++++++++++++++++
 5 files changed, 175 insertions(+), 7 deletions(-)
 create mode 100644 tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java


[tika] 02/03: TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit df9ed8260c91800baa202a748b0ff3854937ff5f
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Jun 21 14:49:20 2018 -0400

    TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar
---
 .../tika/parser/html/HtmlEncodingDetector.java     |   8 +-
 .../tika/parser/html/HtmlEncodingDetectorTest.java | 142 +++++++++++++++++++++
 2 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
index 84141b9..c4c5188 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -83,7 +83,7 @@ public class HtmlEncodingDetector implements EncodingDetector {
 
 
     private static final Pattern HTTP_META_PATTERN = Pattern.compile(
-            "(?is)<\\s*meta\\s+([^<>]+)"
+            "(?is)<\\s*meta(?:/|\\s+)([^<>]+)"
     );
 
     //this should match both the older:
@@ -97,7 +97,7 @@ public class HtmlEncodingDetector implements EncodingDetector {
     //For a more general "not" matcher, try:
     //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
     private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
-            ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
+            ("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
     );
 
     private static final Charset ASCII = Charset.forName("US-ASCII");
@@ -154,6 +154,10 @@ public class HtmlEncodingDetector implements EncodingDetector {
                 if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
                     continue;
                 }
+                if ("x-user-defined".equalsIgnoreCase(candCharset)) {
+                    candCharset = "windows-1252";
+                }
+
                 if (CharsetUtils.isSupported(candCharset)) {
                     try {
                         return CharsetUtils.forName(candCharset);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
new file mode 100644
index 0000000..40ab01c
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.*;
+
+public class HtmlEncodingDetectorTest {
+
+    @Test
+    public void basic() throws IOException {
+        assertWindows1252("<meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    @Ignore("can we can prove this harms detection")
+    public void utf16() throws IOException {
+        // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
+        assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
+    }
+
+    @Test
+    public void xUserDefined() throws IOException {
+        // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
+        assertWindows1252("<meta charset='x-user-defined'>");
+    }
+
+    @Test
+    public void withSlash() throws IOException {
+        assertWindows1252("<meta/charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    @Ignore("until we do a full parse")
+    public void insideTag() throws IOException {
+        assertWindows1252("<meta name='description'" +
+                "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
+                "<meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    @Ignore("until we do a full parse")
+    public void missingAttribute() throws IOException {
+        assertWindows1252(
+                "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
+                        "<meta charset='WINDOWS-1252'>" // valid declaration
+        );
+    }
+
+    @Test
+    @Ignore("until we do a full parse")
+    public void insideSpecialTag() throws IOException {
+        // Content inside <?, <!, and </ should be ignored
+        for (byte b : "?!/".getBytes())
+            assertWindows1252(
+                    "<" + (char) b + // start comment
+                            "<meta charset='UTF-8'>" + // inside special tag
+                            "<meta charset='WINDOWS-1252'>" // real charset declaration
+            );
+    }
+
+    @Test
+    @Ignore("until we can prove this harms detection")
+    public void spaceBeforeTag() throws IOException {
+        assertWindows1252(
+                "< meta charset='UTF-8'>" + // invalid charset declaration
+                        "<meta charset='WINDOWS-1252'>" // real charset declaration
+        );
+    }
+
+    @Test
+    public void invalidAttribute() throws IOException {
+        assertWindows1252(
+                "<meta " +
+                        "badcharset='UTF-8' " + // invalid charset declaration
+                        "charset='WINDOWS-1252'>" // real charset declaration
+        );
+    }
+
+    @Test
+    @Ignore("until we can prove this harms detection")
+    public void unmatchedQuote() throws IOException {
+        assertWindows1252(
+                "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
+                        "<meta charset='WINDOWS-1252'>" // real charset declaration
+        );
+    }
+
+
+    @Test
+    @Ignore("until we do a full parse")
+    public void withCompactComment() throws IOException {
+        // <!--> is a valid comment
+        assertWindows1252(
+                "<!--" + // start comment
+                        "<meta charset='UTF-8'>" + // inside comment
+                        "-->" + // end comment
+                        "<!-->" + // compact comment
+                        "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
+        );
+    }
+
+    private void assertWindows1252(String html) throws IOException {
+        assertCharset(html, Charset.forName("WINDOWS-1252"));
+    }
+
+    private void assertCharset(String html, Charset charset) throws IOException {
+        assertEquals(html + " should be detected as " + charset,
+                charset, detectCharset(html));
+    }
+
+    private Charset detectCharset(String test) throws IOException {
+        Metadata metadata = new Metadata();
+        InputStream inStream = new ByteArrayInputStream(test.getBytes(StandardCharsets.UTF_8));
+        return new HtmlEncodingDetector().detect(inStream, metadata);
+    }
+}
\ No newline at end of file


[tika] 01/03: TIKA-2677 -- fix multithreaded updating/access to MediaTypeRegistry, via Yuriy Koval

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b4cdfcfe19b008c2e9955f42caf6d52e8389a172
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Jun 20 16:26:12 2018 -0400

    TIKA-2677 -- fix multithreaded updating/access to MediaTypeRegistry, via Yuriy Koval
---
 .../org/apache/tika/mime/MediaTypeRegistry.java    |  5 +++--
 .../main/java/org/apache/tika/mime/MimeTypes.java  |  5 ++---
 .../org/apache/tika/mime/MimeTypesReaderTest.java  | 22 ++++++++++++++++++++++
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
index 813eda0..65938be 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
@@ -21,6 +21,7 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.SortedSet;
 import java.util.TreeSet;
+import java.util.concurrent.ConcurrentHashMap;
 
 /**
  * Registry of known Internet media types.
@@ -46,7 +47,7 @@ public class MediaTypeRegistry implements Serializable {
      * as a mapping from the alias to the corresponding canonical type.
      */
     private final Map<MediaType, MediaType> registry =
-        new HashMap<MediaType, MediaType>();
+        new ConcurrentHashMap<>();
 
     /**
      * Known type inheritance relationships. The mapping is from a media type
@@ -74,7 +75,7 @@ public class MediaTypeRegistry implements Serializable {
      * @return known aliases
      */
     public SortedSet<MediaType> getAliases(MediaType type) {
-        SortedSet<MediaType> aliases = new TreeSet<MediaType>();
+        SortedSet<MediaType> aliases = new TreeSet<>();
         for (Map.Entry<MediaType, MediaType> entry : registry.entrySet()) {
             if (entry.getValue().equals(type) && !entry.getKey().equals(type)) {
                 aliases.add(entry.getKey());
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
index 46b9d8f..705ad3d 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
@@ -30,6 +30,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 
 import javax.xml.namespace.QName;
 
@@ -102,8 +103,7 @@ public final class MimeTypes implements Detector, Serializable {
     private final MediaTypeRegistry registry = new MediaTypeRegistry();
 
     /** All the registered MimeTypes indexed on their canonical names */
-    private final Map<MediaType, MimeType> types =
-        new HashMap<MediaType, MimeType>();
+    private final Map<MediaType, MimeType> types = new HashMap<>();
 
     /** The patterns matcher */
     private Patterns patterns = new Patterns(registry);
@@ -425,7 +425,6 @@ public final class MimeTypes implements Detector, Serializable {
      *
      * @return the minimum length of data to provide.
      * @see #getMimeType(byte[])
-     * @see #getMimeType(String, byte[])
      */
     public int getMinLength() {
         // This needs to be reasonably large to be able to correctly detect
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
index 8782167..0c1f494 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
@@ -25,8 +25,10 @@ import static org.junit.Assert.fail;
 import java.io.ByteArrayInputStream;
 import java.lang.reflect.Field;
 import java.util.ArrayList;
+import java.util.ConcurrentModificationException;
 import java.util.List;
 import java.util.Set;
+import java.util.concurrent.Executors;
 
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
@@ -48,6 +50,8 @@ import org.junit.Test;
  */
 public class MimeTypesReaderTest {
 
+    static boolean stop = false;
+
     private MimeTypes mimeTypes;
     private List<Magic> magics;
 
@@ -279,4 +283,22 @@ public class MimeTypesReaderTest {
         assertEquals(name, mimeType.toString());
         assertEquals(".ditamap", mimeType.getExtension());
     }
+
+    @Test
+    public void testMultiThreaded() throws Exception {
+        MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes();
+        Executors.newSingleThreadExecutor().execute(()-> {
+            try {
+                for (int i = 0; i < 500 && !stop; i++) {
+                    mimeTypes.forName("abc"+i+"/abc");
+                }
+            } catch (MimeTypeException e ) {
+                e.printStackTrace();
+            }}
+        );
+
+        for (int i = 0; i < 500 & !stop; i++) {
+            mimeTypes.getMediaTypeRegistry().getAliases(MediaType.APPLICATION_ZIP);
+        }
+    }
 }


[tika] 03/03: TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar -- fix illegal getBytes()...mea culpa...

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c6f7b45ae6ace89ee2398f251c97dd23d220355b
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Jun 21 16:18:51 2018 -0400

    TIKA-2673 -- unit tests for stricter adherence to spec via Gerard Bouchar -- fix illegal getBytes()...mea culpa...
---
 .../test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
index 40ab01c..931f5e1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
@@ -76,7 +76,7 @@ public class HtmlEncodingDetectorTest {
     @Ignore("until we do a full parse")
     public void insideSpecialTag() throws IOException {
         // Content inside <?, <!, and </ should be ignored
-        for (byte b : "?!/".getBytes())
+        for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
             assertWindows1252(
                     "<" + (char) b + // start comment
                             "<meta charset='UTF-8'>" + // inside special tag