You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/29 15:44:12 UTC

[tika] branch main updated: TIKA-3374 -- fix up to encoding detection in package parser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new fbac00b  TIKA-3374 -- fix up to encoding detection in package parser
fbac00b is described below

commit fbac00b1dbe0464a7de379e6edb843973b917c6e
Author: tallison <ta...@apache.org>
AuthorDate: Thu Apr 29 11:43:24 2021 -0400

    TIKA-3374 -- fix up to encoding detection in package parser
---
 CHANGES.txt                                        |   2 +-
 .../org/apache/tika/parser/pkg/PackageParser.java  |  12 ++++++-
 .../apache/tika/parser/pkg/PackageParserTest.java  |  38 ++-------------------
 .../tika/parser/pkg => test-documents}/gbk.zip     | Bin
 .../tika/config/TikaEncodingDetectorTest.java      |   6 ++--
 .../apache/tika/parser/pkg/PackageParserTest.java  |  34 ++++++++++++++++++
 6 files changed, 51 insertions(+), 41 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 0524ecf..1746fd7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -45,7 +45,7 @@ Release 2.0.0-ALPHA - 01/13/2021
 
 
 Release 1.27 - ??
-
+   * Apply encoding detection to zip entry names via Ryan421 (TIKA-3374).
    * Add json output for /tika endpoint in tika-server (TIKA-3352).
    * Tika's OpenNLPDetector now covers 148 languages and language-script pairs (TIKA-3340).
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 5c30183..38dde90 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -59,6 +59,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -209,6 +210,14 @@ public class PackageParser extends AbstractEncodingDetectorParser {
         return entrydata;
     }
 
+    public PackageParser() {
+        super();
+    }
+
+    public PackageParser(EncodingDetector encodingDetector) {
+        super(encodingDetector);
+    }
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -400,8 +409,9 @@ public class PackageParser extends AbstractEncodingDetectorParser {
             Charset candidate =
                     getEncodingDetector().detect(new ByteArrayInputStream(((ZipArchiveEntry) entry).getRawName()),
                         parentMetadata);
-            if (candidate != null)
+            if (candidate != null) {
                 name = new String(((ZipArchiveEntry) entry).getRawName(), candidate);
+            }
         }
         
         if (archive.canReadEntryData(entry)) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
index dc44391..d2ef25e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -19,34 +19,24 @@ package org.apache.tika.parser.pkg;
 
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.hamcrest.CoreMatchers;
 import org.junit.Test;
-import org.xml.sax.SAXException;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.detect.zip.PackageConstants;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
 
-public class PackageParserTest {
+public class PackageParserTest extends TikaTest {
 
     @Test
     public void testCoverage() throws Exception {
@@ -92,28 +82,4 @@ public class PackageParserTest {
         }
         assertEquals(currentSpecializations.size(), PackageParser.PACKAGE_SPECIALIZATIONS.size());
     }
-    
-    @Test
-    public void hanldeNonUnicodeEntryName() throws IOException {
-
-        BodyContentHandler handler = new BodyContentHandler();
-        EncodingDetector dummyDetector = new EncodingDetector() {
-            @Override
-            public Charset detect(InputStream inputStream, Metadata metadata) throws IOException {
-                return Charset.forName("GB18030");
-            }
-        };
-        PackageParser parser = new PackageParser();
-        parser.setEncodingDetector(dummyDetector);
-        ParseContext context = new ParseContext();
-        context.set(Parser.class, parser);
-        Metadata meta = new Metadata();
-        try {
-            parser.parse(this.getClass().getResourceAsStream("gbk.zip"), handler, meta, context);
-            String res = handler.toString();
-            assertThat(res,
-                    CoreMatchers.containsString("审计压缩包文件检索测试/集团邮件审计系统2021年自动巡检需求文档_V4.0.doc"));
-        } catch (SAXException | TikaException ignored) {
-        }
-    }
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/gbk.zip b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/resources/test-documents/gbk.zip
similarity index 100%
rename from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/gbk.zip
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/resources/test-documents/gbk.zip
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 2b5d079..f1d884b 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -141,7 +141,7 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
         List<Parser> parsers = new ArrayList<>();
         findEncodingDetectionParsers(p, parsers);
 
-        assertEquals(4, parsers.size());
+        assertEquals(5, parsers.size());
         EncodingDetector encodingDetector =
                 ((AbstractEncodingDetectorParser) parsers.get(0)).getEncodingDetector();
         assertTrue(encodingDetector instanceof CompositeEncodingDetector);
@@ -173,7 +173,7 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
         List<Parser> parsers = new ArrayList<>();
         findEncodingDetectionParsers(p, parsers);
 
-        assertEquals(5, parsers.size());
+        assertEquals(6, parsers.size());
 
         for (Parser encodingDetectingParser : parsers) {
             EncodingDetector encodingDetector =
@@ -205,7 +205,7 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
         List<Parser> parsers = new ArrayList<>();
         findEncodingDetectionParsers(p, parsers);
 
-        assertEquals(4, parsers.size());
+        assertEquals(5, parsers.size());
         for (Parser childParser : parsers) {
             EncodingDetector encodingDetector =
                     ((AbstractEncodingDetectorParser) childParser).getEncodingDetector();
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
new file mode 100644
index 0000000..0521397
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.util.List;
+
+import org.junit.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class PackageParserTest extends TikaTest {
+
+    @Test
+    public void handleNonUnicodeEntryName() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("gbk.zip");
+        assertContains("审计压缩", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
+}