You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/29 15:44:12 UTC
[tika] branch main updated: TIKA-3374 -- fix up to encoding detection in package parser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new fbac00b TIKA-3374 -- fix up to encoding detection in package parser
fbac00b is described below
commit fbac00b1dbe0464a7de379e6edb843973b917c6e
Author: tallison <ta...@apache.org>
AuthorDate: Thu Apr 29 11:43:24 2021 -0400
TIKA-3374 -- fix up to encoding detection in package parser
---
CHANGES.txt | 2 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 12 ++++++-
.../apache/tika/parser/pkg/PackageParserTest.java | 38 ++-------------------
.../tika/parser/pkg => test-documents}/gbk.zip | Bin
.../tika/config/TikaEncodingDetectorTest.java | 6 ++--
.../apache/tika/parser/pkg/PackageParserTest.java | 34 ++++++++++++++++++
6 files changed, 51 insertions(+), 41 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 0524ecf..1746fd7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -45,7 +45,7 @@ Release 2.0.0-ALPHA - 01/13/2021
Release 1.27 - ??
-
+ * Apply encoding detection to zip entry names via Ryan421 (TIKA-3374).
* Add json output for /tika endpoint in tika-server (TIKA-3352).
* Tika's OpenNLPDetector now covers 148 languages and language-script pairs (TIKA-3340).
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 5c30183..38dde90 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -59,6 +59,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
+import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -209,6 +210,14 @@ public class PackageParser extends AbstractEncodingDetectorParser {
return entrydata;
}
+ public PackageParser() {
+ super();
+ }
+
+ public PackageParser(EncodingDetector encodingDetector) {
+ super(encodingDetector);
+ }
+
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@@ -400,8 +409,9 @@ public class PackageParser extends AbstractEncodingDetectorParser {
Charset candidate =
getEncodingDetector().detect(new ByteArrayInputStream(((ZipArchiveEntry) entry).getRawName()),
parentMetadata);
- if (candidate != null)
+ if (candidate != null) {
name = new String(((ZipArchiveEntry) entry).getRawName(), candidate);
+ }
}
if (archive.canReadEntryData(entry)) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
index dc44391..d2ef25e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -19,34 +19,24 @@ package org.apache.tika.parser.pkg;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.hamcrest.CoreMatchers;
import org.junit.Test;
-import org.xml.sax.SAXException;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.zip.PackageConstants;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-public class PackageParserTest {
+public class PackageParserTest extends TikaTest {
@Test
public void testCoverage() throws Exception {
@@ -92,28 +82,4 @@ public class PackageParserTest {
}
assertEquals(currentSpecializations.size(), PackageParser.PACKAGE_SPECIALIZATIONS.size());
}
-
- @Test
- public void hanldeNonUnicodeEntryName() throws IOException {
-
- BodyContentHandler handler = new BodyContentHandler();
- EncodingDetector dummyDetector = new EncodingDetector() {
- @Override
- public Charset detect(InputStream inputStream, Metadata metadata) throws IOException {
- return Charset.forName("GB18030");
- }
- };
- PackageParser parser = new PackageParser();
- parser.setEncodingDetector(dummyDetector);
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
- Metadata meta = new Metadata();
- try {
- parser.parse(this.getClass().getResourceAsStream("gbk.zip"), handler, meta, context);
- String res = handler.toString();
- assertThat(res,
- CoreMatchers.containsString("审计压缩包文件检索测试/集团邮件审计系统2021年自动巡检需求文档_V4.0.doc"));
- } catch (SAXException | TikaException ignored) {
- }
- }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/gbk.zip b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/resources/test-documents/gbk.zip
similarity index 100%
rename from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/gbk.zip
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/test/resources/test-documents/gbk.zip
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 2b5d079..f1d884b 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -141,7 +141,7 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
List<Parser> parsers = new ArrayList<>();
findEncodingDetectionParsers(p, parsers);
- assertEquals(4, parsers.size());
+ assertEquals(5, parsers.size());
EncodingDetector encodingDetector =
((AbstractEncodingDetectorParser) parsers.get(0)).getEncodingDetector();
assertTrue(encodingDetector instanceof CompositeEncodingDetector);
@@ -173,7 +173,7 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
List<Parser> parsers = new ArrayList<>();
findEncodingDetectionParsers(p, parsers);
- assertEquals(5, parsers.size());
+ assertEquals(6, parsers.size());
for (Parser encodingDetectingParser : parsers) {
EncodingDetector encodingDetector =
@@ -205,7 +205,7 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
List<Parser> parsers = new ArrayList<>();
findEncodingDetectionParsers(p, parsers);
- assertEquals(4, parsers.size());
+ assertEquals(5, parsers.size());
for (Parser childParser : parsers) {
EncodingDetector encodingDetector =
((AbstractEncodingDetectorParser) childParser).getEncodingDetector();
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
new file mode 100644
index 0000000..0521397
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.util.List;
+
+import org.junit.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class PackageParserTest extends TikaTest {
+
+ @Test
+ public void handleNonUnicodeEntryName() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("gbk.zip");
+ assertContains("审计压缩", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ }
+}