You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/17 20:54:22 UTC
[tika] branch TIKA-3695 updated: TIKA-3695 -- implement filterExisting in StandardWriteFilter. Create standalone unit test.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3695
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3695 by this push:
new 279ad33 TIKA-3695 -- implement filterExisting in StandardWriteFilter. Create standalone unit test.
279ad33 is described below
commit 279ad33908728fa584ff31e529053b617e2e9837
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 17 16:54:08 2022 -0400
TIKA-3695 -- implement filterExisting in StandardWriteFilter. Create standalone unit test.
---
CHANGES.txt | 3 +
.../java/org/apache/tika/metadata/Metadata.java | 4 +-
.../apache/tika/metadata/StandardWriteFilter.java | 40 +++++++-
.../org/apache/tika/parser/AutoDetectParser.java | 2 +-
.../org/apache/tika/config/TikaConfigTest.java | 69 --------------
.../tika/metadata/MetadataWriteFilterTest.java | 106 +++++++++++++++++++++
6 files changed, 152 insertions(+), 72 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index a78be3e..eea54b3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,9 @@ Release 2.4.0 - ???
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
for the dependencies that must be provided at run-time (TIKA-3676).
+ * Add MetadataWriteFilter capability to limit the memory footprint of
+ Metadata objects (TIKA-3695).
+
* Add detection for Frictionless Data packages and WACZ (TIKA-3696).
* Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index 6d1bc0b..da71504 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -60,6 +60,7 @@ public class Metadata
*/
private Map<String, String[]> metadata = null;
+ //TODO: transient?
private MetadataWriteFilter writeFilter = ACCEPT_ALL;
/**
* Constructs a new, empty metadata.
@@ -144,8 +145,9 @@ public class Metadata
* will not function properly.
*
* @param writeFilter
+ * @since 2.4.0
*/
- public void setWriteFilter(MetadataWriteFilter writeFilter) {
+ public void setMetadataWriteFilter(MetadataWriteFilter writeFilter) {
this.writeFilter = writeFilter;
this.writeFilter.filterExisting(metadata);
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
index 8bc47b6..4a26ebb 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
@@ -22,7 +22,9 @@ import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -53,9 +55,16 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
static {
ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_LENGTH);
ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_TYPE);
+ ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_ENCODING);
+ ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE.getName());
ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_HINT.getName());
ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.TIKA_CONTENT.getName());
+ ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.RESOURCE_NAME_KEY);
+ ALWAYS_INCLUDE_FIELDS.add(AccessPermissions.EXTRACT_CONTENT.getName());
+ ALWAYS_INCLUDE_FIELDS.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY.getName());
+ ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_DISPOSITION);
+ //TODO: should Metadata.CONTENT_LOCATION be included too? It is used by the html parser
}
private final boolean includeEmpty;
@@ -86,7 +95,36 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
@Override
public void filterExisting(Map<String, String[]> data) {
+    //Applies this filter to metadata that was placed in the metadata
+    //object before the filter was set: names rejected by includeField()
+    //are removed, and each remaining value is passed through include()
+    //and filter().
+    //This is somewhat costly, but it should only be called once, and
+    //probably not on that many fields.
+
+    //Work on snapshots of the names. Entries are removed and replaced
+    //below, and filter() is handed the live map, so a structural change
+    //while walking data.keySet() directly would fail with a
+    //ConcurrentModificationException.
+    for (String n : new ArrayList<>(data.keySet())) {
+        if (!includeField(n)) {
+            data.remove(n);
+        }
+    }
+
+    for (String n : new ArrayList<>(data.keySet())) {
+        String[] vals = data.get(n);
+        if (vals == null) {
+            //nothing stored under this name (or it was removed meanwhile)
+            continue;
+        }
+        List<String> filteredVals = new ArrayList<>(vals.length);
+        for (String v : vals) {
+            if (include(n, v)) {
+                String filtered = filter(n, v, data);
+                if (filtered != null) {
+                    filteredVals.add(filtered);
+                }
+            }
+        }
+        if (filteredVals.isEmpty()) {
+            //every value was rejected: drop the name rather than leave it
+            //mapped to an empty array
+            data.remove(n);
+        } else {
+            data.put(n, filteredVals.toArray(new String[0]));
+        }
+    }
}
@Override
@@ -122,7 +160,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
ByteBuffer bb = ByteBuffer.wrap(bytes, 0, available);
CharBuffer cb = CharBuffer.allocate(available);
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
- // Ignore an incomplete character
+ // Ignore last (potentially) incomplete character
decoder.onMalformedInput(CodingErrorAction.IGNORE);
decoder.decode(bb, cb, true);
decoder.flush(cb);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 7b4157c..7650b23 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -130,7 +130,7 @@ public class AutoDetectParser extends CompositeParser {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
if (autoDetectParserConfig.getMetadataWriteFilterFactory() != null) {
- metadata.setWriteFilter(autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance());
+ metadata.setMetadataWriteFilter(autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance());
}
TemporaryResources tmp = new TemporaryResources();
try {
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index da643f8..2f213ff 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -23,11 +23,9 @@ import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
-import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
-import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
@@ -40,9 +38,6 @@ import org.apache.tika.ResourceLoggingClassLoader;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.MetadataWriteFilterFactory;
-import org.apache.tika.metadata.StandardWriteFilterFactory;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeDetectionTest;
import org.apache.tika.parser.AutoDetectParser;
@@ -51,7 +46,6 @@ import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ErrorParser;
-import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.mock.MockParser;
@@ -380,7 +374,6 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
});
}
-
@Test
public void testTimesInitiated() throws Exception {
//this prevents multi-threading tests, but we aren't doing that now...
@@ -401,66 +394,4 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
assertNull(config.getMaximumDepth());
assertNull(config.getMaximumPackageEntryDepth());
}
-
- @Test
- public void testMetadataFactoryConfig() throws Exception {
- TikaConfig tikaConfig =
- new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695.xml"));
- AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
- MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
- assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
- AutoDetectParser parser = new AutoDetectParser(tikaConfig);
- String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
- "<mock>";
- for (int i = 0; i < 20; i++) {
- mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
- }
- mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
- mock += "</mock>";
- Metadata metadata = new Metadata();
- List<Metadata> metadataList =
- getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
- parser, metadata, new ParseContext(), true);
- assertEquals(1, metadataList.size());
- metadata = metadataList.get(0);
-
- String[] creators = metadata.getValues("dc:creator");
- assertEquals(9, creators.length);
- assertEquals("0123456", creators[8]);
- assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
- assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
- }
-
- @Test
- public void testMetadataFactoryFieldsConfig() throws Exception {
- TikaConfig tikaConfig =
- new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml"));
- AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
- MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
- assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
- AutoDetectParser parser = new AutoDetectParser(tikaConfig);
- String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
- "<mock>";
- mock += "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>";
- mock += "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>";
- for (int i = 0; i < 20; i++) {
- mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
- }
- mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
- mock += "</mock>";
- Metadata metadata = new Metadata();
- List<Metadata> metadataList =
- getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
- parser, metadata, new ParseContext(), true);
- assertEquals(1, metadataList.size());
- metadata = metadataList.get(0);
-
- String[] creators = metadata.getValues("dc:creator");
- assertNull(metadata.get("dc:subject"));
- //this gets more than the other test because this is filtering out X-TIKA:Parsed-By"
- assertEquals(12, creators.length);
- assertEquals("012345", creators[11]);
- assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
- assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
- }
}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java
new file mode 100644
index 0000000..54175ac
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaConfigTest;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.AutoDetectParserConfig;
+import org.apache.tika.parser.ParseContext;
+
+public class MetadataWriteFilterTest extends TikaTest {
+
+    /**
+     * With only a byte limit configured, creators are written until the limit
+     * is reached; the last one is truncated and the limit flag is set.
+     */
+    @Test
+    public void testMetadataFactoryConfig() throws Exception {
+        AutoDetectParser parser = newParser("TIKA-3695.xml");
+        String mock = buildMock("");
+
+        Metadata metadata = new Metadata();
+        List<Metadata> metadataList = parseMock(mock, parser, metadata);
+        assertEquals(1, metadataList.size());
+        metadata = metadataList.get(0);
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertEquals(9, creators.length);
+        assertEquals("0123456", creators[8]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
+
+    /**
+     * With an "include" list configured, fields outside the list are dropped,
+     * both when written by the parser and when already present in the
+     * metadata object before the filter was applied.
+     */
+    @Test
+    public void testMetadataFactoryFieldsConfig() throws Exception {
+        AutoDetectParser parser = newParser("TIKA-3695-fields.xml");
+        String mock = buildMock(
+                "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>" +
+                "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>");
+
+        Metadata metadata = new Metadata();
+        metadata.add("dc:creator", "abcdefghijabcdefghij");
+        metadata.add("not-allowed", "not-allowed");
+        List<Metadata> metadataList = parseMock(mock, parser, metadata);
+        assertEquals(1, metadataList.size());
+        metadata = metadataList.get(0);
+        //test that this was removed during the filter existing stage
+        assertNull(metadata.get("not-allowed"));
+        //test that this was not allowed because it isn't in the "include" list
+        assertNull(metadata.get("dc:subject"));
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertEquals("abcdefghijabcdefghij", creators[0]);
+
+        //this gets more than the other test because this is filtering out
+        //"X-TIKA:Parsed-By", etc.
+        assertEquals(12, creators.length);
+        assertEquals("012345", creators[11]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
+
+    /**
+     * Loads the named config (resolved relative to TikaConfigTest), checks the
+     * configured byte limit of its write filter factory and returns a parser
+     * built from that config.
+     */
+    private static AutoDetectParser newParser(String configResource) throws Exception {
+        TikaConfig tikaConfig =
+                new TikaConfig(TikaConfigTest.class.getResourceAsStream(configResource));
+        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
+        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
+        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
+        return new AutoDetectParser(tikaConfig);
+    }
+
+    /**
+     * Builds the mock document: the given metadata elements (may be empty),
+     * then twenty dc:creator values, then thirty " hello " paragraphs.
+     */
+    private static String buildMock(String leadingMetadata) {
+        StringBuilder mock = new StringBuilder();
+        mock.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>").append("<mock>");
+        mock.append(leadingMetadata);
+        for (int i = 0; i < 20; i++) {
+            mock.append(
+                    "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>");
+        }
+        mock.append("<write element=\"p\" times=\"30\"> hello </write>\n");
+        mock.append("</mock>");
+        return mock.toString();
+    }
+
+    /**
+     * Parses the mock document with the given parser, starting from the given
+     * (possibly pre-populated) metadata object.
+     */
+    private List<Metadata> parseMock(String mock, AutoDetectParser parser, Metadata metadata)
+            throws Exception {
+        return getRecursiveMetadata(
+                new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+                parser, metadata, new ParseContext(), true);
+    }
+}