You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/15 17:57:41 UTC
[tika] branch branch_2x updated: TIKA-4133 -- add a capture group metadatafilter (#1346)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new 490038984 TIKA-4133 -- add a capture group metadatafilter (#1346)
new 688f9d225 Merge remote-tracking branch 'origin/branch_2x' into branch_2x
490038984 is described below
commit 4900389846fa65006392473e611b2afe3efb5acb
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Sep 15 13:56:46 2023 -0400
TIKA-4133 -- add a capture group metadatafilter (#1346)
* TIKA-4133 -- add a capture group metadata filter
---
.../filter/CaptureGroupMetadataFilter.java | 110 +++++++++++++++++++++
.../tika/metadata/filter/TestMetadataFilter.java | 53 ++++++++++
.../config/TIKA-4133-capture-group-overwrite.xml | 26 +++++
.../apache/tika/config/TIKA-4133-capture-group.xml | 26 +++++
4 files changed, 215 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
new file mode 100644
index 000000000..ca9b1e6ea
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.StringUtils;
+
+
+/**
+ * Copies the first capture group of a regex match from one metadata field to another.
+ * <p>
+ * The regex is run against the first value of the "sourceField" only. On the first
+ * match, the text captured by group 1 replaces every value currently stored in the
+ * "targetField" (which may be the same field as the source); later matches are
+ * ignored. Open a ticket if you need different behavior.
+ * <p>
+ * This filter is a no-op if the source field is missing or blank, if the regex does
+ * not match, or if group 1 did not take part in the match.
+ * <p>
+ * The "regex" must be set and must contain at least one capturing group; otherwise
+ * {@link #initialize(Map)} throws a {@link TikaConfigException}.
+ */
+public class CaptureGroupMetadataFilter extends MetadataFilter implements Initializable {
+
+    private String regexString;
+    private Pattern regex;
+    private String sourceField;
+    private String targetField;
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String val = metadata.get(sourceField);
+        if (StringUtils.isBlank(val)) {
+            return;
+        }
+        Matcher m = regex.matcher(val);
+        // group 1 is null when it sits in an alternative that did not match, e.g. "(a)|b"
+        String captured = m.find() ? m.group(1) : null;
+        if (captured != null) {
+            metadata.set(targetField, captured);
+        }
+    }
+
+    @Field
+    public void setRegex(String regex) {
+        this.regexString = regex;
+    }
+
+    @Field
+    public void setSourceField(String sourceField) {
+        this.sourceField = sourceField;
+    }
+
+    @Field
+    public void setTargetField(String targetField) {
+        this.targetField = targetField;
+    }
+
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        if (StringUtils.isBlank(regexString)) {
+            throw new TikaConfigException("Must specify a 'regex'");
+        }
+        try {
+            regex = Pattern.compile(regexString);
+        } catch (PatternSyntaxException e) {
+            throw new TikaConfigException("Couldn't parse regex", e);
+        }
+        // filter() reads group(1), which throws if the pattern declares no groups at all
+        if (regex.matcher("").groupCount() < 1) {
+            throw new TikaConfigException("'regex' must contain a capture group: " + regexString);
+        }
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        if (StringUtils.isBlank(sourceField)) {
+            throw new TikaConfigException("Must specify a 'sourceField'");
+        }
+        if (StringUtils.isBlank(targetField)) {
+            throw new TikaConfigException("Must specify a 'targetField'");
+        }
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index 88d510d57..0b071d0be 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -191,4 +191,57 @@ public class TestMetadataFilter extends AbstractTikaConfigTest {
filter.filter(m);
assertEquals("2021-07-23T08:02:24Z", m.get(TikaCoreProperties.CREATED));
}
+
+    @Test
+    public void testCaptureGroupBasic() throws Exception {
+        // Content-Type carries a charset parameter; only the part before ';' is captured
+        Metadata m = new Metadata();
+        m.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox");
+        m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+
+        MetadataFilter captureFilter = getConfig("TIKA-4133-capture-group.xml").getMetadataFilter();
+        captureFilter.filter(m);
+
+        assertEquals("quick brown fox", m.get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals("text/html", m.get("mime"));
+    }
+
+    @Test
+    public void testCaptureGroupNoSemiColon() throws Exception {
+        // with no ';' in the Content-Type, the whole value is captured
+        Metadata m = new Metadata();
+        m.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox");
+        m.set(Metadata.CONTENT_TYPE, "text/html");
+
+        MetadataFilter captureFilter = getConfig("TIKA-4133-capture-group.xml").getMetadataFilter();
+        captureFilter.filter(m);
+
+        assertEquals("quick brown fox", m.get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals("text/html", m.get("mime"));
+    }
+
+    @Test
+    public void testCaptureGroupOverwrite() throws Exception {
+        TikaConfig config = getConfig("TIKA-4133-capture-group-overwrite.xml");
+
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox");
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+
+        MetadataFilter filter = config.getMetadataFilter();
+        filter.filter(metadata);
+        assertEquals("quick brown fox", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
+
+        // now test that a single match overwrites all the values of a multi-valued field
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+        metadata.add(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+        metadata.add(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-8");
+        metadata.add(Metadata.CONTENT_TYPE, "application/pdf; charset=UTF-8");
+        assertEquals(4, metadata.getValues(Metadata.CONTENT_TYPE).length);
+        filter.filter(metadata);
+        assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
+        assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml
new file mode 100644
index 000000000..b43655840
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.CaptureGroupMetadataFilter">
+ <sourceField>Content-Type</sourceField>
+ <targetField>Content-Type</targetField>
+ <regex>\A([^;]+)</regex>
+ </metadataFilter>
+ </metadataFilters>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml
new file mode 100644
index 000000000..7ad7378e0
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.CaptureGroupMetadataFilter">
+ <sourceField>Content-Type</sourceField>
+ <targetField>mime</targetField>
+ <regex>\A([^;]+)</regex>
+ </metadataFilter>
+ </metadataFilters>
+</properties>