You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/23 21:22:32 UTC
[tika] branch main updated: TIKA-3496 -- add a metadatafilter to
allow users to ensure that dates emitted to Solr/OpenSearch are all UTC 'Z'
formatted.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 381b36e TIKA-3496 -- add a metadatafilter to allow users to ensure that dates emitted to Solr/OpenSearch are all UTC 'Z' formatted.
381b36e is described below
commit 381b36e8cc9a83fb34f22a74eaf20b86c78b6274
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 23 17:22:02 2021 -0400
TIKA-3496 -- add a metadatafilter to allow users to ensure that dates emitted to Solr/OpenSearch are all UTC 'Z' formatted.
---
CHANGES.txt | 5 ++
.../java/org/apache/tika/metadata/Property.java | 18 ++---
.../filter/DateNormalizingMetadataFilter.java | 90 ++++++++++++++++++++++
.../tika/metadata/filter/TestMetadataFilter.java | 12 +++
.../opensearch/tika-config-opensearch.xml | 5 ++
.../src/test/resources/tika-config-solr-urls.xml | 5 ++
.../pipes/emitter/solr/SolrEmitterDevTest.java | 62 +++++++++++++++
7 files changed, 188 insertions(+), 9 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 1d7818f..fe0ea2a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,10 @@
Release 2.0.1 - ???
+ * Add DateNormalizingMetadataFilter to let users ensure that all dates
+ emitted to Solr/OpenSearch are in UTC. Users can configure which
+ timezone they'd like to use in cases where the file format does
+ not store a timezone (TIKA-3496).
+
* Breaking change in the Solr and OpenSearch emitters. To achieve
the SKIP or CONCATENATE attachment strategy, modify the
parseMode in the pipesiterators or in the FetchEmitTuple (TIKA-3494).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Property.java b/tika-core/src/main/java/org/apache/tika/metadata/Property.java
index 40ccb68..3d75ad1 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Property.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Property.java
@@ -18,12 +18,12 @@ package org.apache.tika.metadata;
import java.util.Arrays;
import java.util.Collections;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
+import java.util.concurrent.ConcurrentHashMap;
/**
* XMP property definition. Each instance of this class defines a single
@@ -36,7 +36,7 @@ import java.util.TreeSet;
*/
public final class Property implements Comparable<Property> {
- private static final Map<String, Property> properties = new HashMap<>();
+ private static final Map<String, Property> PROPERTIES = new ConcurrentHashMap<>();
private final String name;
private final boolean internal;
private final PropertyType propertyType;
@@ -70,8 +70,8 @@ public final class Property implements Comparable<Property> {
this.secondaryExtractProperties = null;
// Only store primary properties for lookup, not composites
- synchronized (properties) {
- properties.put(name, this);
+ synchronized (PROPERTIES) {
+ PROPERTIES.put(name, this);
}
}
}
@@ -102,7 +102,7 @@ public final class Property implements Comparable<Property> {
*/
public static PropertyType getPropertyType(String key) {
PropertyType type = null;
- Property prop = properties.get(key);
+ Property prop = PROPERTIES.get(key);
if (prop != null) {
type = prop.getPropertyType();
}
@@ -116,16 +116,16 @@ public final class Property implements Comparable<Property> {
* @return the Property object
*/
public static Property get(String key) {
- return properties.get(key);
+ return PROPERTIES.get(key);
}
public static SortedSet<Property> getProperties(String prefix) {
SortedSet<Property> set = new TreeSet<>();
String p = prefix + ":";
- synchronized (properties) {
- for (String name : properties.keySet()) {
+ synchronized (PROPERTIES) {
+ for (String name : PROPERTIES.keySet()) {
if (name.startsWith(p)) {
- set.add(properties.get(name));
+ set.add(PROPERTIES.get(name));
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
new file mode 100644
index 0000000..e093873
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.ZoneId;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Some dates in some file formats do not have a timezone.
+ * Tika correctly stores these without a timezone, e.g. 'yyyy-MM-dd'T'HH:mm:ss'
+ * This can be a problem if end points expect a 'Z' timezone.
+ * This filter makes the assumption that dates without timezones are UTC
+ * and always modifies the date to: "yyyy-MM-dd'T'HH:mm:ss'Z'"
+ *
+ * Users can specify an alternate defaultTimeZone with
+ * {@link DateNormalizingMetadataFilter#setDefaultTimeZone(String)} to apply
+ * if the file format does not specify a timezone.
+ *
+ * Values that already end in 'Z' are left untouched. Values that do not match
+ * 'yyyy-MM-dd'T'HH:mm:ss' exactly (for example, because they carry a
+ * numeric offset or other trailing text) are logged and left untouched rather
+ * than being reinterpreted in the default timezone.
+ */
+public class DateNormalizingMetadataFilter extends MetadataFilter {
+
+    private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
+
+    private static final Logger LOGGER =
+            LoggerFactory.getLogger(DateNormalizingMetadataFilter.class);
+
+    //timezone applied to date strings that carry no timezone; UTC unless overridden
+    private TimeZone defaultTimeZone = UTC;
+
+    /**
+     * Rewrites the value of every metadata key that maps to a registered
+     * {@link Property} of value type DATE so that it is expressed in UTC
+     * with a trailing 'Z'.
+     *
+     * @param metadata the metadata object to modify in place
+     * @throws TikaException declared by the overridden method
+     */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        //the formatters are created lazily, and only if at least one date needs work
+        SimpleDateFormat dateFormatter = null;
+        SimpleDateFormat utcFormatter = null;
+        for (String n : metadata.names()) {
+            Property property = Property.get(n);
+            if (property == null ||
+                    !Property.ValueType.DATE.equals(property.getValueType())) {
+                continue;
+            }
+            String dateString = metadata.get(property);
+            if (dateString == null || dateString.endsWith("Z")) {
+                continue;
+            }
+            if (dateFormatter == null) {
+                dateFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US);
+                dateFormatter.setTimeZone(defaultTimeZone);
+                utcFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
+                utcFormatter.setTimeZone(UTC);
+            }
+            try {
+                Date d = parseFully(dateFormatter, dateString);
+                metadata.set(property, utcFormatter.format(d));
+            } catch (ParseException e) {
+                LOGGER.warn("Couldn't convert date to default time zone: >"
+                        + dateString + "<", e);
+            }
+        }
+    }
+
+    /**
+     * Parses the whole of {@code text} with {@code format}.
+     * <p>
+     * {@link SimpleDateFormat#parse(String)} accepts a string as soon as its
+     * prefix matches the pattern and ignores whatever follows, so a trailing
+     * offset such as "+02:00" would be dropped silently. This helper treats
+     * any unconsumed trailing text as a parse failure instead.
+     *
+     * @param format the formatter to parse with
+     * @param text the date string to parse
+     * @return the parsed date
+     * @throws ParseException if the text cannot be parsed or is only partially consumed
+     */
+    private static Date parseFully(SimpleDateFormat format, String text)
+            throws ParseException {
+        //fully qualified so that the file's import block does not need to change
+        java.text.ParsePosition position = new java.text.ParsePosition(0);
+        Date parsed = format.parse(text, position);
+        if (parsed == null) {
+            throw new ParseException("Unparseable date: \"" + text + "\"",
+                    position.getErrorIndex());
+        }
+        if (position.getIndex() < text.length()) {
+            throw new ParseException("Unexpected trailing text in date: \"" + text + "\"",
+                    position.getIndex());
+        }
+        return parsed;
+    }
+
+    /**
+     * Sets the timezone used to interpret date strings that carry no timezone.
+     *
+     * @param timeZoneId a zone id as accepted by {@link ZoneId#of(String)};
+     *                   any exception thrown by that method propagates to the caller
+     */
+    @Field
+    public void setDefaultTimeZone(String timeZoneId) {
+        this.defaultTimeZone = TimeZone.getTimeZone(ZoneId.of(timeZoneId));
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index d77e373..c7368df 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -179,4 +179,16 @@ public class TestMetadataFilter extends AbstractTikaConfigTest {
assertNull(metadata.get("author"));
assertNull(metadata.get("a"));
}
+
+    @Test
+    public void testDateNormalizingFilter() throws Exception {
+        //A timestamp that lacks a timezone, when interpreted as Los Angeles time,
+        //must be emitted as the UTC string that is 7 hours later.
+        DateNormalizingMetadataFilter normalizer = new DateNormalizingMetadataFilter();
+        normalizer.setDefaultTimeZone("America/Los_Angeles");
+
+        Metadata zoneless = new Metadata();
+        zoneless.set(TikaCoreProperties.CREATED, "2021-07-23T01:02:24");
+        normalizer.filter(zoneless);
+
+        String normalized = zoneless.get(TikaCoreProperties.CREATED);
+        assertEquals("2021-07-23T08:02:24Z", normalized);
+    }
}
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml
index af0b53b..df9452a 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml
@@ -45,6 +45,11 @@
</parser>
</parsers>
<metadataFilters>
+ <!-- depending on the file format, some dates do not have a timezone. This
+ filter arbitrarily assumes dates have a UTC timezone and will format all
+ dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone.
+ -->
+ <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
<metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
<params>
<excludeUnmapped>true</excludeUnmapped>
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
index 5f2740f..7517a9c 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
@@ -45,6 +45,11 @@
</parser>
</parsers>
<metadataFilters>
+ <!-- depending on the file format, some dates do not have a timezone. This
+ filter arbitrarily assumes dates have a UTC timezone and will format all
+ dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone.
+ -->
+ <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
<metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
<params>
<excludeUnmapped>true</excludeUnmapped>
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java
new file mode 100644
index 0000000..779f8bb
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.emitter.solr;
+
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.FieldNameMappingFilter;
+
+/**
+ * This is meant only for one off development tests with a locally
+ * running instance of Solr. Please add unit tests to the
+ * tika-integration-tests/solr-*
+ */
+@Ignore
+public class SolrEmitterDevTest {
+
+    /**
+     * Emits a single hand-built document to the Solr instance and collection
+     * hard-coded below, after mapping the Tika field names to "created" and
+     * "content".
+     */
+    @Test
+    public void oneOff() throws Exception {
+        String core = "tika-example";
+        String url = "http://localhost:8983/solr";
+        String emitKey = "one-off-test-doc";
+        SolrEmitter solrEmitter = new SolrEmitter();
+        solrEmitter.setSolrUrls(Collections.singletonList(url));
+        solrEmitter.setSolrCollection(core);
+        //typed empty map instead of the raw Collections.EMPTY_MAP constant
+        solrEmitter.initialize(Collections.emptyMap());
+
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.CREATED, new Date());
+        metadata.set(TikaCoreProperties.TIKA_CONTENT, "the quick brown fox");
+
+        //diamond operator instead of a raw HashMap
+        Map<String, String> mappings = new HashMap<>();
+        FieldNameMappingFilter filter = new FieldNameMappingFilter();
+        mappings.put(TikaCoreProperties.CREATED.getName(), "created");
+        mappings.put(TikaCoreProperties.TIKA_CONTENT.getName(), "content");
+        filter.setMappings(mappings);
+        filter.filter(metadata);
+
+        solrEmitter.emit(emitKey, Collections.singletonList(metadata));
+    }
+}