You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by rg...@apache.org on 2013/06/11 05:16:27 UTC
svn commit: r1491680 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/xml/
tika-parsers/src/test/java/org/apache/tika/parser/xml/
tika-parsers/src/test/resources/test-documents/
Author: rgauss
Date: Tue Jun 11 03:16:26 2013
New Revision: 1491680
URL: http://svn.apache.org/r1491680
Log:
TIKA-1133: Ability to Allow Empty and Duplicate Tika Values for XML Elements
- Added constructors in ElementMetadataHandler to specify allowing duplicates and empty values
- Added a unit test and test data which confirms the default and override behaviors
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1491680&r1=1491679&r2=1491680&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Jun 11 03:16:26 2013
@@ -31,6 +31,9 @@ Release 1.4 Current Development
* MS Word: line tabular character is now replaced with newline
(TIKA-1128)
+ * XML: ElementMetadataHandlers can now optionally accept duplicate
+ and empty values (TIKA-1133)
+
Release 1.3 - 01/19/2013
* Mimetype definitions added for more common programming languages,
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java?rev=1491680&r1=1491679&r2=1491680&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java Tue Jun 11 03:16:26 2013
@@ -49,6 +49,9 @@ public class ElementMetadataHandler exte
private final String name;
private Property targetProperty;
+
+ private final boolean allowDuplicateValues;
+ private final boolean allowEmptyValues;
/**
* The buffer used to capture characters when inside a bag li element.
@@ -68,6 +71,14 @@ public class ElementMetadataHandler exte
private int matchLevel = 0;
private int parentMatchLevel = 0;
+ /**
+ * Constructor for string metadata keys.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
+ */
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
@@ -75,11 +86,46 @@ public class ElementMetadataHandler exte
this.localName = localName;
this.metadata = metadata;
this.name = name;
+ this.allowDuplicateValues = false;
+ this.allowEmptyValues = false;
if (logger.isTraceEnabled()) {
logger.trace("created simple handler for " + this.name);
}
}
+
+ /**
+ * Constructor for string metadata keys which allows change of behavior
+ * for duplicate and empty entry values.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
+ * @param allowDuplicateValues whether to add duplicate values to the Tika metadata
+ * @param allowEmptyValues whether to add empty values to the Tika metadata
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.name = name;
+ this.allowDuplicateValues = allowDuplicateValues;
+ this.allowEmptyValues = allowEmptyValues;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created simple handler for " + this.name);
+ }
+ }
+ /**
+ * Constructor for Property metadata keys.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
+ */
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty) {
super(metadata, targetProperty);
@@ -88,10 +134,38 @@ public class ElementMetadataHandler exte
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
+ this.allowDuplicateValues = false;
+ this.allowEmptyValues = false;
if (logger.isTraceEnabled()) {
logger.trace("created property handler for " + this.name);
}
}
+
+ /**
+ * Constructor for Property metadata keys which allows change of behavior
+ * for duplicate and empty entry values.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
+ * @param allowDuplicateValues whether to add duplicate values to the Tika metadata
+ * @param allowEmptyValues whether to add empty values to the Tika metadata
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ super(metadata, targetProperty);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.targetProperty = targetProperty;
+ this.name = targetProperty.getName();
+ this.allowDuplicateValues = allowDuplicateValues;
+ this.allowEmptyValues = allowEmptyValues;
+ if (logger.isTraceEnabled()) {
+ logger.trace("created property handler for " + this.name);
+ }
+ }
protected boolean isMatchingParentElement(String uri, String localName) {
return (uri.equals(this.uri) && localName.equals(this.localName));
@@ -165,9 +239,12 @@ public class ElementMetadataHandler exte
logger.trace("adding " + name + "=" + value);
}
if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
- if (value != null && value.length() > 0) {
+ if ((value != null && value.length() > 0) || allowEmptyValues) {
+ if (value == null || value.length() == 0 && allowEmptyValues) {
+ value = "";
+ }
String[] previous = metadata.getValues(name);
- if (previous == null || !Arrays.asList(previous).contains(value)) {
+ if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
metadata.add(targetProperty, value);
}
}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java?rev=1491680&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java Tue Jun 11 03:16:26 2013
@@ -0,0 +1,127 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+
+public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
+
+ private Property FIRST_NAME = Property.internalTextBag(
+ "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
+ private Property LAST_NAME = Property.internalTextBag(
+ "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
+
+ public void testDefaultBehavior() throws Exception {
+ InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testXML3.xml");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(4, metadata.getValues(FIRST_NAME).length);
+ assertEquals(2, metadata.getValues(LAST_NAME).length);
+
+ assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+ assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+ assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+ // Bob's last name is empty in the XML, so by default no entry is added and we can't tell one existed
+ assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+
+ // Kate's last name (Smith) is dropped by default because it duplicates John's
+ assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+ } finally {
+ input.close();
+ }
+ }
+
+ public void testEmptiesAndRepeats() throws Exception {
+ InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testXML3.xml");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(4, metadata.getValues(FIRST_NAME).length);
+ assertEquals(4, metadata.getValues(LAST_NAME).length);
+
+ assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+ assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+ assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+ assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+ assertEquals("", metadata.getValues(LAST_NAME)[2]);
+
+ assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
+ } finally {
+ input.close();
+ }
+ }
+
+ private class DefaultCustomXMLTestParser extends XMLParser {
+
+ private static final long serialVersionUID = 2458579047014545931L;
+
+ protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaMetadata, String localPart) {
+ return new ElementMetadataHandler(
+ "http://custom",
+ localPart,
+ metadata,
+ (Property) tikaMetadata);
+ }
+
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new TeeContentHandler(
+ super.getContentHandler(handler, metadata, context),
+ getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
+ getCustomElementHandler(metadata, LAST_NAME, "LastName"));
+ }
+ }
+
+ private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
+
+ private static final long serialVersionUID = 3735646809954466229L;
+
+ protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaMetadata, String localPart) {
+ return new ElementMetadataHandler(
+ "http://custom",
+ localPart,
+ metadata,
+ (Property) tikaMetadata,
+ true,
+ true);
+ }
+ }
+
+
+}
Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml?rev=1491680&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml Tue Jun 11 03:16:26 2013
@@ -0,0 +1,23 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>
+
+<rdf:Description rdf:about='test.jpg'
+ xmlns:custom='http://custom'>
+ <custom:FirstName>
+ <rdf:Bag>
+ <rdf:li>John</rdf:li>
+ <rdf:li>Jane</rdf:li>
+ <rdf:li>Bob</rdf:li>
+ <rdf:li>Kate</rdf:li>
+ </rdf:Bag>
+ </custom:FirstName>
+ <custom:LastName>
+ <rdf:Bag>
+ <rdf:li>Smith</rdf:li>
+ <rdf:li>Doe</rdf:li>
+ <rdf:li></rdf:li>
+ <rdf:li>Smith</rdf:li>
+ </rdf:Bag>
+ </custom:LastName>
+</rdf:Description>
+</rdf:RDF>
\ No newline at end of file
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml
------------------------------------------------------------------------------
svn:eol-style = native