You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by rg...@apache.org on 2013/06/11 05:16:27 UTC

svn commit: r1491680 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/xml/ tika-parsers/src/test/java/org/apache/tika/parser/xml/ tika-parsers/src/test/resources/test-documents/

Author: rgauss
Date: Tue Jun 11 03:16:26 2013
New Revision: 1491680

URL: http://svn.apache.org/r1491680
Log:
TIKA-1133: Ability to Allow Empty and Duplicate Tika Values for XML Elements
   - Added constructors in ElementMetadataHandler to specify allowing duplicates and empty values
   - Added a unit test and test data which confirms the default and override behaviors

Added:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1491680&r1=1491679&r2=1491680&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Jun 11 03:16:26 2013
@@ -31,6 +31,9 @@ Release 1.4 Current Development
   * MS Word: line tabular character is now replaced with newline
     (TIKA-1128)
 
+  * XML: ElementMetadataHandlers can now optionally accept duplicate
+    and empty values (TIKA-1133)
+
 Release 1.3 - 01/19/2013
 
   * Mimetype definitions added for more common programming languages,

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java?rev=1491680&r1=1491679&r2=1491680&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java Tue Jun 11 03:16:26 2013
@@ -49,6 +49,9 @@ public class ElementMetadataHandler exte
 
     private final String name;
     private Property targetProperty;
+    
+    private final boolean allowDuplicateValues;
+    private final boolean allowEmptyValues;
 
     /**
      * The buffer used to capture characters when inside a bag li element.
@@ -68,6 +71,14 @@ public class ElementMetadataHandler exte
     private int matchLevel = 0;
     private int parentMatchLevel = 0;
 
+    /**
+     * Constructor for string metadata keys.
+     * 
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param name the Tika metadata field key
+     */
     public ElementMetadataHandler(
             String uri, String localName, Metadata metadata, String name) {
         super(metadata, name);
@@ -75,11 +86,46 @@ public class ElementMetadataHandler exte
         this.localName = localName;
         this.metadata = metadata;
         this.name = name;
+        this.allowDuplicateValues = false;
+        this.allowEmptyValues = false;
         if (logger.isTraceEnabled()) {
     		logger.trace("created simple handler for " + this.name);
     	}
     }
+    
+    /**
+     * Constructor for string metadata keys which also records the flags for
+     * duplicate and empty values. NOTE(review): confirm they apply to string keys.
+     * 
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param name the Tika metadata field key
+     * @param allowDuplicateValues whether duplicate values may be added to the Tika metadata
+     * @param allowEmptyValues whether empty values may be added to the Tika metadata
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.name = name;
+        this.allowDuplicateValues = allowDuplicateValues;
+        this.allowEmptyValues = allowEmptyValues;
+        if (logger.isTraceEnabled()) {
+                logger.trace("created simple handler for " + this.name);
+        }
+    }
 
+    /**
+     * Constructor for Property metadata keys.
+     * 
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param targetProperty the Tika metadata Property key
+     */
     public ElementMetadataHandler(
             String uri, String localName, Metadata metadata, Property targetProperty) {
         super(metadata, targetProperty);
@@ -88,10 +134,38 @@ public class ElementMetadataHandler exte
         this.metadata = metadata;
         this.targetProperty = targetProperty;
         this.name = targetProperty.getName();
+        this.allowDuplicateValues = false;
+        this.allowEmptyValues = false;
         if (logger.isTraceEnabled()) {
     		logger.trace("created property handler for " + this.name);
     	}
     }
+    
+    /**
+     * Constructor for Property metadata keys that lets the caller override the
+     * default handling (both flags false) of duplicate and empty values.
+     * 
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param targetProperty the Tika metadata Property key
+     * @param allowDuplicateValues if true, add a value even if the multi-valued property already contains it
+     * @param allowEmptyValues if true, add null or empty values to a multi-valued property as an empty string
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
+        super(metadata, targetProperty);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.targetProperty = targetProperty;
+        this.name = targetProperty.getName();
+        this.allowDuplicateValues = allowDuplicateValues;
+        this.allowEmptyValues = allowEmptyValues;
+        if (logger.isTraceEnabled()) {
+                logger.trace("created property handler for " + this.name);
+        }
+    }
 
     protected boolean isMatchingParentElement(String uri, String localName) {
         return (uri.equals(this.uri) && localName.equals(this.localName));
@@ -165,9 +239,12 @@ public class ElementMetadataHandler exte
             logger.trace("adding " + name + "=" + value);
         }
         if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
-            if (value != null && value.length() > 0) {
+            if ((value != null && value.length() > 0) || allowEmptyValues) {
+                if (value == null || value.length() == 0 && allowEmptyValues) {
+                    value = "";
+                }
                 String[] previous = metadata.getValues(name);
-                if (previous == null || !Arrays.asList(previous).contains(value)) {
+                if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
                     metadata.add(targetProperty, value);
                 }
             }

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java?rev=1491680&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java Tue Jun 11 03:16:26 2013
@@ -0,0 +1,127 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+
+public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
+    
+    private Property FIRST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
+    private Property LAST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
+
+    public void testDefaultBehavior() throws Exception {
+        InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML3.xml");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+            
+            assertEquals(4, metadata.getValues(FIRST_NAME).length);
+            assertEquals(2, metadata.getValues(LAST_NAME).length);
+            
+            assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+            
+            assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+            assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+            
+            // Bob's last name is an empty element, so by default it is skipped and leaves no entry at all
+            assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+            
+            // Kate's last name ("Smith") duplicates John's, so by default it is not added a second time
+            assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+        } finally {
+            input.close();
+        }
+    }
+    
+    public void testEmptiesAndRepeats() throws Exception {
+        InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML3.xml");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+            
+            assertEquals(4, metadata.getValues(FIRST_NAME).length);
+            assertEquals(4, metadata.getValues(LAST_NAME).length);
+            
+            assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+            
+            assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+            assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+            
+            assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+            assertEquals("", metadata.getValues(LAST_NAME)[2]);
+            
+            assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
+        } finally {
+            input.close();
+        }
+    }
+    
+    private class DefaultCustomXMLTestParser extends XMLParser {
+    
+        private static final long serialVersionUID = 2458579047014545931L;
+
+        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaMetadata, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom",
+                    localPart,
+                    metadata,
+                    (Property) tikaMetadata);
+        }
+        
+        protected ContentHandler getContentHandler(
+                ContentHandler handler, Metadata metadata, ParseContext context) {
+            return new TeeContentHandler(
+                    super.getContentHandler(handler, metadata, context),
+                    getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
+                    getCustomElementHandler(metadata, LAST_NAME, "LastName"));
+        }
+    }
+    
+    private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
+        
+        private static final long serialVersionUID = 3735646809954466229L;
+
+        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaMetadata, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom",
+                    localPart,
+                    metadata,
+                    (Property) tikaMetadata,
+                    true,
+                    true);
+        }
+    }
+    
+    
+}

Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml?rev=1491680&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml Tue Jun 11 03:16:26 2013
@@ -0,0 +1,23 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>
+
+<rdf:Description rdf:about='test.jpg'
+  xmlns:custom='http://custom'>
+ <custom:FirstName>
+  <rdf:Bag>
+   <rdf:li>John</rdf:li>
+   <rdf:li>Jane</rdf:li>
+   <rdf:li>Bob</rdf:li>
+   <rdf:li>Kate</rdf:li>
+  </rdf:Bag>
+ </custom:FirstName>
+ <custom:LastName>
+  <rdf:Bag>
+   <rdf:li>Smith</rdf:li>
+   <rdf:li>Doe</rdf:li>
+   <rdf:li></rdf:li>
+   <rdf:li>Smith</rdf:li>
+  </rdf:Bag>
+ </custom:LastName>
+</rdf:Description>
+</rdf:RDF>
\ No newline at end of file

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML3.xml
------------------------------------------------------------------------------
    svn:eol-style = native