You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/01/26 12:50:36 UTC

svn commit: r903187 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/odf/ main/java/org/apache/tika/parser/xml/ test/java/org/apache/tika/parser/opendocument/ test/resources/test-documents/

Author: jukka
Date: Tue Jan 26 11:50:36 2010
New Revision: 903187

URL: http://svn.apache.org/viewvc?rev=903187&view=rev
Log:
TIKA-365: Extract more OpenDocument metadata

Patch by Nick Burch

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odf   (with props)
Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=903187&r1=903186&r2=903187&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Tue Jan 26 11:50:36 2010
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.odf;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
 import org.apache.tika.parser.xml.DcXMLParser;
 import org.apache.tika.parser.xml.MetadataHandler;
 import org.apache.tika.sax.TeeContentHandler;
@@ -33,6 +34,8 @@
 
     private static final XPathParser META_XPATH = new XPathParser(
             "meta", "urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
+    // Key prefix for user-defined metadata, e.g. <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+    public static final String USER_DEFINED_METADATA_NAME_PREFIX = "custom:"; 
 
     private static ContentHandler getMeta(
             ContentHandler ch, Metadata md, String name, String element) {
@@ -44,6 +47,17 @@
         return new TeeContentHandler(ch, branch);
     }
 
+    private static ContentHandler getUserDefined( // returns a tee of ch plus a branch that records user-defined entries into md
+            ContentHandler ch, Metadata md) {
+        Matcher matcher = new CompositeMatcher( // combines the two XPath matchers below
+                META_XPATH.parse("//meta:user-defined/@meta:name"), // the meta:name attribute of meta:user-defined elements
+                META_XPATH.parse("//meta:user-defined//text()")); // text found beneath meta:user-defined elements
+        ContentHandler branch = new MatchingContentHandler( // wraps the metadata handler together with the matcher
+              new AttributeDependantMetadataHandler(md, "meta:name", USER_DEFINED_METADATA_NAME_PREFIX), // entry name = prefix + meta:name value
+              matcher);
+        return new TeeContentHandler(ch, branch); // wrap both the original handler and the new branch
+    }
+
     private static ContentHandler getStatistic(
             ContentHandler ch, Metadata md, String name, String attribute) {
         Matcher matcher =
@@ -54,9 +68,18 @@
     }
 
     protected ContentHandler getContentHandler(ContentHandler ch, Metadata md) {
+        // Process the Dublin Core Attributes 
         ch = super.getContentHandler(ch, md);
+        // Process the OO Meta Attributes
+        ch = getMeta(ch, md, Metadata.CREATION_DATE, "creation-date");
         ch = getMeta(ch, md, Metadata.KEYWORDS, "keyword");
+        ch = getMeta(ch, md, Metadata.EDIT_TIME, "editing-duration");
+        ch = getMeta(ch, md, "editing-cycles", "editing-cycles");
+        ch = getMeta(ch, md, "initial-creator", "initial-creator");
         ch = getMeta(ch, md, "generator", "generator");
+        // Process the user defined Meta Attributes
+        ch = getUserDefined(ch, md);
+        // Process the OO Statistics Attributes
         ch = getStatistic(ch, md, "nbTab", "table-count");
         ch = getStatistic(ch, md, "nbObject", "object-count");
         ch = getStatistic(ch, md, "nbImg", "image-count");

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java?rev=903187&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java Tue Jan 26 11:50:36 2010
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that adds a Metadata entry for the text content of an element.
+ * The buffered character data is used as the value, and the Metadata name is
+ *  read from a configurable attribute of the element, with an optional prefix
+ *  prepended. Values recorded under the same name are joined with ", ".
+ */
+public class AttributeDependantMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata; // destination for the extracted entries
+
+    private final String nameHoldingAttribute; // attribute (looked up via Attributes.getValue) whose value supplies the entry name
+    private final String namePrefix; // prepended to the attribute value; null means no prefix
+    private String name; // current entry name, prefix included. NOTE(review): never cleared once set, so an element lacking the attribute reuses the previous name -- confirm this is intended
+
+    private final StringBuilder buffer = new StringBuilder(); // text collected by characters() since the last endElement
+
+    public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) { // stores the three arguments; no validation is performed
+        this.metadata = metadata;
+        this.nameHoldingAttribute = nameHoldingAttribute;
+        this.namePrefix = namePrefix;
+    }
+
+    public void addMetadata(String value) { // stores value under the current name; does nothing if the name is unknown or value is empty
+        if(name == null || name.length() == 0) {
+           // No usable name: the name-holding attribute has not been seen, or it was empty and no prefix was set
+           return;
+        }
+        if (value.length() > 0) { // empty text never creates or modifies an entry
+            String previous = metadata.get(name);
+            if (previous != null && previous.length() > 0) {
+                value = previous + ", " + value; // append to the existing value instead of overwriting it
+            }
+            metadata.set(name, value);
+        }
+    }
+
+    public void endElement(String uri, String localName, String name) { // the 'name' parameter shadows the field and is unused here
+        addMetadata(buffer.toString()); // record the text gathered so far under the field 'name'
+        buffer.setLength(0); // reset the buffer for the next element
+    }
+
+    public void startElement( // updates the current entry name when the name-holding attribute is present
+            String uri, String localName, String name, Attributes attributes) {
+        String rawName = attributes.getValue(nameHoldingAttribute); // null when the attribute is absent
+        if (rawName != null) {
+           if (namePrefix == null) {
+              this.name = rawName;
+           } else {
+              this.name = namePrefix + rawName;
+           }
+        }
+        // All other attributes are ignored; when the attribute is absent the previous name is kept
+    }
+
+    
+    public void characters(char[] ch, int start, int length) { // accumulates character data until the next endElement
+        buffer.append(ch, start, length);
+    }
+
+}

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=903187&r1=903186&r2=903187&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java Tue Jan 26 11:50:36 2010
@@ -20,6 +20,12 @@
 import org.xml.sax.Attributes;
 import org.xml.sax.helpers.DefaultHandler;
 
+/**
+ * This adds Metadata entries with a specified name for
+ *  the textual content of a node (if present), and 
+ *  all attribute values passed through the matcher
+ *  (but not their names). 
+ */
 public class MetadataHandler extends DefaultHandler {
 
     private final Metadata metadata;

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java?rev=903187&r1=903186&r2=903187&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java Tue Jan 26 11:50:36 2010
@@ -38,7 +38,9 @@
                     "application/vnd.oasis.opendocument.text",
                     metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
+            assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
             assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
+            assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
             assertEquals(
                     "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
                     metadata.get("generator"));
@@ -49,6 +51,12 @@
             assertEquals("1", metadata.get("nbPara"));
             assertEquals("14", metadata.get("nbWord"));
             assertEquals("78", metadata.get("nbCharacter"));
+            
+            // Custom metadata tags present but without values
+            assertEquals(null, metadata.get("custom:Info 1"));
+            assertEquals(null, metadata.get("custom:Info 2"));
+            assertEquals(null, metadata.get("custom:Info 3"));
+            assertEquals(null, metadata.get("custom:Info 4"));
 
             String content = handler.toString();
             assertTrue(content.contains(
@@ -59,4 +67,104 @@
         }
     }
 
+    /**
+     * Similar to {@link #testXMLParser()}, but parses testOpenOffice2.odf, a different
+     *  OO2 file, and checks its core, user-defined ("custom:") and statistics metadata
+     */
+    public void testOO2Metadata() throws Exception {
+       InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
+             "/test-documents/testOpenOffice2.odf");
+       try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OpenOfficeParser().parse(input, handler, metadata);
+   
+            assertEquals( // the test file is expected to be detected as an OpenDocument formula
+                    "application/vnd.oasis.opendocument.formula",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals(null, metadata.get(Metadata.DATE)); // no DATE entry is expected for this file
+            assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
+            assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
+            assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+            assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
+            assertEquals("1", metadata.get("editing-cycles"));
+            assertEquals(
+                    "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
+                    metadata.get("generator"));
+            assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS)); // a single comma-separated keywords value is expected
+            
+            // User defined metadata, stored under the "custom:" prefix
+            assertEquals("Text 1", metadata.get("custom:Info 1"));
+            assertEquals("2", metadata.get("custom:Info 2"));
+            assertEquals("false", metadata.get("custom:Info 3"));
+            assertEquals("true", metadata.get("custom:Info 4"));
+            
+            // No statistics present, so none of the nb* entries may be set
+            assertEquals(null, metadata.get("nbTab"));
+            assertEquals(null, metadata.get("nbObject"));
+            assertEquals(null, metadata.get("nbImg"));
+            assertEquals(null, metadata.get("nbPage"));
+            assertEquals(null, metadata.get("nbPara"));
+            assertEquals(null, metadata.get("nbWord"));
+            assertEquals(null, metadata.get("nbCharacter"));
+   
+            // Note - contents of maths files not currently supported, so no text is expected
+            String content = handler.toString();
+            assertEquals("", content);
+       } finally {
+           input.close(); // close the resource stream even if an assertion fails
+       }
+    }
+
+    /**
+     * Similar to {@link #testXMLParser()}, but using an OO3 file (testODFwithOOo3.odt)
+     */
+    public void testOO3Metadata() throws Exception {
+       InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
+             "/test-documents/testODFwithOOo3.odt");
+       try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OpenOfficeParser().parse(input, handler, metadata);
+   
+            assertEquals(
+                    "application/vnd.oasis.opendocument.text",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("2009-10-05T21:22:38", metadata.get(Metadata.DATE));
+            assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
+            assertEquals("Apache Tika", metadata.get(Metadata.TITLE));
+            assertEquals("Test document", metadata.get(Metadata.SUBJECT));
+            assertEquals("A rather complex document", metadata.get(Metadata.DESCRIPTION));
+            assertEquals("Bart Hanssens", metadata.get(Metadata.CREATOR));
+            assertEquals("Bart Hanssens", metadata.get("initial-creator"));
+            assertEquals("2", metadata.get("editing-cycles"));
+            assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
+            assertEquals(
+                    "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
+                    metadata.get("generator"));
+            assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
+            
+            // User defined metadata: only "Editor" is expected, the "Info N" keys must be absent
+            assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
+            assertEquals(null, metadata.get("custom:Info 2"));
+            assertEquals(null, metadata.get("custom:Info 3"));
+            assertEquals(null, metadata.get("custom:Info 4"));
+            
+            // Statistics are present in this file
+            assertEquals("0", metadata.get("nbTab"));
+            assertEquals("2", metadata.get("nbObject"));
+            assertEquals("0", metadata.get("nbImg"));
+            assertEquals("2", metadata.get("nbPage"));
+            assertEquals("13", metadata.get("nbPara"));
+            assertEquals("54", metadata.get("nbWord"));
+            assertEquals("351", metadata.get("nbCharacter"));
+   
+            String content = handler.toString(); // extracted body text
+            assertTrue(content.contains(
+                  "Apache Tika Tika is part of the Lucene project."
+            ));
+       } finally {
+           input.close(); // close the resource stream even if an assertion fails
+       }
+    }
 }

Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odf
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odf?rev=903187&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream