You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/01/26 12:50:36 UTC
svn commit: r903187 - in /lucene/tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/odf/ main/java/org/apache/tika/parser/xml/
test/java/org/apache/tika/parser/opendocument/ test/resources/test-documents/
Author: jukka
Date: Tue Jan 26 11:50:36 2010
New Revision: 903187
URL: http://svn.apache.org/viewvc?rev=903187&view=rev
Log:
TIKA-365: Extract more OpenDocument metadata
Patch by Nick Burch
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odf (with props)
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=903187&r1=903186&r2=903187&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Tue Jan 26 11:50:36 2010
@@ -17,6 +17,7 @@
package org.apache.tika.parser.odf;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
import org.apache.tika.parser.xml.DcXMLParser;
import org.apache.tika.parser.xml.MetadataHandler;
import org.apache.tika.sax.TeeContentHandler;
@@ -33,6 +34,8 @@
private static final XPathParser META_XPATH = new XPathParser(
"meta", "urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
+ // Prefix put in front of the meta:name attribute value of each
+ // <meta:user-defined> element to build the Metadata key, eg
+ // <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+ public static final String USER_DEFINED_METADATA_NAME_PREFIX = "custom:";
private static ContentHandler getMeta(
ContentHandler ch, Metadata md, String name, String element) {
@@ -44,6 +47,17 @@
return new TeeContentHandler(ch, branch);
}
+    /**
+     * Wraps the given handler so that user defined metadata elements
+     * are also recorded in the given Metadata object. The entry name is
+     * the element's meta:name attribute value prefixed with
+     * USER_DEFINED_METADATA_NAME_PREFIX, the entry value is the element text.
+     */
+    private static ContentHandler getUserDefined(
+            ContentHandler ch, Metadata md) {
+        // The branch needs both the name-holding attribute and the text content
+        Matcher nameAttribute =
+            META_XPATH.parse("//meta:user-defined/@meta:name");
+        Matcher textContent =
+            META_XPATH.parse("//meta:user-defined//text()");
+        Matcher userDefined = new CompositeMatcher(nameAttribute, textContent);
+
+        ContentHandler recorder = new AttributeDependantMetadataHandler(
+                md, "meta:name", USER_DEFINED_METADATA_NAME_PREFIX);
+        ContentHandler branch = new MatchingContentHandler(recorder, userDefined);
+
+        // Events keep flowing to the original handler as well as to the branch
+        return new TeeContentHandler(ch, branch);
+    }
+
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
@@ -54,9 +68,18 @@
}
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md) {
+ // Process the Dublin Core Attributes
ch = super.getContentHandler(ch, md);
+ // Process the OO Meta Attributes
+ ch = getMeta(ch, md, Metadata.CREATION_DATE, "creation-date");
ch = getMeta(ch, md, Metadata.KEYWORDS, "keyword");
+ ch = getMeta(ch, md, Metadata.EDIT_TIME, "editing-duration");
+ ch = getMeta(ch, md, "editing-cycles", "editing-cycles");
+ ch = getMeta(ch, md, "initial-creator", "initial-creator");
ch = getMeta(ch, md, "generator", "generator");
+ // Process the user defined Meta Attributes
+ ch = getUserDefined(ch, md);
+ // Process the OO Statistics Attributes
ch = getStatistic(ch, md, "nbTab", "table-count");
ch = getStatistic(ch, md, "nbObject", "object-count");
ch = getStatistic(ch, md, "nbImg", "image-count");
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java?rev=903187&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java Tue Jan 26 11:50:36 2010
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds a Metadata entry for a given node.
+ * The textual content of the node is used as the
+ * value, and the Metadata name is taken from
+ * an attribute, with a prefix if required.
+ * <p>
+ * A name only applies to the element that declared it (and to any
+ * elements nested inside it that do not declare their own). Text of
+ * an element for which no name is in effect is discarded. If several
+ * values end up under the same name they are joined with ", ".
+ */
+public class AttributeDependantMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    /** Name of the attribute, as passed to Attributes.getValue(String), holding the entry name. */
+    private final String nameHoldingAttribute;
+
+    /** Prefix put in front of the attribute value, or null for no prefix. */
+    private final String namePrefix;
+
+    /** Metadata name currently in effect, or null if there is none. */
+    private String name;
+
+    /**
+     * Names that were in effect when each currently open element started,
+     * innermost last. Entries may be null.
+     */
+    private final List<String> enclosingNames = new ArrayList<String>();
+
+    private final StringBuilder buffer = new StringBuilder();
+
+    /**
+     * @param metadata the Metadata object that receives the entries
+     * @param nameHoldingAttribute the attribute whose value is the entry name
+     * @param namePrefix prefix for the entry name, or null for none
+     */
+    public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
+        this.metadata = metadata;
+        this.nameHoldingAttribute = nameHoldingAttribute;
+        this.namePrefix = namePrefix;
+    }
+
+    /**
+     * Records the value under the name currently in effect. Nothing is
+     * recorded when there is no name, or when the value is null or empty.
+     * A value already stored under the same name is kept, and the new
+     * value is appended to it after ", ".
+     *
+     * @param value the text to record
+     */
+    public void addMetadata(String value) {
+        if (name == null || name.length() == 0) {
+            // We didn't find the attribute which holds the name
+            return;
+        }
+        if (value == null || value.length() == 0) {
+            // Nothing to record
+            return;
+        }
+        String previous = metadata.get(name);
+        if (previous != null && previous.length() > 0) {
+            value = previous + ", " + value;
+        }
+        metadata.set(name, value);
+    }
+
+    /**
+     * Flushes the text collected so far under the name currently in
+     * effect, then restores the name of the enclosing element.
+     */
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        addMetadata(buffer.toString());
+        buffer.setLength(0);
+
+        // The name declared by this element must not leak into following
+        // siblings, so go back to whatever was in effect before it started.
+        // (The parameter shadows the field, hence the explicit "this".)
+        if (enclosingNames.isEmpty()) {
+            this.name = null;
+        } else {
+            this.name = enclosingNames.remove(enclosingNames.size() - 1);
+        }
+    }
+
+    /**
+     * Picks up the entry name from the name-holding attribute, if present.
+     * An element without that attribute keeps the enclosing element's name.
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        // Remember the name in effect outside of this element
+        enclosingNames.add(this.name);
+
+        String rawName = attributes.getValue(nameHoldingAttribute);
+        if (rawName != null) {
+            if (namePrefix == null) {
+                this.name = rawName;
+            } else {
+                this.name = namePrefix + rawName;
+            }
+        }
+        // All other attributes are ignored
+    }
+
+    /** Collects character data until the end of the current element. */
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        buffer.append(ch, start, length);
+    }
+
+}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=903187&r1=903186&r2=903187&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java Tue Jan 26 11:50:36 2010
@@ -20,6 +20,12 @@
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
+/**
+ * This adds Metadata entries with a specified name for
+ * the textual content of a node (if present), and
+ * all attribute values passed through the matcher
+ * (but not their names).
+ */
public class MetadataHandler extends DefaultHandler {
private final Metadata metadata;
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java?rev=903187&r1=903186&r2=903187&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/opendocument/OpenOfficeParserTest.java Tue Jan 26 11:50:36 2010
@@ -38,7 +38,9 @@
"application/vnd.oasis.opendocument.text",
metadata.get(Metadata.CONTENT_TYPE));
assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
+ assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
+ assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
assertEquals(
"NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
metadata.get("generator"));
@@ -49,6 +51,12 @@
assertEquals("1", metadata.get("nbPara"));
assertEquals("14", metadata.get("nbWord"));
assertEquals("78", metadata.get("nbCharacter"));
+
+ // Custom metadata tags present but without values
+ assertEquals(null, metadata.get("custom:Info 1"));
+ assertEquals(null, metadata.get("custom:Info 2"));
+ assertEquals(null, metadata.get("custom:Info 3"));
+ assertEquals(null, metadata.get("custom:Info 4"));
String content = handler.toString();
assertTrue(content.contains(
@@ -59,4 +67,104 @@
}
}
+ /**
+ * Similar to {@link #testXMLParser()}, but using a different
+ * OO2 file with different metadata in it. The file is reported
+ * as an OpenDocument formula, has values for its user defined
+ * metadata fields, and has no document statistics.
+ */
+ public void testOO2Metadata() throws Exception {
+ InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
+ "/test-documents/testOpenOffice2.odf");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new OpenOfficeParser().parse(input, handler, metadata);
+
+ // Standard metadata (no modification date is expected for this file)
+ assertEquals(
+ "application/vnd.oasis.opendocument.formula",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(null, metadata.get(Metadata.DATE));
+ assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+ assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
+ assertEquals("1", metadata.get("editing-cycles"));
+ assertEquals(
+ "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
+ metadata.get("generator"));
+ assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
+
+ // User defined metadata, stored under the "custom:" prefix
+ assertEquals("Text 1", metadata.get("custom:Info 1"));
+ assertEquals("2", metadata.get("custom:Info 2"));
+ assertEquals("false", metadata.get("custom:Info 3"));
+ assertEquals("true", metadata.get("custom:Info 4"));
+
+ // No statistics present
+ assertEquals(null, metadata.get("nbTab"));
+ assertEquals(null, metadata.get("nbObject"));
+ assertEquals(null, metadata.get("nbImg"));
+ assertEquals(null, metadata.get("nbPage"));
+ assertEquals(null, metadata.get("nbPara"));
+ assertEquals(null, metadata.get("nbWord"));
+ assertEquals(null, metadata.get("nbCharacter"));
+
+ // Note - contents of maths files not currently supported
+ String content = handler.toString();
+ assertEquals("", content);
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+ * Similar to {@link #testXMLParser()}, but using an OO3 file.
+ * The file has one user defined metadata field ("Editor") and
+ * a full set of document statistics.
+ */
+ public void testOO3Metadata() throws Exception {
+ InputStream input = OpenOfficeParserTest.class.getResourceAsStream(
+ "/test-documents/testODFwithOOo3.odt");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new OpenOfficeParser().parse(input, handler, metadata);
+
+ // Standard metadata
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("2009-10-05T21:22:38", metadata.get(Metadata.DATE));
+ assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("Apache Tika", metadata.get(Metadata.TITLE));
+ assertEquals("Test document", metadata.get(Metadata.SUBJECT));
+ assertEquals("A rather complex document", metadata.get(Metadata.DESCRIPTION));
+ assertEquals("Bart Hanssens", metadata.get(Metadata.CREATOR));
+ assertEquals("Bart Hanssens", metadata.get("initial-creator"));
+ assertEquals("2", metadata.get("editing-cycles"));
+ assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
+ assertEquals(
+ "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
+ metadata.get("generator"));
+ assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
+
+ // User defined metadata: only "Editor" is expected, no "Info N" entries
+ assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
+ assertEquals(null, metadata.get("custom:Info 2"));
+ assertEquals(null, metadata.get("custom:Info 3"));
+ assertEquals(null, metadata.get("custom:Info 4"));
+
+ // Statistics are present in this document
+ assertEquals("0", metadata.get("nbTab"));
+ assertEquals("2", metadata.get("nbObject"));
+ assertEquals("0", metadata.get("nbImg"));
+ assertEquals("2", metadata.get("nbPage"));
+ assertEquals("13", metadata.get("nbPara"));
+ assertEquals("54", metadata.get("nbWord"));
+ assertEquals("351", metadata.get("nbCharacter"));
+
+ // The body text must have been extracted too
+ String content = handler.toString();
+ assertTrue(content.contains(
+ "Apache Tika Tika is part of the Lucene project."
+ ));
+ } finally {
+ input.close();
+ }
+ }
}
Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odf
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odf?rev=903187&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream