You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/05/17 18:58:57 UTC
svn commit: r1339710 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/metadata/
tika-parsers/src/main/java/org/apache/tika/parser/odf/
tika-parsers/src/main/java/org/apache/tika/parser/xml/
tika-parsers/src/test/java/org/apache/tika/parser/odf/
Author: nick
Date: Thu May 17 16:58:57 2012
New Revision: 1339710
URL: http://svn.apache.org/viewvc?rev=1339710&view=rev
Log:
TIKA-929 Update the ODF Parser to use the new-style Office properties
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java Thu May 17 16:58:57 2012
@@ -60,39 +60,39 @@ public interface MSOffice {
/** The number of Slides are there in the (presentation) document */
- Property SLIDE_COUNT =
+ @Deprecated Property SLIDE_COUNT =
Property.internalInteger("Slide-Count");
/** The number of Pages are there in the (paged) document */
- Property PAGE_COUNT =
+ @Deprecated Property PAGE_COUNT =
Property.internalInteger("Page-Count");
/** The number of individual Paragraphs in the document */
- Property PARAGRAPH_COUNT =
+ @Deprecated Property PARAGRAPH_COUNT =
Property.internalInteger("Paragraph-Count");
/** The number of lines in the document */
- Property LINE_COUNT =
+ @Deprecated Property LINE_COUNT =
Property.internalInteger("Line-Count");
/** The number of Words in the document */
- Property WORD_COUNT =
+ @Deprecated Property WORD_COUNT =
Property.internalInteger("Word-Count");
/** The number of Characters in the document */
- Property CHARACTER_COUNT =
+ @Deprecated Property CHARACTER_COUNT =
Property.internalInteger("Character Count");
/** The number of Characters in the document, including spaces */
- Property CHARACTER_COUNT_WITH_SPACES =
+ @Deprecated Property CHARACTER_COUNT_WITH_SPACES =
Property.internalInteger("Character-Count-With-Spaces");
/** The number of Tables in the document */
- Property TABLE_COUNT =
+ @Deprecated Property TABLE_COUNT =
Property.internalInteger("Table-Count");
/** The number of Images in the document */
- Property IMAGE_COUNT =
+ @Deprecated Property IMAGE_COUNT =
Property.internalInteger("Image-Count");
/**
@@ -100,7 +100,7 @@ public interface MSOffice {
* This is typically non-Image resources embedded in the
* document, such as other documents or non-Image media.
*/
- Property OBJECT_COUNT =
+ @Deprecated Property OBJECT_COUNT =
Property.internalInteger("Object-Count");
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Thu May 17 16:58:57 2012
@@ -16,12 +16,16 @@
*/
package org.apache.tika.parser.odf;
+import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
-import org.apache.tika.parser.xml.DcXMLParser;
import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.DcXMLParser;
import org.apache.tika.parser.xml.MetadataHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.xpath.CompositeMatcher;
@@ -43,12 +47,12 @@ public class OpenDocumentMetaParser exte
private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
private static ContentHandler getMeta(
- ContentHandler ch, Metadata md, String name, String element) {
+ ContentHandler ch, Metadata md, Property property, String element) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:" + element),
META_XPATH.parse("//meta:" + element + "//text()"));
ContentHandler branch =
- new MatchingContentHandler(new MetadataHandler(md, name), matcher);
+ new MatchingContentHandler(new MetadataHandler(md, property), matcher);
return new TeeContentHandler(ch, branch);
}
@@ -64,7 +68,7 @@ public class OpenDocumentMetaParser exte
return new TeeContentHandler(ch, branch);
}
- private static ContentHandler getStatistic(
+ @Deprecated private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:"+attribute);
@@ -72,30 +76,50 @@ public class OpenDocumentMetaParser exte
new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
return new TeeContentHandler(ch, branch);
}
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, Property property, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:"+attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
// Process the Dublin Core Attributes
ch = super.getContentHandler(ch, md, context);
+
// Process the OO Meta Attributes
- ch = getMeta(ch, md, Metadata.CREATION_DATE.getName(), "creation-date");
- ch = getMeta(ch, md, Metadata.KEYWORDS, "keyword");
- ch = getMeta(ch, md, Metadata.EDIT_TIME, "editing-duration");
- ch = getMeta(ch, md, "editing-cycles", "editing-cycles");
- ch = getMeta(ch, md, "initial-creator", "initial-creator");
- ch = getMeta(ch, md, "generator", "generator");
+ ch = getMeta(ch, md, TikaCoreProperties.CREATION_DATE, "creation-date");
+ ch = getMeta(ch, md, TikaCoreProperties.KEYWORDS, "keyword");
+
+ ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+ ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+ ch = getMeta(ch, md, Property.externalText("initial-creator"), "initial-creator");
+ ch = getMeta(ch, md, Property.externalText("generator"), "generator");
// Process the user defined Meta Attributes
ch = getUserDefined(ch, md);
// Process the OO Statistics Attributes
- ch = getStatistic(ch, md, Metadata.OBJECT_COUNT.getName(), "object-count");
- ch = getStatistic(ch, md, Metadata.IMAGE_COUNT.getName(), "image-count");
- ch = getStatistic(ch, md, Metadata.PAGE_COUNT.getName(), "page-count");
- ch = getStatistic(ch, md, PagedText.N_PAGES.getName(), "page-count");
- ch = getStatistic(ch, md, Metadata.TABLE_COUNT.getName(), "table-count");
- ch = getStatistic(ch, md, Metadata.PARAGRAPH_COUNT.getName(), "paragraph-count");
- ch = getStatistic(ch, md, Metadata.WORD_COUNT.getName(), "word-count");
- ch = getStatistic(ch, md, Metadata.CHARACTER_COUNT.getName(), "character-count");
+ ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+ ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+
+ // Legacy, Tika-1.0 style attributes
+ // TODO Remove these in Tika 2.0
+ ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
// Legacy Statistics Attributes, replaced with real keys above
// TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java Thu May 17 16:58:57 2012
@@ -20,6 +20,7 @@ import java.util.Arrays;
import java.util.List;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.xml.sax.helpers.DefaultHandler;
/**
@@ -30,13 +31,19 @@ import org.xml.sax.helpers.DefaultHandle
class AbstractMetadataHandler extends DefaultHandler {
private final Metadata metadata;
-
+ private final Property property;
private final String name;
protected AbstractMetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
+ this.property = null;
this.name = name;
}
+ protected AbstractMetadataHandler(Metadata metadata, Property property) {
+ this.metadata = metadata;
+ this.property = property;
+ this.name = property.getName();
+ }
/**
* Adds the given metadata value. The value is ignored if it is
@@ -51,20 +58,31 @@ class AbstractMetadataHandler extends De
// Add the value, assuming it's not already there
List<String> previous = Arrays.asList(metadata.getValues(name));
if (!previous.contains(value)) {
- metadata.add(name, value);
+ if (property != null) {
+ metadata.add(property, value);
+ } else {
+ metadata.add(name, value);
+ }
}
} else {
// Set the value, assuming it's not already there
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
if (!previous.equals(value)) {
- metadata.add(name, value);
+ if (property != null) {
+ metadata.add(property, value);
+ } else {
+ metadata.add(name, value);
+ }
}
} else {
- metadata.set(name, value);
+ if (property != null) {
+ metadata.set(property, value);
+ } else {
+ metadata.set(name, value);
+ }
}
}
}
}
-
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java Thu May 17 16:58:57 2012
@@ -17,6 +17,7 @@
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@@ -38,6 +39,12 @@ public class AttributeMetadataHandler ex
this.uri = uri;
this.localName = localName;
}
+ public AttributeMetadataHandler(
+ String uri, String localName, Metadata metadata, Property property) {
+ super(metadata, property);
+ this.uri = uri;
+ this.localName = localName;
+ }
@Override
public void startElement(
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java Thu May 17 16:58:57 2012
@@ -17,6 +17,7 @@
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
@@ -33,14 +34,21 @@ public class MetadataHandler extends Def
private final Metadata metadata;
+ private final Property property;
private final String name;
private final StringBuilder buffer = new StringBuilder();
public MetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
+ this.property = null;
this.name = name;
}
+ public MetadataHandler(Metadata metadata, Property property) {
+ this.metadata = metadata;
+ this.property = property;
+ this.name = property.getName();
+ }
public void addMetadata(String value) {
if (value.length() > 0) {
@@ -48,7 +56,12 @@ public class MetadataHandler extends Def
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
- metadata.set(name, value);
+
+ if (this.property != null) {
+ metadata.set(property, value);
+ } else {
+ metadata.set(name, value);
+ }
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Thu May 17 16:58:57 2012
@@ -20,6 +20,7 @@ import java.io.InputStream;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -85,6 +86,15 @@ public class ODFParserTest extends TikaT
metadata.get("generator"));
// Check the document statistics
+ assertEquals("1", metadata.get(Office.PAGE_COUNT));
+ assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
+ assertEquals("14", metadata.get(Office.WORD_COUNT));
+ assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
+ assertEquals("0", metadata.get(Office.TABLE_COUNT));
+ assertEquals("0", metadata.get(Office.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+
+ // Check the Tika-1.0 style document statistics
assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
assertEquals("14", metadata.get(Metadata.WORD_COUNT));
@@ -93,7 +103,7 @@ public class ODFParserTest extends TikaT
assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
- // Check the old style statistics (these will be removed shortly)
+ // Check the very old style statistics (these will be removed shortly)
assertEquals("0", metadata.get("nbTab"));
assertEquals("0", metadata.get("nbObject"));
assertEquals("0", metadata.get("nbImg"));
@@ -209,6 +219,15 @@ public class ODFParserTest extends TikaT
assertEquals(null, metadata.get("custom:Info 4"));
// Check the document statistics
+ assertEquals("2", metadata.get(Office.PAGE_COUNT));
+ assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
+ assertEquals("54", metadata.get(Office.WORD_COUNT));
+ assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
+ assertEquals("0", metadata.get(Office.TABLE_COUNT));
+ assertEquals("2", metadata.get(Office.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+
+ // Check the Tika-1.0 style document statistics
assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
assertEquals("54", metadata.get(Metadata.WORD_COUNT));