You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/05/17 18:58:57 UTC

svn commit: r1339710 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/metadata/ tika-parsers/src/main/java/org/apache/tika/parser/odf/ tika-parsers/src/main/java/org/apache/tika/parser/xml/ tika-parsers/src/test/java/org/apache/tika/parser/odf/

Author: nick
Date: Thu May 17 16:58:57 2012
New Revision: 1339710

URL: http://svn.apache.org/viewvc?rev=1339710&view=rev
Log:
TIKA-929 Update the ODF Parser to use the new style Office properties

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MSOffice.java Thu May 17 16:58:57 2012
@@ -60,39 +60,39 @@ public interface MSOffice {
 
     
     /** The number of Slides are there in the (presentation) document */
-    Property SLIDE_COUNT = 
+    @Deprecated Property SLIDE_COUNT = 
        Property.internalInteger("Slide-Count");
     
     /** The number of Pages are there in the (paged) document */
-    Property PAGE_COUNT = 
+    @Deprecated Property PAGE_COUNT = 
        Property.internalInteger("Page-Count");
 
     /** The number of individual Paragraphs in the document */ 
-    Property PARAGRAPH_COUNT = 
+    @Deprecated Property PARAGRAPH_COUNT = 
        Property.internalInteger("Paragraph-Count");
     
     /** The number of lines in the document */
-    Property LINE_COUNT = 
+    @Deprecated Property LINE_COUNT = 
        Property.internalInteger("Line-Count");
 
     /** The number of Words in the document */
-    Property WORD_COUNT = 
+    @Deprecated Property WORD_COUNT = 
        Property.internalInteger("Word-Count");
 
     /** The number of Characters in the document */
-    Property CHARACTER_COUNT = 
+    @Deprecated Property CHARACTER_COUNT = 
        Property.internalInteger("Character Count");
     
     /** The number of Characters in the document, including spaces */
-    Property CHARACTER_COUNT_WITH_SPACES = 
+    @Deprecated Property CHARACTER_COUNT_WITH_SPACES = 
        Property.internalInteger("Character-Count-With-Spaces");
 
     /** The number of Tables in the document */
-    Property TABLE_COUNT = 
+    @Deprecated Property TABLE_COUNT = 
        Property.internalInteger("Table-Count");
     
     /** The number of Images in the document */
-    Property IMAGE_COUNT = 
+    @Deprecated Property IMAGE_COUNT = 
        Property.internalInteger("Image-Count");
     
     /** 
@@ -100,7 +100,7 @@ public interface MSOffice {
      * This is typically non-Image resources embedded in the
      *  document, such as other documents or non-Image media. 
      */
-    Property OBJECT_COUNT = 
+    @Deprecated Property OBJECT_COUNT = 
        Property.internalInteger("Object-Count");
 
     

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Thu May 17 16:58:57 2012
@@ -16,12 +16,16 @@
  */
 package org.apache.tika.parser.odf;
 
+import org.apache.tika.metadata.MSOffice;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
-import org.apache.tika.parser.xml.DcXMLParser;
 import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.DcXMLParser;
 import org.apache.tika.parser.xml.MetadataHandler;
 import org.apache.tika.sax.TeeContentHandler;
 import org.apache.tika.sax.xpath.CompositeMatcher;
@@ -43,12 +47,12 @@ public class OpenDocumentMetaParser exte
     private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
 
     private static ContentHandler getMeta(
-            ContentHandler ch, Metadata md, String name, String element) {
+            ContentHandler ch, Metadata md, Property property, String element) {
         Matcher matcher = new CompositeMatcher(
                 META_XPATH.parse("//meta:" + element),
                 META_XPATH.parse("//meta:" + element + "//text()"));
         ContentHandler branch =
-            new MatchingContentHandler(new MetadataHandler(md, name), matcher);
+            new MatchingContentHandler(new MetadataHandler(md, property), matcher);
         return new TeeContentHandler(ch, branch);
     }
 
@@ -64,7 +68,7 @@ public class OpenDocumentMetaParser exte
         return new TeeContentHandler(ch, branch);
     }
 
-    private static ContentHandler getStatistic(
+    @Deprecated private static ContentHandler getStatistic(
             ContentHandler ch, Metadata md, String name, String attribute) {
         Matcher matcher =
             META_XPATH.parse("//meta:document-statistic/@meta:"+attribute);
@@ -72,30 +76,50 @@ public class OpenDocumentMetaParser exte
               new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
         return new TeeContentHandler(ch, branch);
     }
+    private static ContentHandler getStatistic(
+          ContentHandler ch, Metadata md, Property property, String attribute) {
+      Matcher matcher =
+          META_XPATH.parse("//meta:document-statistic/@meta:"+attribute);
+      ContentHandler branch = new MatchingContentHandler(
+            new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+      return new TeeContentHandler(ch, branch);
+  }
 
     protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
         // Process the Dublin Core Attributes 
         ch = super.getContentHandler(ch, md, context);
+        
         // Process the OO Meta Attributes
-        ch = getMeta(ch, md, Metadata.CREATION_DATE.getName(), "creation-date");
-        ch = getMeta(ch, md, Metadata.KEYWORDS, "keyword");
-        ch = getMeta(ch, md, Metadata.EDIT_TIME, "editing-duration");
-        ch = getMeta(ch, md, "editing-cycles", "editing-cycles");
-        ch = getMeta(ch, md, "initial-creator", "initial-creator");
-        ch = getMeta(ch, md, "generator", "generator");
+        ch = getMeta(ch, md, TikaCoreProperties.CREATION_DATE, "creation-date");
+        ch = getMeta(ch, md, TikaCoreProperties.KEYWORDS, "keyword");
+        
+        ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");        
+        ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+        ch = getMeta(ch, md, Property.externalText("initial-creator"), "initial-creator");
+        ch = getMeta(ch, md, Property.externalText("generator"), "generator");
         
         // Process the user defined Meta Attributes
         ch = getUserDefined(ch, md);
         
         // Process the OO Statistics Attributes
-        ch = getStatistic(ch, md, Metadata.OBJECT_COUNT.getName(), "object-count");
-        ch = getStatistic(ch, md, Metadata.IMAGE_COUNT.getName(),  "image-count");
-        ch = getStatistic(ch, md, Metadata.PAGE_COUNT.getName(),   "page-count");
-        ch = getStatistic(ch, md, PagedText.N_PAGES.getName(),     "page-count");
-        ch = getStatistic(ch, md, Metadata.TABLE_COUNT.getName(),  "table-count");
-        ch = getStatistic(ch, md, Metadata.PARAGRAPH_COUNT.getName(), "paragraph-count");
-        ch = getStatistic(ch, md, Metadata.WORD_COUNT.getName(),      "word-count");
-        ch = getStatistic(ch, md, Metadata.CHARACTER_COUNT.getName(), "character-count");
+        ch = getStatistic(ch, md, Office.OBJECT_COUNT,  "object-count");
+        ch = getStatistic(ch, md, Office.IMAGE_COUNT,   "image-count");
+        ch = getStatistic(ch, md, Office.PAGE_COUNT,    "page-count");
+        ch = getStatistic(ch, md, PagedText.N_PAGES,    "page-count");
+        ch = getStatistic(ch, md, Office.TABLE_COUNT,   "table-count");
+        ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+        ch = getStatistic(ch, md, Office.WORD_COUNT,      "word-count");
+        ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+        
+        // Legacy, Tika-1.0 style attributes
+        // TODO Remove these in Tika 2.0
+        ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT,  "object-count");
+        ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT,   "image-count");
+        ch = getStatistic(ch, md, MSOffice.PAGE_COUNT,    "page-count");
+        ch = getStatistic(ch, md, MSOffice.TABLE_COUNT,   "table-count");
+        ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+        ch = getStatistic(ch, md, MSOffice.WORD_COUNT,      "word-count");
+        ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
         
         // Legacy Statistics Attributes, replaced with real keys above
         // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java Thu May 17 16:58:57 2012
@@ -20,6 +20,7 @@ import java.util.Arrays;
 import java.util.List;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
@@ -30,13 +31,19 @@ import org.xml.sax.helpers.DefaultHandle
 class AbstractMetadataHandler extends DefaultHandler {
 
     private final Metadata metadata;
-
+    private final Property property;
     private final String name;
 
     protected AbstractMetadataHandler(Metadata metadata, String name) {
         this.metadata = metadata;
+        this.property = null;
         this.name = name;
     }
+    protected AbstractMetadataHandler(Metadata metadata, Property property) {
+       this.metadata = metadata;
+       this.property = property;
+       this.name = property.getName();
+   }
 
     /**
      * Adds the given metadata value. The value is ignored if it is
@@ -51,20 +58,31 @@ class AbstractMetadataHandler extends De
                 // Add the value, assuming it's not already there
                 List<String> previous = Arrays.asList(metadata.getValues(name));
                 if (!previous.contains(value)) {
-                    metadata.add(name, value);
+                    if (property != null) {
+                       metadata.add(property, value);
+                    } else {
+                       metadata.add(name, value);
+                    }
                 }
             } else {
                 // Set the value, assuming it's not already there
                 String previous = metadata.get(name);
                 if (previous != null && previous.length() > 0) {
                     if (!previous.equals(value)) {
-                        metadata.add(name, value);
+                       if (property != null) {
+                          metadata.add(property, value);
+                       } else {
+                          metadata.add(name, value);
+                       }
                     }
                 } else {
-                    metadata.set(name, value);
+                   if (property != null) {
+                      metadata.set(property, value);
+                   } else {
+                      metadata.set(name, value);
+                   }
                 }
             }
         }
     }
-
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java Thu May 17 16:58:57 2012
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.xml;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
@@ -38,6 +39,12 @@ public class AttributeMetadataHandler ex
         this.uri = uri;
         this.localName = localName;
     }
+    public AttributeMetadataHandler(
+          String uri, String localName, Metadata metadata, Property property) {
+      super(metadata, property);
+      this.uri = uri;
+      this.localName = localName;
+  }
 
     @Override
     public void startElement(

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java Thu May 17 16:58:57 2012
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.xml;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.xml.sax.Attributes;
 import org.xml.sax.helpers.DefaultHandler;
 
@@ -33,14 +34,21 @@ public class MetadataHandler extends Def
 
     private final Metadata metadata;
 
+    private final Property property;
     private final String name;
 
     private final StringBuilder buffer = new StringBuilder();
 
     public MetadataHandler(Metadata metadata, String name) {
         this.metadata = metadata;
+        this.property = null;
         this.name = name;
     }
+    public MetadataHandler(Metadata metadata, Property property) {
+       this.metadata = metadata;
+       this.property = property;
+       this.name = property.getName();
+   }
 
     public void addMetadata(String value) {
         if (value.length() > 0) {
@@ -48,7 +56,12 @@ public class MetadataHandler extends Def
             if (previous != null && previous.length() > 0) {
                 value = previous + ", " + value;
             }
-            metadata.set(name, value);
+            
+            if (this.property != null) {
+               metadata.set(property, value);
+            } else {
+               metadata.set(name, value);
+            }
         }
     }
 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1339710&r1=1339709&r2=1339710&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Thu May 17 16:58:57 2012
@@ -20,6 +20,7 @@ import java.io.InputStream;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -85,6 +86,15 @@ public class ODFParserTest extends TikaT
                    metadata.get("generator"));
              
              // Check the document statistics
+             assertEquals("1", metadata.get(Office.PAGE_COUNT));
+             assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
+             assertEquals("14", metadata.get(Office.WORD_COUNT));
+             assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
+             assertEquals("0", metadata.get(Office.TABLE_COUNT));
+             assertEquals("0", metadata.get(Office.OBJECT_COUNT));
+             assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+             
+             // Check the Tika-1.0 style document statistics
              assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
              assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
              assertEquals("14", metadata.get(Metadata.WORD_COUNT));
@@ -93,7 +103,7 @@ public class ODFParserTest extends TikaT
              assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
              assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
              
-             // Check the old style statistics (these will be removed shortly)
+             // Check the very old style statistics (these will be removed shortly)
              assertEquals("0", metadata.get("nbTab"));
              assertEquals("0", metadata.get("nbObject"));
              assertEquals("0", metadata.get("nbImg"));
@@ -209,6 +219,15 @@ public class ODFParserTest extends TikaT
            assertEquals(null, metadata.get("custom:Info 4"));
            
            // Check the document statistics
+           assertEquals("2", metadata.get(Office.PAGE_COUNT));
+           assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
+           assertEquals("54", metadata.get(Office.WORD_COUNT));
+           assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
+           assertEquals("0", metadata.get(Office.TABLE_COUNT));
+           assertEquals("2", metadata.get(Office.OBJECT_COUNT));
+           assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+           
+           // Check the Tika-1.0 style document statistics
            assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
            assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
            assertEquals("54", metadata.get(Metadata.WORD_COUNT));