You are viewing a plain text version of this content. The canonical link for it is here.
Posted to slide-dev@jakarta.apache.org by lu...@apache.org on 2005/04/04 15:46:26 UTC
cvs commit: jakarta-slide/src/share/org/apache/slide/extractor OfficeExtractor.java
luetzkendorf 2005/04/04 06:46:26
Modified: src/share/org/apache/slide/extractor OfficeExtractor.java
Log:
reworked
- labeled (userdefined) properties now can be extracted
- extracted properties now can have namespaces
Revision Changes Path
1.5 +152 -24 jakarta-slide/src/share/org/apache/slide/extractor/OfficeExtractor.java
Index: OfficeExtractor.java
===================================================================
RCS file: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/OfficeExtractor.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- OfficeExtractor.java 14 Jan 2005 18:34:13 -0000 1.4
+++ OfficeExtractor.java 4 Apr 2005 13:46:25 -0000 1.5
@@ -1,7 +1,7 @@
package org.apache.slide.extractor;
import java.io.InputStream;
-import java.util.ArrayList;
+import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
@@ -15,17 +15,86 @@
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+import org.apache.slide.common.PropertyName;
import org.apache.slide.util.conf.Configurable;
import org.apache.slide.util.conf.Configuration;
import org.apache.slide.util.conf.ConfigurationException;
/**
* Property extractor for Microsoft office documents.
+ *
+ * <p>This property extractor extracts properties from <code>SummaryInformation</code> and
+ * <code>DocumentSummaryInformation</code> headers of office documents.
+ *
+ * <p>Sample configuration:
+ * <pre>
+ * <extractor classname="org.apache.slide.extractor.OfficeExtractor" uri="/files/docs/">
+ * <configuration>
+ * <instruction property="author" namespace="http://mycomp.com/namepsaces/webdav" summary-information="4" />
+ * <instruction property="application" namespace="http://mycomp.com/namepsaces/webdav" summary-information="18" />
+ * <instruction property="title" namespace="http://mycomp.com/namepsaces/webdav" summary-information="2" />
+ * <instruction property="category" namespace="http://mycomp.com/namepsaces/webdav" document-summary-information="2" />
+ * <instruction property="docid" namespace="http://mycomp.com/namepsaces/webdav" label="Document-ID" />
+ * </configuration>
+ * </extractor>
+ * </pre>
+ * The sample configuration
+ * <ul>
+ * <li>maps the <em>author</em> info of office documents to the <code>author</code>
+ * property. The author info can be found in the <code>SummaryInformation</code> header and
+ * has the <code>id</code> 4.
+ * <li>and maps the <em>category</em> entry of the <code>DocumentSummaryInformation</code> header,
+ * which has the <code>id</code> 2 to the WebDAV property <code>category</code>.
+ * <li><code>SummaryInformation</code> headers can also contain "labeled" entries, e.g. for
+ * user-defined metadata. In the sample the labeled entries with the label <code>Document-ID</code>
+ * will be mapped to the WebDAV property <code>docid</code>.
+ * </ul>
+ * All WebDAV properties in the sample will have the namespace
+ * <code>http://mycomp.com/namepsaces/webdav</code>.
+ *
+ * <p>The IDs in the <code>DocumentSummaryInformation</code> and <code>SummaryInformation</code>
+ * headers are somewhat mystical. Samples for <code>SummaryInformation</code> are:
+ * <pre>
+ * 1: codepage
+ * 2: title
+ * 3: theme
+ * 4: author
+ * 5: keywords
+ * 6: comments
+ * 7: template (e.g. "Normal.dot")
+ * 8: last author
+ * 9: revision number
+ * 11: last printing date
+ * 12: creation date
+ * 13: last saved date
+ * 14: number of pages
+ * 15: number of words
+ * 16: number of characters
+ * 18: application name (e.g. "Microsoft Word 9.0")
+ * 19:
+ * </pre>
+ * Samples for <code>DocumentSummaryInformation</code> are:
+ * <pre>
+ * 1: codepage
+ * 2: category
+ * 5: number of lines
+ * 6: number of paragraphs
+ * 14: manager
+ * 15: company
+ * </pre>
*/
public class OfficeExtractor extends AbstractPropertyExtractor implements Configurable {
- protected List instructions = new ArrayList();
- protected Map propertyMap = new HashMap();
- static final String CONTENT_TYPE_MS_OFFICE_ALL_CSV = MSWordExtractor.CONTENT_TYPE_WORD_ALL_CSV+","+MSExcelExtractor.CONTENT_TYPE_EXCEL_ALL_CSV+","+MSPowerPointExtractor.CONTENT_TYPE_POWERPOINT_ALL_CSV;
+ // maps SummaryInformation IDs to PropertyNames
+ protected Map propertyMapSI = new HashMap();
+ // maps DocumentSummaryInformation IDs to PropertyNames
+ protected Map propertyMapDSI = new HashMap();
+ // maps labeled properties to PropertyNames
+ protected Map propertyMapLbl = new HashMap();
+
+ static final String CONTENT_TYPE_MS_OFFICE_ALL_CSV =
+ MSWordExtractor.CONTENT_TYPE_WORD_ALL_CSV + "," +
+ MSExcelExtractor.CONTENT_TYPE_EXCEL_ALL_CSV + "," +
+ MSPowerPointExtractor.CONTENT_TYPE_POWERPOINT_ALL_CSV;
public OfficeExtractor(String uri, String contentType, String namespace) {
super(uri, contentType, namespace);
@@ -38,17 +107,17 @@
r.registerListener(listener);
r.read(content);
} catch (Exception e) {
- throw new ExtractorException("Exception while extracting properties in OfficeExtractor");
+ throw new ExtractorException("Exception while extracting properties in OfficeExtractor: " + e);
}
return listener.getProperties();
}
class OfficePropertiesListener implements POIFSReaderListener {
- private HashMap properties = new HashMap();
+ private HashMap extractedProperties = new HashMap();
public Map getProperties() {
- return properties;
+ return extractedProperties;
}
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
@@ -60,22 +129,46 @@
} catch (Exception ex) {
throw new RuntimeException("Property set stream \"" + event.getPath() + event.getName() + "\": " + ex);
}
- String eventName = event.getName().trim();
- final long sectionCount = ps.getSectionCount();
+
+ Map idMap = null;
+
+ if (ps.isDocumentSummaryInformation()) {
+ idMap = propertyMapDSI;
+ } else if (ps.isSummaryInformation()) {
+ idMap = propertyMapSI;
+ } else {
+ // can this happen?
+ idMap = Collections.EMPTY_MAP;
+ }
+
List sections = ps.getSections();
- int nr = 0;
+
for (Iterator i = sections.iterator(); i.hasNext();) {
Section sec = (Section) i.next();
- int propertyCount = sec.getPropertyCount();
- Property[] props = sec.getProperties();
- for (int i2 = 0; i2 < props.length; i2++) {
- Property p = props[i2];
- int id = p.getID();
- long type = p.getType();
- Object value = p.getValue();
- String key = eventName + "-" + nr + "-" + id;
- if ( propertyMap.containsKey(key) ) {
- properties.put(propertyMap.get(key), value);
+ System.out.println("section: " + sec);
+
+ if (sec.getProperty(0) == null) {
+ for(Iterator j = idMap.entrySet().iterator(); j.hasNext();) {
+ Map.Entry e = (Map.Entry)j.next();
+
+ Object propertyValue = sec.getProperty(((Integer)e.getKey()).intValue());
+ if (propertyValue != null) {
+ //System.out.println("\t" + e.getValue() + "=" + propertyValue);
+ extractedProperties.put(e.getValue(), propertyValue);
+ }
+ }
+ } else {
+ Map dict = (Map)sec.getProperty(0);
+ // this section has a dictionary
+ Property property[] = sec.getProperties();
+ for(int j = 0; j < property.length; j++) {
+ //String label = sec.getPIDString(property[j].getID()); TODO why doesn't this work
+ String label = (String)dict.get(new Long(property[j].getID()));
+ PropertyName slideProperty = (PropertyName)propertyMapLbl.get(label);
+ if (slideProperty != null) {
+ //System.out.println("\t" + slideProperty + "=" + property[j].getValue());
+ extractedProperties.put(slideProperty, property[j].getValue());
+ }
}
}
}
@@ -85,10 +178,45 @@
public void configure(Configuration configuration) throws ConfigurationException {
Enumeration instructions = configuration.getConfigurations("instruction");
while (instructions.hasMoreElements()) {
- Configuration extract = (Configuration)instructions.nextElement();
- String property = extract.getAttribute("property");
- String id = extract.getAttribute("id");
- propertyMap.put(id, property);
+ Configuration instruction = (Configuration)instructions.nextElement();
+ PropertyName propertyName = PropertyName.getPropertyName(
+ instruction.getAttribute("property"),
+ instruction.getAttribute("namespace", "DAV:"));
+
+ try {
+ String id = instruction.getAttribute("summary-information", null);
+ if (id != null) {
+ this.propertyMapSI.put(Integer.valueOf(id), propertyName);
+ continue;
+ }
+
+ id = instruction.getAttribute("document-summary-information", null);
+ if (id != null) {
+ this.propertyMapDSI.put(Integer.valueOf(id), propertyName);
+ continue;
+ }
+
+ id = instruction.getAttribute("label", null);
+ if (id != null) {
+ this.propertyMapLbl.put(id, propertyName);
+ continue;
+ }
+
+ // for backward compatibility
+ // old-style id attributes like SummaryInformation-0-4
+ id = instruction.getAttribute("id", null);
+ if (id != null) {
+ Integer intId = Integer.valueOf(id.substring(id.lastIndexOf('-')+1));
+ if (id.startsWith("SummaryInformation")) {
+ this.propertyMapSI.put(intId, propertyName);
+ }
+ if (id.startsWith("DocumentSummaryInformation")) {
+ this.propertyMapDSI.put(intId, propertyName);
+ }
+ }
+ } catch(NumberFormatException e) {
+ throw new ConfigurationException("Invalid instruction: " + e, instruction);
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: slide-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: slide-dev-help@jakarta.apache.org