You are viewing a plain text version of this content. The canonical link for it is here.
Posted to slide-dev@jakarta.apache.org by lu...@apache.org on 2005/04/04 15:46:26 UTC

cvs commit: jakarta-slide/src/share/org/apache/slide/extractor OfficeExtractor.java

luetzkendorf    2005/04/04 06:46:26

  Modified:    src/share/org/apache/slide/extractor OfficeExtractor.java
  Log:
  reworked
   - labeled (userdefined) properties now can be extracted
   - extracted properties now can have namespaces
  
  Revision  Changes    Path
  1.5       +152 -24   jakarta-slide/src/share/org/apache/slide/extractor/OfficeExtractor.java
  
  Index: OfficeExtractor.java
  ===================================================================
  RCS file: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/OfficeExtractor.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- OfficeExtractor.java	14 Jan 2005 18:34:13 -0000	1.4
  +++ OfficeExtractor.java	4 Apr 2005 13:46:25 -0000	1.5
  @@ -1,7 +1,7 @@
   package org.apache.slide.extractor;
   
   import java.io.InputStream;
  -import java.util.ArrayList;
  +import java.util.Collections;
   import java.util.Enumeration;
   import java.util.HashMap;
   import java.util.Iterator;
  @@ -15,17 +15,86 @@
   import org.apache.poi.poifs.eventfilesystem.POIFSReader;
   import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
   import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
  +import org.apache.slide.common.PropertyName;
   import org.apache.slide.util.conf.Configurable;
   import org.apache.slide.util.conf.Configuration;
   import org.apache.slide.util.conf.ConfigurationException;
   
   /**
    * Property extractor for Microsoft office documents.
  + * 
  + * <p>This property extractor extracts properties from <code>SummaryInformation</code> and
  + * <code>DocumentSummaryInformation</code> headers of office documents.
  + * 
  + * <p>Sample configuration:
  + * <pre>
  + *   &lt;extractor classname="org.apache.slide.extractor.OfficeExtractor" uri="/files/docs/">
  + *     &lt;configuration>
  + *       &lt;instruction property="author" namespace="http://mycomp.com/namepsaces/webdav" summary-information="4" />
  + *       &lt;instruction property="application" namespace="http://mycomp.com/namepsaces/webdav" summary-information="18" />
  + *       &lt;instruction property="title" namespace="http://mycomp.com/namepsaces/webdav" summary-information="2" />
  + *       &lt;instruction property="category" namespace="http://mycomp.com/namepsaces/webdav" document-summary-information="2" />
  + *       &lt;instruction property="docid" namespace="http://mycomp.com/namepsaces/webdav" label="Document-ID" />
  + *     &lt;/configuration>
  + *   &lt;/extractor>
  + * </pre>
  + * The sample configuration
  + * <ul> 
  + * <li>maps the <em>author</em> info of office documents to the <code>author</code> 
  + * property. The author info can be found in the <code>SummaryInformation</code> header and
  + * has the <code>id</code> 4.
  + * <li>and maps the <em>category</em> entry of the <code>DocumentSummaryInformation</code> header, 
  + * which has the <code>id</code> 2 to the WebDAV property <code>category</code>.
  + * <li><code>SummaryInformation</code> headers can also contain "labeled" entries, e.g. for user
  + * defined metadata. In the sample the labeled entries with the label <code>Document-ID</code>
  + * will be mapped to the WebDAV-Property <code>docid</code>.
  + * </ul>
  + * All WebDAV properties in the sample will have the namespace 
  + * <code>http://mycomp.com/namepsaces/webdav</code>.
  + * 
  + * <p>The IDs in the <code>DocumentSummaryInformation</code> and <code>SummaryInformation</code>
  + * headers are somewhat mystical. Samples for <code>SummaryInformation</code> are:
  + * <pre>
  + *    1: codepage
  + *    2: title
  + *    3: subject
  + *    4: author
  + *    5: keywords
  + *    6: comments
  + *    7: template (e.g. "Normal.dot")
  + *    8: last author
  + *    9: revision number
  + *   11: last printing date
  + *   12: creation date
  + *   13: last saved date
  + *   14: number of pages
  + *   15: number of words
  + *   16: number of characters
  + *   18: application name (e.g. "Microsoft Word 9.0")
  + *   19: security
  + * </pre>
  + * Samples for <code>DocumentSummaryInformation</code> are:
  + * <pre>
  + *    1: codepage
  + *    2: category
  + *    5: number of lines
  + *    6: number of paragraphs
  + *   14: manager
  + *   15: company
  + * </pre>
    */
   public class OfficeExtractor extends AbstractPropertyExtractor implements Configurable {
  -	protected List instructions = new ArrayList();
  -	protected Map propertyMap = new HashMap();
  -	static final String CONTENT_TYPE_MS_OFFICE_ALL_CSV = MSWordExtractor.CONTENT_TYPE_WORD_ALL_CSV+","+MSExcelExtractor.CONTENT_TYPE_EXCEL_ALL_CSV+","+MSPowerPointExtractor.CONTENT_TYPE_POWERPOINT_ALL_CSV;
  +	// maps SummaryInformation IDs to PropertyNames 
  +	protected Map propertyMapSI = new HashMap();
  +	// maps DocumentSummaryInformation IDs to PropertyNames
  +	protected Map propertyMapDSI = new HashMap();
  +	// maps labeled properties to PropertyNames
  +	protected Map propertyMapLbl = new HashMap();
  +	
  +	static final String CONTENT_TYPE_MS_OFFICE_ALL_CSV = 
  +		MSWordExtractor.CONTENT_TYPE_WORD_ALL_CSV + "," +
  +		MSExcelExtractor.CONTENT_TYPE_EXCEL_ALL_CSV + "," +
  +		MSPowerPointExtractor.CONTENT_TYPE_POWERPOINT_ALL_CSV;
   	
   	public OfficeExtractor(String uri, String contentType, String namespace) {
   		super(uri, contentType, namespace);
  @@ -38,17 +107,17 @@
   			r.registerListener(listener);
   			r.read(content);
   		} catch (Exception e) {
  -			throw new ExtractorException("Exception while extracting properties in OfficeExtractor");
  +			throw new ExtractorException("Exception while extracting properties in OfficeExtractor: " + e);
   		}
   		return listener.getProperties();
   	}
   
   	class OfficePropertiesListener implements POIFSReaderListener {
   
  -		private HashMap properties = new HashMap();
  +		private HashMap extractedProperties = new HashMap();
   
   		public Map getProperties() {
  -				return properties;
  +			return extractedProperties;
   		}
   
   		public void processPOIFSReaderEvent(POIFSReaderEvent event) {
  @@ -60,22 +129,46 @@
   			} catch (Exception ex) {
   				throw new RuntimeException("Property set stream \"" + event.getPath() + event.getName() + "\": " + ex);
   			}
  -			String eventName = event.getName().trim();
  -			final long sectionCount = ps.getSectionCount();
  +
  +			Map idMap = null;
  +			
  +			if (ps.isDocumentSummaryInformation()) {
  +				idMap = propertyMapDSI;
  +			} else if (ps.isSummaryInformation()) {
  +				idMap = propertyMapSI;
  +			} else {
  +				// can this happen?
  +				idMap = Collections.EMPTY_MAP;
  +			}
  +			
   			List sections = ps.getSections();
  -			int nr = 0;
  +
   			for (Iterator i = sections.iterator(); i.hasNext();) {
   				Section sec = (Section) i.next();
  -				int propertyCount = sec.getPropertyCount();
  -				Property[] props = sec.getProperties();
  -				for (int i2 = 0; i2 < props.length; i2++) {
  -					Property p = props[i2];
  -					int id = p.getID();
  -					long type = p.getType();
  -					Object value = p.getValue();
  -					String key = eventName + "-" + nr + "-" + id; 
  -					if ( propertyMap.containsKey(key) ) {
  -						properties.put(propertyMap.get(key), value);
  +				System.out.println("section: " + sec);
  +				
  +				if (sec.getProperty(0) == null) {
  +					for(Iterator j = idMap.entrySet().iterator(); j.hasNext();) {
  +						Map.Entry e = (Map.Entry)j.next();
  +						
  +						Object propertyValue = sec.getProperty(((Integer)e.getKey()).intValue());
  +						if (propertyValue != null) {
  +							//System.out.println("\t" + e.getValue() + "=" + propertyValue);
  +							extractedProperties.put(e.getValue(), propertyValue);
  +						}
  +					}
  +				} else {
  +					Map dict = (Map)sec.getProperty(0);
  +					// this section has a dictionary
  +					Property property[] = sec.getProperties();
  +					for(int j = 0; j < property.length; j++) {
  +						//String label = sec.getPIDString(property[j].getID()); TODO why doesn't this work
  +						String label = (String)dict.get(new Long(property[j].getID()));
  +						PropertyName slideProperty = (PropertyName)propertyMapLbl.get(label);
  +						if (slideProperty != null) {
  +							//System.out.println("\t" + slideProperty + "=" + property[j].getValue());
  +							extractedProperties.put(slideProperty, property[j].getValue());
  +						}
   					}
   				}
   			}
  @@ -85,10 +178,45 @@
   	public void configure(Configuration configuration) throws ConfigurationException {
           Enumeration instructions = configuration.getConfigurations("instruction");
           while (instructions.hasMoreElements()) {
  -            Configuration extract = (Configuration)instructions.nextElement();
  -            String property = extract.getAttribute("property");
  -            String id = extract.getAttribute("id");
  -			propertyMap.put(id, property);
  +            Configuration instruction = (Configuration)instructions.nextElement();
  +            PropertyName propertyName = PropertyName.getPropertyName(
  +            		instruction.getAttribute("property"),
  +					instruction.getAttribute("namespace", "DAV:"));
  +            
  +            try {
  +	            String id = instruction.getAttribute("summary-information", null);
  +	            if (id != null) {
  +	            	this.propertyMapSI.put(Integer.valueOf(id), propertyName);
  +	            	continue;
  +	            }
  +	            
  +	            id = instruction.getAttribute("document-summary-information", null);
  +	            if (id != null) {
  +	            	this.propertyMapDSI.put(Integer.valueOf(id), propertyName);
  +	            	continue;
  +	            }
  +	            
  +	            id = instruction.getAttribute("label", null);
  +	            if (id != null) {
  +	            	this.propertyMapLbl.put(id, propertyName);
  +	            	continue;
  +	            }
  +	            
  +	            // for backward compatibility
  +	            // old style id attributes like SummaryInformation-0-4
  +	            id = instruction.getAttribute("id", null);
  +	            if (id != null) {
  +	            	Integer intId = Integer.valueOf(id.substring(id.lastIndexOf('-')+1));
  +	            	if (id.startsWith("SummaryInformation")) {
  +	            		this.propertyMapSI.put(intId, propertyName);
  +	            	}
  +	            	if (id.startsWith("DocumentSummaryInformation")) {
  +	            		this.propertyMapDSI.put(intId, propertyName);
  +	            	}
  +	            }
  +            } catch(NumberFormatException e) {
  +            	throw new ConfigurationException("Invalid instruction: " + e, instruction);
  +            }
           }
   	}
   	
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: slide-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: slide-dev-help@jakarta.apache.org