You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/02/10 18:08:25 UTC

svn commit: r376768 - in /lucene/nutch/trunk: ./ src/plugin/ src/plugin/parse-msexcel/ src/plugin/parse-msexcel/sample/ src/plugin/parse-msexcel/src/ src/plugin/parse-msexcel/src/java/ src/plugin/parse-msexcel/src/java/org/ src/plugin/parse-msexcel/src...

Author: jerome
Date: Fri Feb 10 09:08:23 2006
New Revision: 376768

URL: http://svn.apache.org/viewcvs?rev=376768&view=rev
Log:
NUTCH-52, Add a parser plugin for MS Excel files

Added:
    lucene/nutch/trunk/src/plugin/parse-msexcel/
    lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/sample/
    lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java   (with props)
Modified:
    lucene/nutch/trunk/build.xml
    lucene/nutch/trunk/default.properties
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Feb 10 09:08:23 2006
@@ -254,6 +254,7 @@
     	<packageset dir="${plugins.dir}/parse-pdf/src/java"/>
 <!--	<packageset dir="${plugins.dir}/parse-rtf/src/java"/> plugin excluded from build due to licensing issues-->
 <!--	<packageset dir="${plugins.dir}/parse-mp3/src/java"/> plugin excluded from build due to licensing issues-->
+    	<packageset dir="${plugins.dir}/parse-msexcel/src/java"/>
     	<packageset dir="${plugins.dir}/parse-mspowerpoint/src/java"/>
     	<packageset dir="${plugins.dir}/parse-msword/src/java"/>
     	<packageset dir="${plugins.dir}/parse-rss/src/java"/>

Modified: lucene/nutch/trunk/default.properties
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Fri Feb 10 09:08:23 2006
@@ -63,6 +63,7 @@
 plugin.libhttp=org.apache.nutch.protocol.http.api*
 plugin.more=org.apache.nutch.indexer.more*:org.apache.nutch.searcher.more*
 plugin.mp3=org.apache.nutch.parse.mp3*
+plugin.msexcel=org.apache.nutch.parse.msexcel*
 plugin.mspowerpoint=org.apache.nutch.parse.mspowerpoint*
 plugin.msword=org.apache.nutch.parse.msword*
 # Unfortunately, ontology on core and plugin uses the same package:
@@ -91,6 +92,7 @@
    ${plugin.libhttp}:\
    ${plugin.more}:\
    ${plugin.mp3}:\
+   ${plugin.msexcel}:\
    ${plugin.mspowerpoint}:\
    ${plugin.msword}:\
    ${plugin.pdf}:\

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Feb 10 09:08:23 2006
@@ -24,6 +24,7 @@
      <ant dir="parse-html" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
      <!-- <ant dir="parse-mp3" target="deploy"/> -->
+     <ant dir="parse-msexcel" target="deploy"/>
      <ant dir="parse-mspowerpoint" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
      <ant dir="parse-pdf" target="deploy"/>
@@ -52,6 +53,7 @@
      <ant dir="parse-ext" target="test"/>
      <ant dir="parse-html" target="test"/>
      <!-- <ant dir="parse-mp3" target="test"/> -->
+     <ant dir="parse-msexcel" target="test"/>
      <ant dir="parse-mspowerpoint" target="test"/>
      <ant dir="parse-msword" target="test"/>
      <ant dir="parse-pdf" target="test"/>
@@ -86,6 +88,7 @@
     <ant dir="parse-html" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-mp3" target="clean"/>
+    <ant dir="parse-msexcel" target="clean"/>
     <ant dir="parse-mspowerpoint" target="clean"/>
     <ant dir="parse-msword" target="clean"/>
     <ant dir="parse-pdf" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Fri Feb 10 09:08:23 2006
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+
+<project name="parse-msexcel" default="jar">
+
+	<import file="../build-plugin.xml" />
+
+  <path id="plugin.deps">
+    <fileset dir="../lib-jakarta-poi/lib">
+      <include name="*.jar" />
+    </fileset>
+  </path>
+
+	<!-- for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.xls" />
+		</fileset>
+	</copy>
+</project>

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Fri Feb 10 09:08:23 2006
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-msexcel"
+   name="MSExcel Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-msexcel.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+     <import plugin="nutch-extensionpoints"/>
+     <import plugin="lib-jakarta-poi"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.msexcel"
+              name="MSExcelParser" 
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.msexcel.MSExcelParser"
+                      class="org.apache.nutch.parse.msexcel.MSExcelParser" 
+                      contentType="application/vnd.ms-excel"
+                      pathSuffix="xls"/>
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls?rev=376768&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content Fri Feb 10 09:08:23 2006
@@ -0,0 +1,3 @@
+BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!! 
+
+BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!!
\ No newline at end of file

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java Fri Feb 10 09:08:23 2006
@@ -0,0 +1,132 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Properties;
+
+// Jakarta POI imports
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+/**
+ * Excel Text and Properties extractor.
+ *
+ * <p>Text is read from the workbook cells through the HSSF user model;
+ * document properties are read from the <code>SummaryInformation</code>
+ * stream through a {@link POIFSReader} listener.</p>
+ *
+ * @author Rohit Kulkarni &amp; Ashish Vaidya
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class ExcelExtractor {
+
+  /**
+   * Extracts the text of every sheet of an Excel workbook.
+   *
+   * <p>Only string and numeric cells are taken into account; cells of any
+   * other type (formulas for instance) are skipped. Each value is followed
+   * by a single space, and numeric values are rendered the way
+   * <code>Double.toString</code> renders them (e.g. <code>1234.0</code>).</p>
+   *
+   * @param input the stream to read the workbook from.
+   * @return the concatenated cell values, or an empty string if no string
+   *         or numeric cell was found.
+   * @throws IOException if the workbook cannot be read.
+   */
+  public String extractText(InputStream input) throws IOException {
+
+    HSSFWorkbook wb = new HSSFWorkbook(input);
+
+    // Accumulate into a buffer: repeated String concatenation copies the
+    // whole text for every cell, which is quadratic on large sheets.
+    StringBuffer resultText = new StringBuffer();
+
+    int sNum = wb.getNumberOfSheets();
+    for (int i=0; i<sNum; i++) {
+      HSSFSheet sheet = wb.getSheetAt(i);
+      if (sheet == null) {
+        continue;
+      }
+      // getLastRowNum() is used as an inclusive index here
+      int rNum = sheet.getLastRowNum();
+      for (int j=0; j<=rNum; j++) {
+        HSSFRow row = sheet.getRow(j);
+        if (row == null) {
+          continue;
+        }
+        // getLastCellNum() is used as an exclusive bound here
+        int cNum = row.getLastCellNum();
+        for (int k=0; k<cNum; k++) {
+          HSSFCell cell = row.getCell((short) k);
+          if (cell == null) {
+            continue;
+          }
+          // NOTE: date-formatted cells and formula cells get no specific
+          // handling: the former are rendered as plain numbers, the latter
+          // are ignored.
+          if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
+            resultText.append(cell.getStringCellValue()).append(' ');
+          } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
+            resultText.append(Double.toString(cell.getNumericCellValue()))
+                      .append(' ');
+          }
+        }
+      }
+    }
+    return resultText.toString();
+  }
+
+
+  /**
+   * Extracts the document properties stored in the
+   * <code>SummaryInformation</code> stream of an Excel document.
+   *
+   * @param input the stream to read the document from.
+   * @return the properties published by the {@link PropertiesReaderListener},
+   *         or <code>null</code> if none was published before the broker
+   *         stopped waiting.
+   * @throws IOException if the document cannot be read.
+   */
+  public Properties extractProperties(InputStream input) throws IOException {
+
+    PropertiesBroker propertiesBroker = new PropertiesBroker();
+    POIFSReader reader = new POIFSReader();
+    reader.registerListener(new PropertiesReaderListener(propertiesBroker),
+                            "\005SummaryInformation");
+    reader.read(input);
+    return propertiesBroker.getProperties();
+  }
+
+
+  /**
+   * Hand-over point between the {@link PropertiesReaderListener}, which
+   * publishes the properties, and the caller that waits for them.
+   */
+  class PropertiesBroker {
+
+    private Properties properties;
+    // Maximum time spent waiting for the properties to be published
+    private int timeoutMillis = 2 * 1000;
+
+
+    /**
+     * Waits until some properties are published, the timeout expires or the
+     * calling thread is interrupted.
+     *
+     * @return the published properties, or <code>null</code> if nothing was
+     *         published in time.
+     */
+    public synchronized Properties getProperties() {
+
+      long start = System.currentTimeMillis();
+      long now = start;
+
+      while ((properties == null) && (now-start < timeoutMillis)) {
+        try {
+          wait(timeoutMillis / 10);
+        } catch (InterruptedException e) {
+          // Stop waiting, but keep the interruption visible to the caller
+          // instead of silently discarding it.
+          Thread.currentThread().interrupt();
+          break;
+        }
+        now = System.currentTimeMillis();
+      }
+
+      notifyAll();
+      return properties;
+    }
+
+    /**
+     * Publishes the properties and wakes up any thread waiting for them.
+     *
+     * @param properties the properties to hand over.
+     */
+    public synchronized void setProperties(Properties properties) {
+      this.properties = properties;
+      notifyAll();
+    }
+  }
+
+}
+

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java Fri Feb 10 09:08:23 2006
@@ -0,0 +1,125 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * An Excel document parser.
+ *
+ * <p>Delegates the extraction of the text and of the document properties to
+ * an {@link ExcelExtractor}.</p>
+ *
+ * @author Rohit Kulkarni &amp; Ashish Vaidya
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class MSExcelParser implements Parser {
+
+  private Configuration conf;
+
+  private static final Logger LOG = LogFormatter.getLogger(MSExcelParser.class.getName());
+
+  /** Creates a new instance of MSExcelParser */
+  public MSExcelParser() { }
+
+  /**
+   * Parses the content of an Excel document.
+   *
+   * @param content the fetched document.
+   * @return a parse holding the extracted text, title, outlinks and document
+   *         properties; an empty parse with a failed status if the content
+   *         is truncated or cannot be handled as an Excel document.
+   */
+  public Parse getParse(Content content) {
+
+    String text = null;
+    Properties properties = null;
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
+      if ((contentLength != null) &&
+          (raw.length != Integer.parseInt(contentLength))) {
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_TRUNCATED,
+                               "Content truncated at " + raw.length +" bytes. " +
+                               "Parser can't handle incomplete msexcelfile.")
+                               .getEmptyParse(this.conf);
+      }
+
+      ExcelExtractor extractor = new ExcelExtractor();
+      // Each extraction consumes its own stream over the raw bytes
+      text = extractor.extractText(new ByteArrayInputStream(raw));
+      properties = extractor.extractProperties(new ByteArrayInputStream(raw));
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+                             "Can't be handled as msexcel document. " + e)
+                             .getEmptyParse(this.conf);
+    }
+
+    // collect meta data
+    Metadata metadata = new Metadata();
+    String title = null;
+    // extractProperties() hands back null when its broker gives up waiting
+    // for the summary information: the document is then parsed without
+    // title and properties instead of failing with a NullPointerException.
+    if (properties != null) {
+      title = properties.getProperty(DublinCore.TITLE);
+      properties.remove(DublinCore.TITLE);
+      metadata.setAll(properties);
+    }
+
+    if (text == null) { text = ""; }
+    if (title == null) { title = ""; }
+
+    // collect outlink
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf);
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+                                        outlinks, content.getMetadata(),
+                                        metadata);
+    parseData.setConf(this.conf);
+    return new ParseImpl(text, parseData);
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  /**
+   * Sets the configuration used by this parser.
+   *
+   * @param conf the configuration to use.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * @return the configuration used by this parser.
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java Fri Feb 10 09:08:23 2006
@@ -0,0 +1,117 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.util.Date;
+import java.util.Properties;
+
+// Jakarta POI imports
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.msexcel.ExcelExtractor.PropertiesBroker;
+
+
+/**
+ * POIFS listener that converts the <code>SummaryInformation</code> stream of
+ * a document into {@link Properties} and publishes them through a
+ * {@link PropertiesBroker}.
+ *
+ * @author Rohit Kulkarni &amp; Ashish Vaidya
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class PropertiesReaderListener implements POIFSReaderListener {
+
+    private PropertiesBroker propertiesBroker;
+
+    /**
+     * @param propertiesBroker the broker the extracted properties are
+     *        published to.
+     */
+    public PropertiesReaderListener(PropertiesBroker propertiesBroker) {
+      this.propertiesBroker = propertiesBroker;
+    }
+
+    /**
+     * Reads the summary information carried by the event and publishes it to
+     * the broker. If the stream cannot be read as a summary information, an
+     * empty set of properties is published so that the broker's consumer is
+     * released right away.
+     *
+     * @param event the POIFS event giving access to the stream to read.
+     */
+    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+
+      Properties properties = new Properties();
+      SummaryInformation si = null;
+
+      try {
+        si = (SummaryInformation)PropertySetFactory.create(event.getStream());
+      } catch (Exception ex) {
+        // Unreadable stream, or a property set that is not a summary
+        // information: handled below as "no property available".
+        si = null;
+      }
+
+      if (si == null) {
+        propertiesBroker.setProperties(properties);
+        return;
+      }
+
+      // Textual properties: stored only when present
+      setString(properties, Metadata.TITLE, si.getTitle());
+      setString(properties, Metadata.APPLICATION_NAME, si.getApplicationName());
+      setString(properties, Metadata.AUTHOR, si.getAuthor());
+      setString(properties, Metadata.COMMENTS, si.getComments());
+      setString(properties, Metadata.KEYWORDS, si.getKeywords());
+      setString(properties, Metadata.LAST_AUTHOR, si.getLastAuthor());
+      setString(properties, Metadata.REVISION_NUMBER, si.getRevNumber());
+      setString(properties, Metadata.SUBJECT, si.getSubject());
+      setString(properties, Metadata.TEMPLATE, si.getTemplate());
+
+      // Numeric properties: a value of 0 is treated as "not set"
+      setNumber(properties, Metadata.CHARACTER_COUNT, si.getCharCount());
+      setNumber(properties, Metadata.LAST_MODIFIED, si.getEditTime());
+      setNumber(properties, Metadata.PAGE_COUNT, si.getPageCount());
+      setNumber(properties, Metadata.RIGHTS, si.getSecurity());
+      setNumber(properties, Metadata.WORD_COUNT, si.getWordCount());
+
+      // The creation date is stored formatted with Metadata.DATE_FORMAT,
+      // whereas the last printed / last saved dates are stored in millis
+      // since the epoch to aid localization.
+      Date createDateTime = si.getCreateDateTime();
+      if (createDateTime != null) {
+        properties.setProperty(Metadata.DATE,
+                               Metadata.DATE_FORMAT.format(createDateTime));
+      }
+      setMillis(properties, Metadata.LAST_PRINTED, si.getLastPrinted());
+      setMillis(properties, Metadata.LAST_SAVED, si.getLastSaveDateTime());
+
+      propertiesBroker.setProperties(properties);
+    }
+
+    /** Stores the value under the key unless the value is null. */
+    private static void setString(Properties properties,
+                                  String key, String value) {
+      if (value != null) {
+        properties.setProperty(key, value);
+      }
+    }
+
+    /** Stores the decimal form of the value under the key unless it is 0. */
+    private static void setNumber(Properties properties,
+                                  String key, long value) {
+      if (value != 0) {
+        properties.setProperty(key, String.valueOf(value));
+      }
+    }
+
+    /** Stores the date, in millis since the epoch, unless it is null. */
+    private static void setMillis(Properties properties,
+                                  String key, Date value) {
+      if (value != null) {
+        properties.setProperty(key, String.valueOf(value.getTime()));
+      }
+    }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html Fri Feb 10 09:08:23 2006
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>An Excel document parsing plugin.</p>
+<p>This package relies on Jakarta <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
+</body>
+</html>

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Fri Feb 10 09:08:23 2006
@@ -0,0 +1,64 @@
+/*
+ *  TestMSExcelParser.java 
+ *  Based on the Unit Tests for MSWordParser by John Xing
+ */
+package org.apache.nutch.parse.msexcel;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit test for the MSExcelParser, based on the unit tests for MSWordParser
+ * by John Xing.
+ *
+ * @author Rohit Kulkarni &amp; Ashish Vaidya
+ */
+public class TestMSExcelParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+
+  // Make sure sample files are copied to "test.data"
+
+  private String[] sampleFiles = {"test.xls"};
+
+  private String expectedText = "BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!! ";
+
+  public TestMSExcelParser(String name) {
+    super(name);
+  }
+
+  protected void setUp() {}
+
+  protected void tearDown() {}
+
+  /**
+   * Fetches and parses each sample file and checks that the extracted text
+   * is the expected one.
+   */
+  public void testIt() throws ProtocolException, ParseException {
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Protocol protocol = ProtocolFactory.getProtocol(urlString);
+      Content content = protocol.getContent(urlString);
+
+      Parser parser = ParserFactory.getParser(content.getContentType(), urlString);
+      Parse parse = parser.getParse(content);
+
+      // assertEquals reports both the expected and the actual text on failure
+      assertEquals(expectedText, parse.getText());
+    }
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
------------------------------------------------------------------------------
    svn:eol-style = native