You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/02/10 18:08:25 UTC
svn commit: r376768 - in /lucene/nutch/trunk: ./ src/plugin/
src/plugin/parse-msexcel/ src/plugin/parse-msexcel/sample/
src/plugin/parse-msexcel/src/ src/plugin/parse-msexcel/src/java/
src/plugin/parse-msexcel/src/java/org/ src/plugin/parse-msexcel/src...
Author: jerome
Date: Fri Feb 10 09:08:23 2006
New Revision: 376768
URL: http://svn.apache.org/viewcvs?rev=376768&view=rev
Log:
NUTCH-52, Add a parser plugin for MS Excel files
Added:
lucene/nutch/trunk/src/plugin/parse-msexcel/
lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (with props)
lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (with props)
lucene/nutch/trunk/src/plugin/parse-msexcel/sample/
lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls (with props)
lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content
lucene/nutch/trunk/src/plugin/parse-msexcel/src/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (with props)
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java (with props)
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java (with props)
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html (with props)
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (with props)
Modified:
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/default.properties
lucene/nutch/trunk/src/plugin/build.xml
Modified: lucene/nutch/trunk/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Feb 10 09:08:23 2006
@@ -254,6 +254,7 @@
<packageset dir="${plugins.dir}/parse-pdf/src/java"/>
<!-- <packageset dir="${plugins.dir}/parse-rtf/src/java"/> plugin excluded from build due to licensing issues-->
<!-- <packageset dir="${plugins.dir}/parse-mp3/src/java"/> plugin excluded from build due to licensing issues-->
+ <packageset dir="${plugins.dir}/parse-msexcel/src/java"/>
<packageset dir="${plugins.dir}/parse-mspowerpoint/src/java"/>
<packageset dir="${plugins.dir}/parse-msword/src/java"/>
<packageset dir="${plugins.dir}/parse-rss/src/java"/>
Modified: lucene/nutch/trunk/default.properties
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Fri Feb 10 09:08:23 2006
@@ -63,6 +63,7 @@
plugin.libhttp=org.apache.nutch.protocol.http.api*
plugin.more=org.apache.nutch.indexer.more*:org.apache.nutch.searcher.more*
plugin.mp3=org.apache.nutch.parse.mp3*
+plugin.msexcel=org.apache.nutch.parse.msexcel*
plugin.mspowerpoint=org.apache.nutch.parse.mspowerpoint*
plugin.msword=org.apache.nutch.parse.msword*
# Unfortunately, ontology on core and plugin uses the same package:
@@ -91,6 +92,7 @@
${plugin.libhttp}:\
${plugin.more}:\
${plugin.mp3}:\
+ ${plugin.msexcel}:\
${plugin.mspowerpoint}:\
${plugin.msword}:\
${plugin.pdf}:\
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Feb 10 09:08:23 2006
@@ -24,6 +24,7 @@
<ant dir="parse-html" target="deploy"/>
<ant dir="parse-js" target="deploy"/>
<!-- <ant dir="parse-mp3" target="deploy"/> -->
+ <ant dir="parse-msexcel" target="deploy"/>
<ant dir="parse-mspowerpoint" target="deploy"/>
<ant dir="parse-msword" target="deploy"/>
<ant dir="parse-pdf" target="deploy"/>
@@ -52,6 +53,7 @@
<ant dir="parse-ext" target="test"/>
<ant dir="parse-html" target="test"/>
<!-- <ant dir="parse-mp3" target="test"/> -->
+ <ant dir="parse-msexcel" target="test"/>
<ant dir="parse-mspowerpoint" target="test"/>
<ant dir="parse-msword" target="test"/>
<ant dir="parse-pdf" target="test"/>
@@ -86,6 +88,7 @@
<ant dir="parse-html" target="clean"/>
<ant dir="parse-js" target="clean"/>
<ant dir="parse-mp3" target="clean"/>
+ <ant dir="parse-msexcel" target="clean"/>
<ant dir="parse-mspowerpoint" target="clean"/>
<ant dir="parse-msword" target="clean"/>
<ant dir="parse-pdf" target="clean"/>
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Fri Feb 10 09:08:23 2006
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<!-- Ant build file for the parse-msexcel plugin; the default target is "jar". -->
+<project name="parse-msexcel" default="jar">
+ <!-- Imports ../build-plugin.xml (presumably the build logic common to all plugins; confirm). -->
+ <import file="../build-plugin.xml" />
+ <!-- Path "plugin.deps": every jar found in ../lib-jakarta-poi/lib. -->
+ <path id="plugin.deps">
+ <fileset dir="../lib-jakarta-poi/lib">
+ <include name="*.jar" />
+ </fileset>
+ </path>
+
+ <!-- for junit test: copy the sample *.xls files into ${build.test}/data. These tasks sit directly under <project>, outside any target. -->
+ <mkdir dir="${build.test}/data" />
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.xls" />
+ </fileset>
+ </copy>
+</project>
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Fri Feb 10 09:08:23 2006
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-msexcel"
+ name="MSExcel Parse Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+ <!-- Library provided by this plugin: parse-msexcel.jar, exported with the wildcard name "*". -->
+ <runtime>
+ <library name="parse-msexcel.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+ <!-- Plugins imported by this one: the extension points and the Jakarta POI library plugin. -->
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-jakarta-poi"/>
+ </requires>
+ <!-- Declares MSExcelParser as an extension of the point org.apache.nutch.parse.Parser. -->
+ <extension id="org.apache.nutch.parse.msexcel"
+ name="MSExcelParser"
+ point="org.apache.nutch.parse.Parser">
+ <!-- The implementation is declared for content type application/vnd.ms-excel and path suffix xls. -->
+ <implementation id="org.apache.nutch.parse.msexcel.MSExcelParser"
+ class="org.apache.nutch.parse.msexcel.MSExcelParser"
+ contentType="application/vnd.ms-excel"
+ pathSuffix="xls"/>
+ </extension>
+
+</plugin>
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls?rev=376768&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content Fri Feb 10 09:08:23 2006
@@ -0,0 +1,3 @@
+BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!!
+
+BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!!
\ No newline at end of file
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java Fri Feb 10 09:08:23 2006
@@ -0,0 +1,132 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Properties;
+
+// Jakarta POI imports
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+/**
+ * Excel Text and Properties extractor.
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ * @author Jérôme Charron
+ */
+public class ExcelExtractor {
+
+
+ public String extractText(InputStream input) throws IOException {
+
+ String resultText = "";
+ HSSFWorkbook wb = new HSSFWorkbook(input);
+ if (wb == null) {
+ return resultText;
+ }
+
+ HSSFSheet sheet;
+ HSSFRow row;
+ HSSFCell cell;
+ int sNum = 0;
+ int rNum = 0;
+ int cNum = 0;
+
+ sNum = wb.getNumberOfSheets();
+
+ for (int i=0; i<sNum; i++) {
+ if ((sheet = wb.getSheetAt(i)) == null) {
+ continue;
+ }
+ rNum = sheet.getLastRowNum();
+ for (int j=0; j<=rNum; j++) {
+ if ((row = sheet.getRow(j)) == null){
+ continue;
+ }
+ cNum = row.getLastCellNum();
+
+ for (int k=0; k<cNum; k++) {
+ if ((cell = row.getCell((short) k)) != null) {
+ /*if(HSSFDateUtil.isCellDateFormatted(cell) == true) {
+ resultText += cell.getDateCellValue().toString() + " ";
+ } else
+ */
+ if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
+ resultText += cell.getStringCellValue() + " ";
+ } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
+ Double d = new Double(cell.getNumericCellValue());
+ resultText += d.toString() + " ";
+ }
+ /* else if(cell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){
+ resultText += cell.getCellFormula() + " ";
+ }
+ */
+ }
+ }
+ }
+ }
+ return resultText;
+ }
+
+
+ public Properties extractProperties(InputStream input) throws IOException {
+
+ PropertiesBroker propertiesBroker = new PropertiesBroker();
+ POIFSReader reader = new POIFSReader();
+ reader.registerListener(new PropertiesReaderListener(propertiesBroker),
+ "\005SummaryInformation");
+ reader.read(input);
+ return propertiesBroker.getProperties();
+ }
+
+
+ class PropertiesBroker {
+
+ private Properties properties;
+ private int timeoutMillis = 2 * 1000;
+
+
+ public synchronized Properties getProperties() {
+
+ long start = new Date().getTime();
+ long now = start;
+
+ while ((properties == null) && (now-start < timeoutMillis)) {
+ try {
+ wait(timeoutMillis / 10);
+ } catch (InterruptedException e) {}
+ now = new Date().getTime();
+ }
+
+ notifyAll();
+ return properties;
+ }
+
+ public synchronized void setProperties(Properties properties) {
+ this.properties = properties;
+ notifyAll();
+ }
+ }
+
+}
+
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java Fri Feb 10 09:08:23 2006
@@ -0,0 +1,125 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * An Excel document parser.
+ *
+ * <p>Text and summary properties are obtained from an {@link ExcelExtractor};
+ * outlinks are looked up in the extracted text.</p>
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ * @author Jérôme Charron
+ */
+public class MSExcelParser implements Parser {
+
+  private Configuration conf;
+
+  private static final Logger LOG = LogFormatter.getLogger(MSExcelParser.class.getName());
+
+  /** Creates a new instance of MSExcelParser */
+  public MSExcelParser() { }
+
+  /**
+   * Parses the raw bytes of an Excel document.
+   *
+   * @param content the fetched content to parse
+   * @return a successful parse holding the extracted text, title, outlinks
+   *         and metadata; or an empty failed parse when the content length
+   *         does not match the <code>Content-Length</code> metadata, or when
+   *         the extraction throws
+   */
+  public Parse getParse(Content content) {
+
+    String text = null;
+    String title = null;
+    Properties properties = null;
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
+      // A length mismatch means the document was cut short while fetching.
+      if ((contentLength != null) &&
+          (raw.length != Integer.parseInt(contentLength))) {
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_TRUNCATED,
+                               "Content truncated at " + raw.length +" bytes. " +
+                               "Parser can't handle incomplete msexcelfile.")
+                               .getEmptyParse(this.conf);
+      }
+
+      ExcelExtractor extractor = new ExcelExtractor();
+      // Extract text
+      text = extractor.extractText(new ByteArrayInputStream(raw));
+      // Extract properties (a fresh stream: the first one has been consumed)
+      properties = extractor.extractProperties(new ByteArrayInputStream(raw));
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+                             "Can't be handled as msexcel document. " + e)
+                             .getEmptyParse(this.conf);
+    }
+
+    // collect meta data
+    Metadata metadata = new Metadata();
+    // The extractor may hand back no properties at all (e.g. a workbook
+    // without summary information); that must not abort the parse.
+    if (properties != null) {
+      // NOTE(review): the listener of this package stores the title under
+      // Metadata.TITLE - confirm it is the same key as DublinCore.TITLE.
+      title = properties.getProperty(DublinCore.TITLE);
+      properties.remove(DublinCore.TITLE);
+      metadata.setAll(properties);
+    }
+
+    if (text == null) { text = ""; }
+    if (title == null) { title = ""; }
+
+    // collect outlink
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf);
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+                                        outlinks, content.getMetadata(),
+                                        metadata);
+    parseData.setConf(this.conf);
+    return new ParseImpl(text, parseData);
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  /**
+   * Sets the configuration used by this parser.
+   *
+   * @param conf the configuration to use
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * @return the configuration previously set, or <code>null</code> if none
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java Fri Feb 10 09:08:23 2006
@@ -0,0 +1,117 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.util.Date;
+import java.util.Properties;
+
+// Jakarta POI imports
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.msexcel.ExcelExtractor.PropertiesBroker;
+
+
+/**
+ * POIFS listener that converts the <code>SummaryInformation</code> of a
+ * document into a {@link Properties} set and publishes it through a
+ * {@link PropertiesBroker}.
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ * @author Jérôme Charron
+ */
+public class PropertiesReaderListener implements POIFSReaderListener {
+
+  private PropertiesBroker propertiesBroker;
+
+  /**
+   * @param propertiesBroker the broker that receives the extracted properties
+   */
+  public PropertiesReaderListener(PropertiesBroker propertiesBroker) {
+    this.propertiesBroker = propertiesBroker;
+  }
+
+  /**
+   * Reads the summary information carried by the event and publishes it.
+   *
+   * <p>String values are stored when non-null and numeric values when
+   * non-zero. If the stream cannot be turned into a
+   * {@link SummaryInformation}, an empty property set is published.</p>
+   *
+   * @param event the reader event whose stream holds the property set
+   */
+  public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+
+    Properties properties = new Properties();
+    SummaryInformation si = null;
+
+    try {
+      si = (SummaryInformation) PropertySetFactory.create(event.getStream());
+    } catch (Exception ex) {
+      // Unreadable or unexpected property set: there is nothing to extract.
+      // Publish the empty set so the consumer is not left waiting, instead of
+      // dereferencing the missing summary below.
+      propertiesBroker.setProperties(properties);
+      return;
+    }
+
+    String title = si.getTitle();
+    String applicationName = si.getApplicationName();
+    String author = si.getAuthor();
+    int charCount = si.getCharCount();
+    String comments = si.getComments();
+    Date createDateTime = si.getCreateDateTime();
+    long editTime = si.getEditTime();
+    String keywords = si.getKeywords();
+    String lastAuthor = si.getLastAuthor();
+    Date lastPrinted = si.getLastPrinted();
+    Date lastSaveDateTime = si.getLastSaveDateTime();
+    int pageCount = si.getPageCount();
+    String revNumber = si.getRevNumber();
+    int security = si.getSecurity();
+    String subject = si.getSubject();
+    String template = si.getTemplate();
+    int wordCount = si.getWordCount();
+
+    /* The last-printed and last-saved dates are stored in millis since the
+       epoch to aid localization; the creation date is formatted with
+       Metadata.DATE_FORMAT. */
+    if (title != null) {
+      properties.setProperty(Metadata.TITLE, title);
+    }
+    if (applicationName != null) {
+      properties.setProperty(Metadata.APPLICATION_NAME, applicationName);
+    }
+    if (author != null) {
+      properties.setProperty(Metadata.AUTHOR, author);
+    }
+    if (charCount != 0) {
+      properties.setProperty(Metadata.CHARACTER_COUNT, charCount + "");
+    }
+    if (comments != null) {
+      properties.setProperty(Metadata.COMMENTS, comments);
+    }
+    if (createDateTime != null) {
+      properties.setProperty(Metadata.DATE,
+                             Metadata.DATE_FORMAT.format(createDateTime));
+    }
+    // NOTE(review): getEditTime() looks like an editing duration rather than
+    // a timestamp - confirm that LAST_MODIFIED is the intended key.
+    if (editTime != 0) {
+      properties.setProperty(Metadata.LAST_MODIFIED, editTime + "");
+    }
+    if (keywords != null) {
+      properties.setProperty(Metadata.KEYWORDS, keywords);
+    }
+    if (lastAuthor != null) {
+      properties.setProperty(Metadata.LAST_AUTHOR, lastAuthor);
+    }
+    if (lastPrinted != null) {
+      properties.setProperty(Metadata.LAST_PRINTED, lastPrinted.getTime() + "");
+    }
+    if (lastSaveDateTime != null) {
+      properties.setProperty(Metadata.LAST_SAVED, lastSaveDateTime.getTime() + "");
+    }
+    if (pageCount != 0) {
+      properties.setProperty(Metadata.PAGE_COUNT, pageCount + "");
+    }
+    if (revNumber != null) {
+      properties.setProperty(Metadata.REVISION_NUMBER, revNumber);
+    }
+    if (security != 0) {
+      properties.setProperty(Metadata.RIGHTS, security + "");
+    }
+    if (subject != null) {
+      properties.setProperty(Metadata.SUBJECT, subject);
+    }
+    if (template != null) {
+      properties.setProperty(Metadata.TEMPLATE, template);
+    }
+    if (wordCount != 0) {
+      properties.setProperty(Metadata.WORD_COUNT, wordCount + "");
+    }
+    propertiesBroker.setProperties(properties);
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html Fri Feb 10 09:08:23 2006
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>An Excel document parsing plugin.</p>
+<p>This package relies on Jakarta <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
+</body>
+</html>
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Fri Feb 10 09:08:23 2006
@@ -0,0 +1,64 @@
+/*
+ * TestMSExcelParser.java
+ * Based on the Unit Tests for MSWordParser by John Xing
+ */
+package org.apache.nutch.parse.msexcel;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit test for the MS Excel parser: fetches each sample file through the
+ * protocol layer, parses it and compares the extracted text with the
+ * expected one.
+ * Based on Unit tests for MSWordParser by John Xing
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class TestMSExcelParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+
+  // Make sure sample files are copied to "test.data"
+
+  private String[] sampleFiles = {"test.xls"};
+
+  private String expectedText = "BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!! ";
+
+  public TestMSExcelParser(String name) {
+    super(name);
+  }
+
+  protected void setUp() {}
+
+  protected void tearDown() {}
+
+  /**
+   * Parses every sample file and checks the extracted text.
+   *
+   * @throws ProtocolException if a sample file cannot be fetched
+   * @throws ParseException if no parser can be obtained for a sample file
+   */
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parser parser;
+    Parse parse;
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = ProtocolFactory.getProtocol(urlString);
+      content = protocol.getContent(urlString);
+
+      parser = ParserFactory.getParser(content.getContentType(), urlString);
+      parse = parser.getParse(content);
+
+      // assertEquals reports both strings on failure, unlike
+      // assertTrue(a.equals(b)) which only reports "false".
+      assertEquals("Extracted text of " + sampleFiles[i],
+                   expectedText, parse.getText());
+    }
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
------------------------------------------------------------------------------
svn:eol-style = native