You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/09/02 17:56:39 UTC

svn commit: r267226 - in /lucene/nutch/trunk/src/plugin: ./ parse-mspowerpoint/ parse-mspowerpoint/lib/ parse-mspowerpoint/sample/ parse-mspowerpoint/src/ parse-mspowerpoint/src/java/ parse-mspowerpoint/src/java/org/ parse-mspowerpoint/src/java/org/apa...

Author: jerome
Date: Fri Sep  2 08:55:47 2005
New Revision: 267226

URL: http://svn.apache.org/viewcvs?rev=267226&view=rev
Log:
NUTCH-21, Added parser plugin for MS PowerPoint slides (Stephan Strittmatter)

Added:
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-2.5.1-final-20040804.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-scratchpad-2.5.1-final-20040804.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html   (with props)
Modified:
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=267226&r1=267225&r2=267226&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Sep  2 08:55:47 2005
@@ -17,6 +17,7 @@
      <ant dir="parse-pdf" target="deploy"/>
      <ant dir="parse-rss" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
+     <ant dir="parse-mspowerpoint" target="deploy"/>
 <!-- <ant dir="parse-mp3" target="deploy"/> -->
 <!-- <ant dir="parse-rtf" target="deploy"/> -->
      <ant dir="parse-ext" target="deploy"/>
@@ -43,6 +44,7 @@
      <ant dir="parse-pdf" target="test"/>
      <ant dir="parse-rss" target="test"/>
      <ant dir="parse-msword" target="test"/>
+     <ant dir="parse-mspowerpoint" target="test"/>
  <!-- <ant dir="parse-mp3" target="test"/> -->
  <!-- <ant dir="parse-rtf" target="test"/> -->
      <ant dir="parse-ext" target="test"/>
@@ -66,6 +68,7 @@
     <ant dir="parse-pdf" target="clean"/>
     <ant dir="parse-rss" target="clean"/>
     <ant dir="parse-msword" target="clean"/>
+    <ant dir="parse-mspowerpoint" target="clean"/>
     <ant dir="parse-mp3" target="clean"/>
     <ant dir="parse-rtf" target="clean"/>
     <ant dir="parse-ext" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt Fri Sep  2 08:55:47 2005
@@ -0,0 +1,8 @@
+Plugin to support parsing of MS PowerPoint files.
+
+Contributed by Stephan Strittmatter <St...@sybit.de>. 
+
+Note:
+======
+Parsing MS PowerPoint files requires the complete file stream.
+Please check the property <protocol>.content.limit in nutch-default.xml.
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml Fri Sep  2 08:55:47 2005
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+
+<project name="parse-mspowerpoint" default="jar">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.ppt" />
+			<include name="*.content" />
+			<include name="*.meta" />	
+		</fileset>
+	</copy>
+	
+	  <!-- ================================================================== -->
+	  <!-- Run unit tests                                                     --> 
+	  <!-- ================================================================== -->
+	  <target name="test" depends="compile-test, deploy" if="test.available">
+	    <echo message="Testing plugin: ${name}"/>
+
+	    <junit printsummary="yes" haltonfailure="no" fork="yes"
+	      errorProperty="tests.failed" failureProperty="tests.failed">
+	      <sysproperty key="test.data" value="${build.test}/data"/>
+	      <sysproperty key="test.input" value="${root}/data"/>
+	      <classpath refid="test.classpath"/>
+	      <formatter type="plain" />
+	      <batchtest todir="${build.test}" unless="testcase">
+	        <fileset dir="${src.test}"
+	                 includes="**/AllTests.java" excludes="**/${test.exclude}.java" />
+	      </batchtest>
+	      <batchtest todir="${build.test}" if="testcase">
+	        <fileset dir="${src.test}" includes="**/${testcase}.java"/>
+	      </batchtest>
+	    </junit>
+
+	    <fail if="tests.failed">Tests failed!</fail>
+
+	  </target>  
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-2.5.1-final-20040804.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-2.5.1-final-20040804.jar?rev=267226&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-2.5.1-final-20040804.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-scratchpad-2.5.1-final-20040804.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-scratchpad-2.5.1-final-20040804.jar?rev=267226&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-scratchpad-2.5.1-final-20040804.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml Fri Sep  2 08:55:47 2005
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-mspowerpoint"
+   name="MSPowerPoint Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-mspowerpoint.jar">
+         <export name="*"/>
+      </library>
+      <library name="poi-2.5.1-final-20040804.jar"/>
+      <library name="poi-scratchpad-2.5.1-final-20040804.jar"/>
+   </runtime>
+
+   <extension id="net.nutch.parse.mspowerpoint"
+              name="MSPowerPointParse" 
+              point="org.apache.nutch.parse.Parser">
+      <implementation id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" 
+                      class="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" 
+                      contentType="application/vnd.ms-powerpoint"
+                      pathSuffix=""/>
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt Fri Sep  2 08:55:47 2005
@@ -0,0 +1,6 @@
+Directory to store PowerPoint samples.
+If there is a file "test.ppt", the test case searches for a reference file "test.ppt.content"
+to compare the extracted content.
+Additionally, a file "test.ppt.meta" can be stored here to compare the meta information.
+
+-- Stephan Strittmatter

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt?rev=267226&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content Fri Sep  2 08:55:47 2005
@@ -0,0 +1,19 @@
+
+Nutch Parser Test 
+My initial test file for the PowerPoint parser of nutch 
+Second page 
+Test
+Of
+PowerPoint
+Extraction 
+Some Unicode 
+I do not know the content and I can not read it, just gathered from other ppt-files:
+Презентация PowerPoint
+
+Stephan Strittmatter │ル│ル│ル│ル Some Notes
+│ル│ルTextmasterformate durch Klicken bearbeiten
+Zweite Ebene
+Dritte Ebene
+Vierte Ebene
+Fnfte Ebene
+│ル│ル Notes of second page

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta Fri Sep  2 08:55:47 2005
@@ -0,0 +1,2 @@
+Title: Test
+Outlinks: 0

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,416 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Vector;
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.poi.hdf.extractor.Utils;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
+
+/**
+ * Listener that reads the content of a PowerPoint file and transfers it to
+ * the passed <code>StringBuffer</code>.
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ * 
+ */
+class ContentReaderListener implements POIFSReaderListener {
+
+  private static final Logger LOG = LogFormatter
+      .getLogger(ContentReaderListener.class.getName());
+
+  /** Buffer holding the content of the file */
+  protected final transient StringBuffer buf;
+
+  /**
+   * Constructs a listener that collects the text content of a PowerPoint file.
+   * 
+   * @param content
+   *          StringBuffer that receives the extracted text. The reference is
+   *          stored as is (no copy is made), so the text is appended to the
+   *          very instance passed in by the caller.
+   */
+  public ContentReaderListener(final StringBuffer content) {
+    this.buf = content;
+  }
+
+  /**
+   * Reads the internal PowerPoint document stream and appends the text of all
+   * extracted slides to the content buffer.
+   * <p>
+   * Events that are <code>null</code>, have no name, or whose name does not
+   * start with {@link PPTConstants#POWERPOINT_DOCUMENT} are skipped with a
+   * warning. Every error raised during the extraction is logged and swallowed
+   * so that a single broken file cannot abort a complete crawl.
+   * 
+   * @param event
+   *          event carrying the name and the stream of the document entry
+   * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
+   */
+  public void processPOIFSReaderEvent(final POIFSReaderEvent event) {
+
+    // Resolve the stream name once so that a null event cannot cause a
+    // NullPointerException while the warning message is being built.
+    final String streamName = (event == null) ? null : event.getName();
+    if (streamName == null
+        || !streamName.startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
+      LOG.warning("Stream not processed. It is not a PowerPoint document: : "
+          + streamName);
+      return;
+    }
+
+    try {
+      final DocumentInputStream dis = event.getStream();
+      final byte pptdata[] = new byte[dis.available()];
+      dis.read(pptdata, 0, pptdata.length);
+      int offset = 0;
+      long offsetPD = 0;
+
+      // Text boxes collected so far; filled by extractTextBoxes().
+      Hashtable/* <Long, TextBox> */containerTextBox = new Hashtable();
+
+      /*
+       * Traverse the byte array position by position to find the
+       * UserEditAtom records. The last field read below is a 4 byte value at
+       * i + 20, therefore the scan has to stop 24 bytes before the end of the
+       * data. A looser bound lets a match close to the end raise an
+       * ArrayIndexOutOfBoundsException that discards the whole document.
+       */
+      final int scanEnd = pptdata.length - 23;
+      for (int i = 0; i < scanEnd; i++) {
+
+        final long type = LittleEndian.getUShort(pptdata, i + 2);
+        if (PPTConstants.PPT_ATOM_USEREDIT != type) {
+          // every other record type is of no interest for this scan
+          continue;
+        }
+
+        // Fields of the UserEditAtom that bound the following extractions.
+        offset = (int) LittleEndian.getUInt(pptdata, i + 16);
+        offsetPD = LittleEndian.getUInt(pptdata, i + 20);
+
+        // Extract the ClientTextBox text belonging to this UserEditAtom.
+        containerTextBox = extractTextBoxes(containerTextBox, offset,
+            pptdata, offsetPD);
+      }
+
+      // The values of the last UserEditAtom found are used for the slides.
+      final List/* <Slide> */slides = extractSlides(offset, pptdata,
+          offsetPD);
+
+      if (slides.isEmpty()) {
+        LOG.info("No slides extracted!");
+        return;
+      }
+
+      // The content of all text boxes is attached to the last slide.
+      final Slide lastSlide = (Slide) slides.get(slides.size() - 1);
+      for (Enumeration boxes = containerTextBox.elements(); boxes
+          .hasMoreElements();) {
+        final TextBox textBox = (TextBox) boxes.nextElement();
+        lastSlide.addContent(textBox.getContent());
+      }
+
+      // Append the text of every slide, in slide order, to the buffer.
+      for (int i = 0; i < slides.size(); i++) {
+        final List scontent = ((Slide) slides.get(i)).getContent();
+
+        for (int j = 0; j < scontent.size(); j++) {
+          final String contentText = scontent.get(j).toString();
+          this.buf.append(contentText);
+
+          // to avoid concatenated words a blank is added unless the text
+          // already ends with a line break
+          if (contentText.length() > 0
+              && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) {
+            this.buf.append(" ");
+          }
+        }
+      }
+    } catch (Throwable ex) {
+      // All Throwables are caught on purpose: a broken document must not
+      // kill the complete crawl.
+      LOG.throwing(this.getClass().getName(), "processPOIFSReaderEvent", ex);
+    }
+  }
+
+  /**
+   * Extracts the client text boxes found between <code>offset</code> and 20
+   * bytes before <code>offsetPD</code> and stores them in the passed table.
+   * <p>
+   * The data is scanned position by position for drawing group records. For
+   * every group (except the master slide group, which is skipped) a
+   * {@link TextBox} is created, or reset if one already exists under the same
+   * id. The text of the TextBytesAtom and TextCharsAtom records following the
+   * group is appended to that text box. Errors while reading a record are
+   * logged and end the corresponding scan; they are never propagated.
+   * 
+   * @param containerTextBox
+   *          table of the text boxes found so far, keyed by group id; it is
+   *          modified in place
+   * @param offset
+   *          position in <code>pptdata</code> where the scan starts
+   * @param pptdata
+   *          raw bytes of the PowerPoint document stream
+   * @param offsetPD
+   *          position of the persist directory; the scan ends 20 bytes before
+   *          it
+   * @return the very <code>containerTextBox</code> instance that was passed in
+   * @see TextBox
+   */
+  protected Hashtable/* <Long, TextBox> */extractTextBoxes(
+      final Hashtable/* <Long, TextBox> */containerTextBox, final int offset,
+      final byte[] pptdata, final long offsetPD) {
+
+    // Records are only searched up to this position.
+    final long scanEnd = offsetPD - 20;
+
+    for (int i = offset; i < scanEnd; i++) {
+      try {
+        final long recordType = LittleEndian.getUShort(pptdata, i + 2);
+        final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
+
+        if (recordType != PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+          // all other record types are ignored
+          continue;
+        }
+
+        // The id read from the record is rounded down to a multiple of 1024
+        // to get the id of the group.
+        long currentID = LittleEndian.getInt(pptdata, i + 12);
+        currentID = ((int) (currentID / 1024)) * 1024;
+
+        if (currentID == PPTConstants.PPT_MASTERSLIDE) {
+          // Ignore Master Slide objects
+          LOG.finest("Ignore master slide.");
+          i++;
+          continue;
+        }
+
+        // Reuse (and reset) the text box of this group or register a new one.
+        final Long key = new Long(currentID);
+        TextBox textBox = (TextBox) containerTextBox.get(key);
+        if (textBox != null) {
+          textBox.setContent("");
+        } else {
+          textBox = new TextBox(currentID);
+          containerTextBox.put(key, textBox);
+        }
+
+        if (scanEnd != recordSize) {
+          // TODO something wrong? Probably an OLE-Object, which we ignore.
+          LOG.finer("offsetPD - 20=" + (offsetPD - 20) + " recordsize="
+              + recordSize);
+          continue;
+        }
+
+        /*
+         * Iterate the byte array for TextCharsAtoms and TextBytesAtoms that
+         * belong to the current group.
+         */
+        for (int startPos = i + 8; startPos < scanEnd
+            && startPos < recordSize; startPos++) {
+          try {
+            final long ntype = LittleEndian.getUShort(pptdata, startPos + 2);
+
+            // Note that the size doesn't include the 8 byte atom header
+            final long nsize = LittleEndian.getUInt(pptdata, startPos + 4);
+
+            if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+              // Next group found: let the outer loop continue right there.
+              i = startPos - 1;
+              break;
+
+            } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
+              final FilteredStringWriter outStream = new FilteredStringWriter();
+              for (long ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) {
+                // Mask to an unsigned value: a plain byte is sign-extended,
+                // which turns every character above 0x7F into
+                // U+FF80..U+FFFF.
+                outStream.write(pptdata[(int) ii + 2] & 0xFF);
+              }
+              textBox.setContent(textBox.getContent() + outStream.toString());
+
+            } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
+              /*
+               * Decode the 16 bit characters directly from the record, the
+               * same way extractSlides() does. Reading relative to the start
+               * of pptdata returns bytes of the stream header instead of the
+               * record, and a String/getBytes() round trip depends on the
+               * platform default charset and can corrupt the binary data.
+               */
+              final int textStart = startPos + 6;
+              final int textLength = (int) nsize + 2;
+              final FilteredStringWriter outStream = new FilteredStringWriter();
+              for (int ii = 0; ii < textLength - 1; ii += 2) {
+                outStream.write(Utils.getUnicodeCharacter(pptdata, textStart
+                    + ii));
+              }
+              textBox.setContent(textBox.getContent() + outStream.toString());
+            }
+            // every other atom type is ignored
+          } catch (Throwable e) {
+            // e.g. a record size pointing beyond the data: stop this group
+            LOG.throwing(this.getClass().getName(), "extractTextBoxes", e);
+            break;
+          }
+        }
+      } catch (Throwable ee) {
+        // e.g. reading beyond the end of the data: stop the whole scan
+        LOG.throwing(this.getClass().getName(), "extractTextBoxes", ee);
+        break;
+      }
+    }
+    return containerTextBox;
+  }
+
+  /**
+   * Returns the PowerPoint <code>Slide</code>s found in the document data.
+   * <p>
+   * The data is scanned position by position, starting at
+   * <code>offset</code>. Every SlidePersistAtom except the first one found
+   * starts a new slide; the text of the TextBytesAtom and TextCharsAtom
+   * records that follow is added to the slide started last. Text found before
+   * the first slide is dropped. The scan ends at the first drawing group
+   * record or 20 bytes before the end of the data.
+   * 
+   * @param offset
+   *          position in <code>pptdata</code> where the scan starts
+   * @param pptdata
+   *          raw bytes of the PowerPoint document stream
+   * @param offsetPD
+   *          position of the persist directory; not used by this method
+   * @return List of the slides in the order they were found. Contains
+   *         <code>{@link Slide Slide}</code>
+   * @see Slide
+   */
+  protected List /* <Slide> */extractSlides(final long offset,
+      final byte[] pptdata, final long offsetPD) {
+
+    // Number of SlidePersistAtoms seen so far
+    int sNum = 0;
+
+    // List of all slides found
+    final List/* <Slide> */slides = new Vector/* <Slide> */();
+
+    // Slide that receives the text found next; null until one is started
+    Slide currentSlide = null;
+
+    for (long i = offset; i < pptdata.length - 20; i++) {
+
+      final int pos = (int) i;
+      final long atomType = LittleEndian.getUShort(pptdata, pos + 2);
+      final long atomSize = LittleEndian.getUInt(pptdata, pos + 4);
+
+      if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
+        /*
+         * TextBytesAtom record: one byte per character
+         */
+        final FilteredStringWriter outStream = new FilteredStringWriter();
+
+        for (long ii = i + 6; (ii <= i + 6 + atomSize)
+            && (ii + 2 < pptdata.length); ii++) {
+          // Mask to an unsigned value: a plain byte is sign-extended, which
+          // turns every character above 0x7F into U+FF80..U+FFFF.
+          outStream.write(pptdata[(int) ii + 2] & 0xFF);
+        }
+
+        // Setting the identified text for Current Slide
+        if (currentSlide != null) {
+          currentSlide.addContent(outStream.toString());
+        }
+
+      } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
+        /*
+         * TextCharsAtom record: two bytes per character. The bytes are read
+         * straight from pptdata; converting them to a String and back
+         * depends on the platform default charset and can corrupt the data.
+         * Decoding starts at pos + 6, exactly where the previous
+         * implementation started. As for TextBytesAtoms, reading stops at
+         * the end of the data if the declared size is too large.
+         */
+        final FilteredStringWriter outStream = new FilteredStringWriter();
+        final int textStart = pos + 6;
+        final int textLength = (int) atomSize + 2;
+
+        for (int ii = 0; (ii < textLength - 1)
+            && (textStart + ii + 1 < pptdata.length); ii += 2) {
+          outStream.write(Utils.getUnicodeCharacter(pptdata, textStart + ii));
+        }
+
+        // Setting the identified text for Current Slide
+        if (currentSlide != null) {
+          currentSlide.addContent(outStream.toString());
+        }
+
+      } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
+        /*
+         * SlidePersistAtom record: the first one found is only counted, all
+         * following ones start a new slide.
+         */
+        if (sNum != 0) {
+          final long slideID = LittleEndian.getUInt(pptdata, pos + 20);
+
+          currentSlide = new Slide(slideID);
+          slides.add(currentSlide);
+        }
+        sNum++;
+
+      } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+        /*
+         * Diagram records are ignored; the scan ends here.
+         */
+        LOG.finest("Drawing Groups are ignored.");
+        break;
+      }
+      // every other atom type is ignored
+    }
+
+    return slides;
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.StringWriter;
+
+/**
+ * A {@link StringWriter} that normalizes characters written one at a time, so
+ * that the collected text is better suited for indexing.
+ * <p>
+ * Only {@link #write(int)} is overridden here; the other <code>write</code>
+ * variants are inherited from {@link StringWriter} as they are.
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @create 19.01.2005
+ */
+public class FilteredStringWriter extends StringWriter {
+
+  /**
+   * Creates a writer with the default initial buffer size.
+   * 
+   * @see StringWriter#StringWriter()
+   */
+  public FilteredStringWriter() {
+    super();
+  }
+
+  /**
+   * Creates a writer with the given initial buffer size.
+   * 
+   * @param initialSize
+   *          initial size of the underlying buffer
+   * @see StringWriter#StringWriter(int)
+   */
+  public FilteredStringWriter(final int initialSize) {
+    super(initialSize);
+  }
+
+  /**
+   * Writes a single character after filtering it:
+   * <ul>
+   * <li>carriage return and NUL become the platform line separator,</li>
+   * <li>every other ISO control character (backspace, tab, line feed, ...) is
+   * dropped,</li>
+   * <li>any remaining whitespace character becomes a single blank,</li>
+   * <li>everything else is written unchanged.</li>
+   * </ul>
+   * 
+   * @param ch
+   *          the character to write
+   * @see java.io.Writer#write(int)
+   */
+  public void write(final int ch) {
+    final char c = (char) ch;
+
+    if (ch == '\r' || ch == 0) {
+      // PowerPoint seems to store files with \r as the line break
+      // -> unify to platform specific format (NUL is treated the same way)
+      super.write(System.getProperty("line.separator"));
+      return;
+    }
+
+    if (Character.isISOControl(c)) {
+      // Dropped without a replacement; this also covers '\b'.
+      // NOTE(review): tab and line feed are control characters too, so the
+      // words around them end up joined - confirm that this is intended.
+      return;
+    }
+
+    if (Character.isWhitespace(c)) {
+      // unify to blank
+      super.write(' ');
+      return;
+    }
+
+    super.write(ch);
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,189 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+
+/**
+ * Nutch-Parser for parsing MS PowerPoint slides ( mime type:
+ * application/vnd.ms-powerpoint).
+ * <p>
+ * It is based on org.apache.poi.*.
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @see <a href="http://jakarta.apache.org/poi">Jakarta POI</a>
+ * @version 1.0
+ */
+public class MSPowerPointParser implements Parser {
+
+  /** associated Mime type for PowerPoint files (application/vnd.ms-powerpoint) */
+  public static final String MIME_TYPE = "application/vnd.ms-powerpoint";
+
+  private static final Logger LOG = LogFormatter
+      .getLogger(MSPowerPointParser.class.getName());
+
+  /**
+   * Creates a parser. The constructor body is empty, so no state is set up
+   * here.
+   */
+  public MSPowerPointParser() {
+  }
+
+  /**
+   * Creates a parser. The constructor body is empty.
+   * 
+   * @param fileName
+   *          not used by this constructor
+   */
+  public MSPowerPointParser(String fileName) {
+
+  }
+
+  /**
+   * Command line entry point for manual testing: parses the PowerPoint file
+   * named by the first argument and prints the extracted text to standard out.
+   * Exits with status 1 when no file is given or when the file cannot be read.
+   * 
+   * @param args
+   *          command line arguments; <code>args[0]</code> is the ppt-file
+   */
+  public static void main(String args[]) {
+    if (args.length < 1) {
+      System.err.println("Useage:");
+      System.err.println("\tMSPowerPointParser <file>");
+      System.exit(1);
+    }
+
+    String file = args[0];
+    MSPowerPointParser ppe = new MSPowerPointParser();
+
+    byte[] raw = getRawBytes(new File(file));
+    if (raw == null) {
+      // getRawBytes reports a missing or unreadable file by returning null;
+      // stop here instead of failing on raw.length below.
+      System.err.println("Cannot read file: " + file);
+      System.exit(1);
+    }
+
+    Properties prop = new Properties();
+    prop.setProperty("Content-Length", "" + raw.length);
+
+    Content content = new Content(file, file, raw, MIME_TYPE, prop);
+
+    System.out.println(ppe.getParse(content).getText());
+  }
+
+  /**
+   * Turns fetched PowerPoint content into a Nutch {@link Parse}.
+   * <p>
+   * An empty, failed parse is returned when the content type is set but does
+   * not start with {@link #MIME_TYPE}, when the number of fetched bytes differs
+   * from the "Content-Length" value, or when the extraction throws. Otherwise
+   * the parse carries the extracted text, the outlinks found in that text, the
+   * document title and the content metadata merged with the remaining document
+   * properties.
+   * 
+   * @param content
+   *          the fetched content to parse
+   * @return the parse result
+   * @see org.apache.nutch.parse.Parser#getParse(Content)
+   */
+  public Parse getParse(final Content content) {
+
+    // A missing content type is accepted; a present one has to match ours.
+    final String declaredType = content.getContentType();
+    final boolean typeAccepted = declaredType == null
+        || declaredType.startsWith(MIME_TYPE);
+
+    if (!typeAccepted) {
+      final String reason = "Content-Type is not [" + MIME_TYPE + "] was: "
+          + declaredType;
+      return new ParseStatus(ParseStatus.FAILED,
+          ParseStatus.FAILED_INVALID_FORMAT, reason).getEmptyParse();
+    }
+
+    String text = null;
+    Outlink[] links = null;
+    Properties docProperties = null;
+
+    try {
+      final String declaredLength = content.get("Content-Length");
+      final byte[] bytes = content.getContent();
+
+      // Fewer (or more) bytes than announced means the fetch was cut off.
+      final boolean truncated = declaredLength != null
+          && bytes.length != Integer.parseInt(declaredLength);
+
+      if (truncated) {
+        final String reason = "Content truncated at "
+            + bytes.length
+            + " bytes. Please increase <protocol>.content.limit at nutch-default.xml. "
+            + "Parser can't handle incomplete PowerPoint files.";
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, reason).getEmptyParse();
+      }
+
+      final PPTExtractor extractor = new PPTExtractor(
+          new ByteArrayInputStream(bytes));
+
+      text = extractor.getText();
+      docProperties = extractor.getProperties();
+      links = this.getOutlinks(text, content.getUrl());
+
+    } catch (Exception e) {
+      LOG.throwing(this.getClass().getName(), "getParse", e);
+      return new ParseStatus(e).getEmptyParse();
+    }
+
+    // Start with the metadata of the content, then add the document's own.
+    final Properties metadata = new Properties();
+    metadata.putAll(content.getMetadata());
+
+    String title = null;
+    if (docProperties != null) {
+      // The title is handed over separately, so take it out of the properties.
+      title = docProperties.getProperty("Title");
+      docProperties.remove("Title");
+      metadata.putAll(docProperties);
+    }
+
+    final String safeText = (text == null) ? "" : text;
+    final String safeTitle = (title == null) ? "" : title;
+
+    final ParseData parseData = new ParseData(new ParseStatus(
+        ParseStatus.SUCCESS), safeTitle, links, metadata);
+
+    LOG.finest("PowerPoint file parsed sucessful.");
+    return new ParseImpl(safeText, parseData);
+  }
+
+  /**
+   * Collects the outlinks of the document by delegating to
+   * {@link OutlinkExtractor#getOutlinks(String, String)}.
+   * 
+   * @param plainText
+   *          the extracted plain text that is searched for URLs
+   * @param anchor
+   *          the anchor passed on for every extracted outlink
+   * 
+   * @return Array of links within the PowerPoint file
+   */
+  protected Outlink[] getOutlinks(String plainText, String anchor) {
+    return OutlinkExtractor.getOutlinks(plainText, anchor);
+  }
+  
+  /**
+   * Reads the complete file into a byte array.
+   * 
+   * @param f
+   *          the file to read
+   * @return the file content, or <code>null</code> if the file does not exist
+   *         or cannot be read
+   */
+  private final static byte[] getRawBytes(File f) {
+    FileInputStream fin = null;
+    try {
+      if (!f.exists())
+        return null;
+      fin = new FileInputStream(f);
+      final byte[] buffer = new byte[(int) f.length()];
+
+      // A single read() may deliver fewer bytes than requested, so keep
+      // reading until the buffer is full or the end of the file is reached.
+      int filled = 0;
+      while (filled < buffer.length) {
+        final int count = fin.read(buffer, filled, buffer.length - filled);
+        if (count < 0) {
+          break;
+        }
+        filled += count;
+      }
+
+      if (filled < buffer.length) {
+        // The file ended early: return only what was really read.
+        final byte[] trimmed = new byte[filled];
+        System.arraycopy(buffer, 0, trimmed, 0, filled);
+        return trimmed;
+      }
+      return buffer;
+    } catch (Exception err) {
+      err.printStackTrace();
+      return null;
+    } finally {
+      // Close the stream on every path, including a failed read.
+      if (fin != null) {
+        try {
+          fin.close();
+        } catch (java.io.IOException ignored) {
+          // nothing useful can be done if closing fails
+        }
+      }
+    }
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,228 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+
+/**
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
+ * / URLs from plain text using Regular Expressions.
+ * 
+ * @see <a
+ *      href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
+ *      of different regexp-Implementations </a>
+ * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
+ *      </a>
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @since 0.7
+ */
+public class OutlinkExtractor {
+  private static final Logger LOG = LogFormatter
+      .getLogger(OutlinkExtractor.class.getName());
+
+  /**
+   * Regex pattern to get URLs within a plain text.
+   * 
+   * @see <a
+   *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+   *      </a>
+   */
+  private static final String URL_PATTERN = 
+    "([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?)";
+
+  /**
+   * Extracts <code>Outlink</code>s from the given plain text, using an empty
+   * string as the anchor of every outlink.
+   * 
+   * @param plainText  the plain text from which URLs should be extracted.
+   * 
+   * @return Array of <code>Outlink</code>s found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText) {
+    return OutlinkExtractor.getOutlinks(plainText, "");
+  }
+
+  /**
+   * Extracts <code>Outlink</code>s from the given plain text and sets the
+   * given anchor on every extracted <code>Outlink</code>.
+   * <p>
+   * A match that cannot be turned into an <code>Outlink</code> is logged and
+   * skipped; the remaining matches are still extracted. If the matching itself
+   * fails, the outlinks collected so far are returned.
+   * 
+   * @param plainText the plain text from which URLs should be extracted.
+   * @param anchor    the anchor of the url
+   * 
+   * @return Array of <code>Outlink</code>s found in plainText, empty if there
+   *         are none
+   */
+  public static Outlink[] getOutlinks(final String plainText, String anchor) {
+
+    final List outlinks = new ArrayList();
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(URL_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+      //loop the matches
+      while (matcher.contains(input, pattern)) {
+        final MatchResult result = matcher.getMatch();
+        final String url = result.group(0);
+
+        try {
+          outlinks.add(new Outlink(url, anchor));
+        } catch (Exception ex) {
+          // if it is a malformed URL we just throw it away and continue with
+          // extraction. The catch sits inside the loop so that one bad match
+          // does not discard all matches that follow it.
+          LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
+        }
+      }
+    } catch (Exception ex) {
+      // compiling the pattern or matching failed: keep what was found so far
+      LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
+    }
+
+    //create array of the Outlinks (an empty list gives an empty array)
+    return (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
+  }
+  
+
+  /**
+   * Placeholder for an outlink extraction based on the Jakarta Regexp API.
+   * <p>
+   * The implementation is commented out, so this method always throws an
+   * {@link UnsupportedOperationException}. It is private and not called from
+   * within this class.
+   * 
+   * @param plainText
+   *          the plain text from which URLs would be extracted
+   * 
+   * @return never returns normally
+   * @deprecated only for tests
+   */
+  private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+
+    // Disabled implementation, kept for reference:
+    //
+    // final List outlinks = new ArrayList();
+    // String url;
+    // Outlink link;
+    //
+    // RE re = new RE(URL_PATTERN);
+    //
+    // int pos = 0;
+    //
+    // while (re.match(plainText, pos)) {
+    //
+    // url = re.getParen(0);
+    //
+    // LOG.finest("Extracted url: " + url);
+    //
+    // try {
+    //
+    // link = new Outlink(url, null);
+    // outlinks.add(link);
+    //
+    // } catch (MalformedURLException ex) {
+    // // if it is a malformed URL we just throw it away and continue with
+    // // extraction.
+    // LOG.throwing(this.getClass().getName(), "getOutlinks", ex);
+    // }
+    //
+    // pos = re.getParenEnd(0);
+    // }
+    //
+    // final Outlink[] retval;
+    //
+    // if (pos > 0) {
+    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    // } else {
+    // retval = new Outlink[0];
+    // }
+    //
+    // return retval;
+
+  }
+
+  /**
+   * Placeholder for an outlink extraction based on the JDK5 Regexp API.
+   * <p>
+   * The implementation is commented out, so this method always throws an
+   * {@link UnsupportedOperationException}. It is private and not called from
+   * within this class.
+   * 
+   * @param plainText
+   *          the plain text from which URLs would be extracted
+   * 
+   * @return never returns normally
+   * @deprecated only for tests
+   */
+  private Outlink[] getOutlinksJDK5Impl(final String plainText) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+
+    // Disabled implementation, kept for reference.
+    // NOTE(review): it still matches through an "RE" object, as the Jakarta
+    // variant does - verify before uncommenting.
+    //
+    // final List outlinks = new ArrayList();
+    // String url;
+    // Outlink link;
+    //
+    // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
+    // final RE re = new RE(urlPattern);
+    //
+    // int pos = 0;
+    //
+    // while (re.match(plainText, pos)) {
+    //
+    // url = re.getParen(0);
+    //
+    // try {
+    //
+    // link = new Outlink(url, null);
+    // outlinks.add(link);
+    // } catch (MalformedURLException ex) {
+    // // if it is a malformed URL we just throw it away and continue with
+    // // extraction.
+    // LOG.throwing(this.getClass().getName(), "getOutlinks", ex);
+    // }
+    //
+    // pos = re.getParenEnd(0);
+    // }
+    //
+    // final Outlink[] retval;
+    //
+    // if (pos > 0) {
+    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    // } else {
+    // retval = new Outlink[0];
+    // }
+    //
+    // return retval;
+  }
+ 
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+/**
+ * Package protected class for the required internal MS PowerPoint constants.
+ * All members are compile-time constants; the class holds no state.
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+class PPTConstants {
+
+  /** ID of master slide */
+  public static final long PPT_MASTERSLIDE = 1024L;
+
+  /** ATOM ID of slide */
+  public static final long PPT_ATOM_SLIDE = 1007L;
+
+  /** ATOM ID of notes */
+  public static final long PPT_ATOM_NOTES = 1009L;
+
+  /** ATOM ID of persistent slide */
+  public static final long PPT_ATOM_SLIDEPERSISTANT = 1011L;
+
+  /** ATOM ID of text char area. Holds text in byte swapped unicode form. */
+  public static final long PPT_ATOM_TEXTCHAR = 4000L;
+
+  /** ATOM ID of text byte area. Holds text in ascii form */
+  public static final long PPT_ATOM_TEXTBYTE = 4008L;
+
+  /** ATOM ID of user edit area */
+  public static final long PPT_ATOM_USEREDIT = 4085L;
+
+  /** ATOM ID of drawing group area */
+  public static final long PPT_ATOM_DRAWINGGROUP = 61448L;
+
+  /** Name for PowerPoint Documents within the file */
+  public static final String POWERPOINT_DOCUMENT = "PowerPoint Document";
+
+
+
+  /**
+   * Protected constructor to discourage instantiation; only subclasses and
+   * classes of this package can call it.
+   */
+  protected PPTConstants() {
+    // nothing
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,153 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+/**
+ * Converts the Powerpoint document content to plain text.
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+
+public class PPTExtractor {
+
+  private static final Logger LOG = LogFormatter.getLogger(PPTExtractor.class
+      .getName());
+
+  /** Parsed plain Powerpoint Text */
+  private final transient StringBuffer contentBuf;
+
+  private final PropertiesBroker propertiesBroker;
+
+  private final POIFSReader poireader;
+
+  /**
+   * Constructor that takes a PowerPoint file as <code>InputStream</code> to
+   * parse it. The reader, the properties broker and the content buffer are
+   * created first; the stream is then processed by <code>init</code> before
+   * the constructor returns.
+   * 
+   * @param in
+   *          <code>InputStream</code> containing the PowerPoint file
+   * @throws PowerPointDocumentException
+   *           thrown if parsing failed
+   */
+  public PPTExtractor(final InputStream in) throws PowerPointDocumentException {
+    this.poireader = new POIFSReader();
+    this.propertiesBroker = new PropertiesBroker();
+    this.contentBuf = new StringBuffer();
+
+    this.init(in);
+  }
+
+  /**
+   * Get the PowerPoint content text as plain text, i.e. the current content
+   * of the internal content buffer.
+   * 
+   * @return String the content text
+   */
+  public String getText() {
+    return this.contentBuf.toString();
+  }
+
+  /**
+   * Get the <code>Properties</code> of the PowerPoint document as held by the
+   * properties broker. The broker waits for a limited time if no properties
+   * have been set yet, so this call may block briefly.
+   * 
+   * @return the properties of the document, or <code>null</code> if none were
+   *         set within the broker's timeout
+   */
+  public Properties getProperties() {
+    return this.propertiesBroker.getProperties();
+  }
+
+  /**
+   * Registers the listeners for the document properties and the document
+   * content, then lets the reader process the given stream.
+   * 
+   * @param input
+   *          stream containing the PowerPoint file
+   * @throws PowerPointDocumentException
+   *           if reading the stream fails
+   */
+  private void init(final InputStream input) throws PowerPointDocumentException {
+    // register listener for SummaryInformation
+    this.poireader.registerListener(new PropertiesReaderListener(
+        this.propertiesBroker), SummaryInformation.DEFAULT_STREAM_NAME);
+
+    // register listener for PPT-document content
+    this.poireader.registerListener(new ContentReaderListener(this.contentBuf),
+        PPTConstants.POWERPOINT_DOCUMENT);
+
+    try {
+      // Rewind only streams that support it. Calling reset() on a stream
+      // without mark support throws an IOException and would reject an
+      // otherwise readable document.
+      if (input.markSupported()) {
+        input.reset();
+      }
+      if (input.available() > 0) {
+        this.poireader.read(input);
+      } else {
+        LOG.warning("Input <=0 :" + input.available());
+      }
+    } catch (IOException e) {
+      throw new PowerPointDocumentException(e);
+    }
+  }
+
+  /**
+   * Hands the document properties from the listener that collects them over
+   * to the code that asks for them. A reader waits for a limited time until
+   * properties have been set.
+   * 
+   * @author Stephan Strittmatter
+   * @version 1.0
+   */
+  static class PropertiesBroker {
+
+    /** Maximum time in milliseconds a reader waits for the properties. */
+    private final static int TIMEOUT = 2 * 1000;
+
+    private Properties properties = null;
+
+    /**
+     * Get the collected properties. If none have been set yet, the call waits
+     * until they are set, until the timeout has passed or until the calling
+     * thread is interrupted, whichever comes first.
+     * 
+     * @return properties of the PowerPoint file, or <code>null</code> if none
+     *         were set in time
+     */
+    public synchronized Properties getProperties() {
+
+      final long start = System.currentTimeMillis();
+      long now = start;
+
+      while (this.properties == null && now - start < TIMEOUT) {
+        try {
+          wait(TIMEOUT / 10);
+        } catch (InterruptedException e) {
+          // Keep the interrupt visible to the caller and stop waiting,
+          // instead of swallowing it and sitting out the whole timeout.
+          Thread.currentThread().interrupt();
+          break;
+        }
+        now = System.currentTimeMillis();
+      }
+
+      // No notifyAll() here: reading does not change any state that another
+      // waiting thread could be interested in.
+      return this.properties;
+    }
+
+    /**
+     * Stores the properties and wakes up every thread waiting for them.
+     * 
+     * @param properties
+     *          the properties of the PowerPoint file
+     */
+    public synchronized void setProperties(Properties properties) {
+      this.properties = properties;
+      notifyAll();
+    }
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.IOException;
+
+/**
+ * Checked exception signalling that a PowerPoint document could not be
+ * processed. <code>PPTExtractor</code>, for example, wraps I/O failures while
+ * reading the document in it.
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+
+public class PowerPointDocumentException extends Exception {
+
+  /** Version identifier used for serialization. */
+  private static final long serialVersionUID = 3256438093031487028L;
+
+  /**
+   * Creates the exception with a detail message.
+   * 
+   * @param message
+   *          description of the failure
+   */
+  public PowerPointDocumentException(String message) {
+    super(message);
+  }
+
+  /**
+   * Creates the exception with a detail message and the underlying cause.
+   * 
+   * @param message
+   *          description of the failure
+   * @param cause
+   *          the failure that led to this exception
+   */
+  public PowerPointDocumentException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /**
+   * Creates the exception as a wrapper around another exception.
+   * 
+   * @param e
+   *          the exception to wrap; it becomes the cause of this exception
+   */
+  public PowerPointDocumentException(Exception e) {
+    super(e);
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,130 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Properties;
+import java.util.TimeZone;
+import java.util.logging.Logger;
+
+import org.apache.nutch.parse.mspowerpoint.PPTExtractor.PropertiesBroker;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+
+/**
+ * Listener for retrieving the properties of document. It reads the summary
+ * information stream it is notified about, copies selected values into a
+ * {@link Properties} object and hands that object to the
+ * {@link PropertiesBroker}.
+ * 
+ * @author Stephan Strittmatter
+ * 
+ * @version 1.0
+ */
+class PropertiesReaderListener implements POIFSReaderListener {
+  private static final Logger LOG = LogFormatter
+      .getLogger(PropertiesReaderListener.class.getName());
+
+  /**
+   * ID of the time zone used for formatting dates. The former value "GTM" was
+   * a typo that only resulted in GMT because unknown IDs fall back to GMT.
+   */
+  private static final String TIME_ZONE_ID = "GMT";
+
+  private final transient PropertiesBroker propertiesBroker;
+
+  /** DateFormatter for transferring dates to strings. */
+  private final transient SimpleDateFormat dateFormatter = new SimpleDateFormat();
+
+  /** Properties of the powerpoint Document */
+  private final transient Properties properties;
+
+  /**
+   * Listener for retrieving the properties of document.
+   * 
+   * @param propertiesBroker
+   *          receives the collected properties once an event was processed
+   */
+  public PropertiesReaderListener(final PropertiesBroker propertiesBroker) {
+    this.propertiesBroker = propertiesBroker;
+    this.dateFormatter.setTimeZone(TimeZone.getTimeZone(TIME_ZONE_ID));
+    this.properties = new Properties();
+  }
+
+  /**
+   * Process the properties of the document and adds them to property object.
+   * The property object is handed to the broker in every case, even if the
+   * stream could not be read or was not the summary information stream.
+   * 
+   * @param event
+   *          contains the document to be parsed
+   */
+  public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+
+    if (event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
+
+      try {
+        final SummaryInformation sInfo = (SummaryInformation) PropertySetFactory
+            .create(event.getStream());
+
+        addProperty("Title", sInfo.getTitle());
+        addProperty("Subject", sInfo.getSubject());
+        addProperty("Keywords", sInfo.getKeywords());
+        addProperty("Comments", sInfo.getComments());
+        addProperty("Author", sInfo.getAuthor());
+        addProperty("Last-Author", sInfo.getLastAuthor());
+
+        /*
+         * already provided by nutch
+         */
+        // addProperty("Saved-Date", si.getLastSaveDateTime());
+        /*
+         * following properties are not required for indexing/searching
+         */
+        // addProperty("Word-Count", si.getWordCount());
+        // addProperty("Page-Count", si.getPageCount());
+        // addProperty("Character Count", si.getCharCount());
+        // addProperty("Revision-Number", si.getRevNumber());
+        // addProperty("Creation-Date", si.getEditTime());
+        // addProperty("Edit-Time", si.getEditTime());
+        // addProperty("Last-Printed", si.getLastPrinted());
+        // addProperty("Template", si.getTemplate());
+        // addProperty("Security", si.getSecurity());
+        // addProperty("Application-Name", si.getApplicationName());
+      } catch (Exception ex) {
+        // best effort: the properties collected so far are still published
+        LOG.throwing(this.getClass().getName(), "processPOIFSReaderEvent", ex);
+      }
+      
+    } else {
+      LOG.warning("Wrong stream not processed: " + event.getName());
+    }
+
+    this.propertiesBroker.setProperties(this.properties);
+  }
+
+  /**
+   * Adds a numeric property; a value of 0 is skipped.
+   * 
+   * @param name
+   *          name of the property
+   * @param value
+   *          value of the property
+   */
+  protected void addProperty(final String name, final long value) {
+    if (value != 0) {
+      this.properties.setProperty(name, String.valueOf(value));
+    }
+  }
+
+  /**
+   * Adds a text property; a <code>null</code> value is skipped.
+   * 
+   * @param name
+   *          name of the property
+   * @param value
+   *          value of the property
+   */
+  protected void addProperty(final String name, final String value) {
+    if (value != null) {
+      this.properties.setProperty(name, value);
+    }
+  }
+
+  /**
+   * Adds a date property, formatted with the date formatter of this listener;
+   * a <code>null</code> value is skipped.
+   * 
+   * @param name
+   *          name of the property
+   * @param value
+   *          value of the property
+   */
+  protected void addProperty(final String name, final Date value) {
+    if (value != null) {
+      this.properties.setProperty(name, this.dateFormatter.format(value));
+    }
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,73 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.util.List;
+import java.util.Vector;
+
+/**
+ * Package protected holder for one MS Powerpoint slide (number plus contents).
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+class Slide {
+
+
+  /** Number of this slide; assigned once in the constructor. */
+  protected transient final long slideNumber;
+
+  /** Content entries of this slide, in the order they were added. */
+  protected transient final List/* <String> */contents;
+
+  /**
+   * Creates a slide with the given number and an empty content list.
+   * 
+   * @param number the slide number, later returned by getSlideNumber()
+   */
+  public Slide(long number) {
+    this.slideNumber = number;
+    this.contents = new Vector/* <String> */();
+  }
+
+  /**
+   * Appends one content entry to the end of this slide's content list.
+   * 
+   * @param content the text to append; it is stored as given
+   */
+  public void addContent(String content) {
+    this.contents.add(content);
+  }
+
+  /**
+   * Returns the content list of this slide (the internal list, not a copy).
+   * 
+   * @return List of the entries added via addContent(String)
+   */
+  public List getContent() {
+    return this.contents;
+  }
+
+  /**
+   * Returns the number of this slide.
+   * 
+   * @return the slide number passed to the constructor
+   */
+  public long getSlideNumber() {
+    return this.slideNumber;
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+/**
+ * Package protected class for the MS Powerpoint TextBox content (id plus text).
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+class TextBox {
+
+  /**
+   * Id of this text box; assigned once in the constructor
+   */
+  protected transient final long currentID;
+
+  /**
+   * Content of text box; replaceable via setContent(String)
+   */
+  protected String content;
+
+  /**
+   * Instantiates the text box object with an empty string as content
+   * 
+   * @param textBoxId
+   *          id of text box
+   */
+  public TextBox(final long textBoxId) {
+    this.currentID = textBoxId;
+    this.content = "";
+  }
+
+  /**
+   * Instantiates the text box object with the given content
+   * 
+   * @param textBoxId
+   *          id of text box
+   * @param content
+   *          content of text box; stored as given, no null check is done
+   */
+  public TextBox(final long textBoxId, final String content) {
+    this.currentID = textBoxId;
+    this.content = content;
+  }
+
+  /**
+   * Replaces the content of the text box
+   * 
+   * @param content
+   *          new content of text box; stored as given, no null check is done
+   */
+  public void setContent(final String content) {
+    this.content = content;
+  }
+
+  /**
+   * Returns the content of the text box
+   * 
+   * @return content as last set by a constructor or setContent(String)
+   */
+  public String getContent() {
+    return this.content;
+  }
+
+  /**
+   * Returns the current text box id
+   * 
+   * @return the id passed to the constructor
+   */
+  public long getCurrentId() {
+    return this.currentID;
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html Fri Sep  2 08:55:47 2005
@@ -0,0 +1,35 @@
+<!--
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+-->
+<html>
+	<head>
+		<title>Microsoft PowerPoint document parsing plugin.</title>
+	</head>
+	<body>
+		<p>A Microsoft &copy; PowerPoint document parsing plugin.</p>
+		<p>This package relies on <a 
+			href="http://www.apache.org/poi/index.html">POI</a>.</p>
+		<p> Implementation based on sources found at <a 
+			href="http://groups.google.com/groups?selm=a4f8800541bc694d5af7dabb35e83b72%40localhost.talkaboutsoftware.com">Google 
+			Groups </a>. It can also be found at <a 
+			href="http://www.mail-archive.com/poi-user@jakarta.apache.org/msg04809.html">http://www.mail-archive.com/poi-user@jakarta.apache.org/msg04809.html</a> 
+			written by Hari Shanker and Sudhakar Chavali. Thanks for the basic 
+			work!</p>
+		<p>I changed these classes to also support Unicode content and 
+			optimized them for Nutch.</p>
+	</body>
+</html>
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.File;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * Builds a JUnit suite with one sub-suite per ".ppt" file found in SAMPLE_DIR.
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ */
+public class AllTests {
+
+  /** Directory holding the test files; read from system property "test.data", with a build-dir default. */
+  private final static String SAMPLE_DIR = System.getProperty("test.data",
+      "build/parse-mspowerpoint/test/data");
+
+  /**
+   * Runs the suite returned by suite() with the JUnit text runner
+   * 
+   * @param args
+   *          not required
+   */
+  public static void main(String[] args) {
+    junit.textui.TestRunner.run(AllTests.suite());
+  }
+
+  /** Creates, for each ".ppt" file in SAMPLE_DIR, a sub-suite running testContent and testMeta.
+   * @return Test for the PowerPoint plugin
+   */
+  public static Test suite() {
+    final TestSuite suite = new TestSuite(
+        "Test for org.apache.nutch.parse.mspowerpoint");
+    
+    System.out.println("Testing with ppt-files of dir: " + SAMPLE_DIR);
+    
+    final File sampleDir = new File(SAMPLE_DIR);
+
+    // find all files in the test-directory whose name ends with ".ppt"
+    final FileExtensionFilter pptFilter = new FileExtensionFilter(".ppt");
+    final String[] pptFiles = sampleDir.list(pptFilter);
+
+    if(pptFiles== null) // File.list returns null when the path is not a directory or cannot be read
+    {
+      throw new IllegalArgumentException(SAMPLE_DIR + " does not contain any files: " + pptFilter);
+    }
+    TestSuite suiteAllFiles;
+    
+
+    // iterate over all ppt-files which are found and test against them
+    for (int i = 0; i < pptFiles.length; i++) {
+      // test the content... (pptFiles[i] is a bare file name, without the directory)
+      suiteAllFiles = new TestSuite("Testing file [" + pptFiles[i] + "]");
+      TestCase test = new TestMSPowerPointParser(new File(pptFiles[i]));
+      test.setName("testContent");
+
+      suiteAllFiles.addTest(test);
+
+      // ..then the properties, using a second test instance for the same file
+      TestCase test2 = new TestMSPowerPointParser(new File(pptFiles[i]));
+      test2.setName("testMeta");
+      suiteAllFiles.addTest(test2);
+
+      suite.addTest(suiteAllFiles);
+    }
+
+    return suite;
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,55 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.File;
+import java.io.FilenameFilter;
+
+/**
+ * Helper class to filter for specific files to test them: accepts every
+ * file name that ends with the configured suffix (case-sensitive).
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+/* package protected */class FileExtensionFilter implements FilenameFilter {
+  private String ext = "*"; // initial value only; the constructor always overwrites it
+
+  /**
+   * @param ext suffix a file name must end with to be accepted, e.g. ".ppt"
+   */
+  public FileExtensionFilter(String ext) {
+    this.ext = ext;
+  }
+
+  /* (non-Javadoc) Accepts a name if it ends with the suffix; dir is not looked at.
+   * @see java.io.FilenameFilter#accept(java.io.File, java.lang.String)
+   */
+  public boolean accept(File dir, String name) {
+    if (name.endsWith(this.ext))
+      return true;
+    return false;
+  }
+
+  /* (non-Javadoc) Returns the configured suffix.
+   * @see java.lang.Object#toString()
+   */
+  public String toString() {
+    // the suffix itself is the textual form; AllTests puts it into an error message
+    return this.ext;
+  }
+}
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Fri Sep  2 08:55:47 2005
@@ -0,0 +1,259 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.logging.Logger;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.LogFormatter;
+
+/**
+ * <p>
+ * Unit tests for MSPowerPointParser.
+ * </p>
+ * <p>
+ * Make sure sample files are copied to "test.data" as specified in
+ * ./src/plugin/parse-mspowerpoint/build.xml during plugin compilation. Check
+ * ./src/plugin/parse-mspowerpoint/sample/README.txt for what they are.
+ * </p>
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+public class TestMSPowerPointParser extends TestCase {
+  private static final Logger LOG = LogFormatter
+      .getLogger(TestMSPowerPointParser.class.getName());
+
+  private static final String CHARSET = "UTF-8"; // used for the dump file and for reading reference files
+
+  private final static String LINE_SEPARATOR = System.getProperty("line.separator");
+
+  /** Directory holding the test files; read from system property "test.data", with a build-dir default. */
+  private final static String SAMPLE_DIR = System.getProperty("test.data",
+      "build/parse-mspowerpoint/test/data");
+
+  private final File sampleDir = new File(SAMPLE_DIR);
+
+  /**
+   * Whether to dump the extracted data to a file for visual checks.
+   */
+  private final static boolean DUMP_TO_FILE = true;
+
+  private final File testFile; // file under test; setUp() uses only its name
+
+  private String urlString; // "file:" URL of the test file, built in setUp()
+
+  private Protocol protocol; // protocol obtained for urlString in setUp()
+
+  private Content content; // fetched content of the test file, set in setUp()
+
+  /**
+   * Creates a named test that runs against "test.ppt" in the sample directory.
+   * @param name name handed to the TestCase constructor
+   */
+  public TestMSPowerPointParser(String name) {
+    super(name);
+    this.testFile = new File(this.sampleDir, "test.ppt");
+  }
+
+  /** Creates a test for the given file; the test name is set afterwards via setName(), as AllTests does.
+   * @param file ppt-file to test; setUp() uses only its name and resolves it against SAMPLE_DIR
+   */
+  public TestMSPowerPointParser(File file) {
+    super();
+    this.testFile = file;
+  }
+
+  /** Builds the URL of the test file and fetches its Content through the matching Protocol.
+   * @see TestCase#setUp()
+   */
+  protected void setUp() throws Exception {
+    super.setUp();
+
+    this.urlString = createUrl(this.testFile.getName()); // any directory part of testFile is dropped here
+
+    System.out.println("Testing file: " + this.urlString + "...");
+    this.protocol = ProtocolFactory.getProtocol(this.urlString);
+    this.content = this.protocol.getProtocolOutput(this.urlString).getContent();
+  }
+
+  /** Only delegates to the superclass; this class has nothing of its own to release.
+   * @see TestCase#tearDown()
+   */
+  protected void tearDown() throws Exception {
+    super.tearDown();
+  }
+
+  /**
+   * Parses the test file, requires non-empty text and compares it char by
+   * char with the ".content" reference file if one exists.
+   * 
+   * @see #SAMPLE_DIR
+   * @throws Exception on any failure while parsing or reading files
+   */
+  public void testContent() throws Exception {
+
+    Parser parser = ParserFactory.getParser(this.content.getContentType(),
+        this.urlString);
+    Parse parse = parser.getParse(this.content);
+
+    ParseData data = parse.getData();
+    String text = parse.getText();
+
+    assertTrue("No content extracted length ==0", text.length() > 0);
+    
+    this.dumpToFile(this.testFile.getName(), data, text); // dump first, so it exists even if the comparison fails
+
+    final FileExtensionFilter contentFilter = new FileExtensionFilter(
+        this.testFile.getName() + ".content");
+    final File[] contentFiles = this.sampleDir.listFiles(contentFilter); // NOTE(review): null if sampleDir is not a readable directory; not checked
+
+    if (contentFiles.length > 0) {
+      String testContent = this.fileToString(contentFiles[0]);
+
+      for (int i = 0; i < text.length(); i++) { // NOTE(review): bounded by the parsed text only; a shorter reference makes charAt(i) throw, a longer one goes unnoticed
+        char parsedChar = text.charAt(i);
+        char testChar = testContent.charAt(i);
+        assertEquals("Wrong char at position [" + i + "]", "" + testChar, ""
+            + parsedChar);
+      }
+    } else {
+      LOG.info("Comparison file for Content not available: "
+          + this.testFile.getName() + ".content");
+    }
+  }
+
+  /**
+   * Compares the parsed title and the number of outlinks with the ".meta"
+   * reference file, or only checks for a non-empty title if there is none.
+   * 
+   * @see #SAMPLE_DIR
+   * @throws Exception on any failure while parsing or reading files
+   */
+  public void testMeta() throws Exception {
+
+    Parser parser = ParserFactory.getParser(this.content.getContentType(),
+        this.urlString);
+    Parse parse = parser.getParse(this.content);
+
+    ParseData data = parse.getData();
+
+    final FileExtensionFilter titleFilter = new FileExtensionFilter(
+        this.testFile.getName() + ".meta");
+    final File[] titleFiles = this.sampleDir.listFiles(titleFilter); // NOTE(review): null if sampleDir is not a readable directory; not checked
+
+    if (titleFiles.length > 0) {
+      assertEquals("Document Title", this.fileToString(titleFiles[0]),
+          "Title: " + data.getTitle() + LINE_SEPARATOR +
+          "Outlinks: " + data.getOutlinks().length + LINE_SEPARATOR);
+    } else {
+      assertTrue("Document Title length ==0", data.getTitle().length() > 0);
+      LOG.info("Comparison file for Title not available: "
+          + this.testFile.getName() + ".meta");
+    }
+  }
+
+  /**
+   * create complete url
+   * 
+   * @param fileName
+   *          name of the file
+   * @return "file:" followed by SAMPLE_DIR, a slash and the file name.
+   */
+  private String createUrl(final String fileName) {
+    return "file:" + SAMPLE_DIR + "/" + fileName;
+  }
+
+  /**
+   * Dump the parsed data to a UTF-8 formatted file for visual checks.
+   * Nothing is written unless DUMP_TO_FILE is true.
+   * @param fileName base name; the dump is written to fileName + ".txt"
+   * @param data parse data, written first via toString()
+   * @param text extracted text, written after the parse data
+   * @throws IOException if the dump file cannot be opened or written
+   */
+  private void dumpToFile(final String fileName, final ParseData data,
+      final String text) throws IOException {
+    if (TestMSPowerPointParser.DUMP_TO_FILE) {
+
+      final File file = new File(fileName + ".txt"); // relative path, not placed under SAMPLE_DIR
+
+      final FileOutputStream fos = new FileOutputStream(file);
+      final OutputStreamWriter osw = new OutputStreamWriter(fos, CHARSET);
+
+      osw.write(data.toString());
+      osw.write(text);
+
+      osw.close(); // NOTE(review): not in a finally block; the streams stay open if a write throws
+      fos.close();
+    }
+  }
+
+  /**
+   * Load the testfiles for comparison.
+   * 
+   * @param file
+   *          file to load
+   * @return content of the file read as UTF-8, each line followed by LINE_SEPARATOR.
+   * @throws IOException if the file cannot be opened or read
+   */
+  private String fileToString(final File file) throws IOException {
+    FileInputStream fis = null;
+    // the reader wrapping fis is created inline below, so only br and fis are tracked
+    BufferedReader br = null;
+    final StringBuffer buf = new StringBuffer();
+
+    try {
+      fis = new FileInputStream(file);
+      br = new BufferedReader(new InputStreamReader(fis, CHARSET));
+
+      String line = br.readLine();
+      while (line != null) {
+        buf.append(line).append(LINE_SEPARATOR); // original line endings are replaced by LINE_SEPARATOR
+        line = br.readLine();
+      }
+    } finally {
+      if (br != null) {
+        br.close();
+      }
+      if (fis != null) {
+        fis.close();
+      }
+    }
+
+    String val = buf.toString();
+
+    return val;
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html Fri Sep  2 08:55:47 2005
@@ -0,0 +1,44 @@
+<!--
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+-->
+<html>
+	<head>
+		<title>Testing Package of Microsoft PowerPoint document parsing plugin.</title>
+	</head>
+	<body>
+		<h1>JUnit Testpackage for Microsoft &copy; PowerPoint document parsing 
+			plugin.</h1>
+		<p>The example ppt-files are located in the subdirectory 
+			<code>src/plugin/parse-mspowerpoint/sample</code>. They are 
+			copied by the ant-task to the test data directory (system property 
+			<code>test.data</code>, default <code>build/parse-mspowerpoint/test/data</code>) for testing. 
+			In addition to the ppt-files, reference files with the same name 
+			and the postfix ".content" or ".meta" can be stored there. If they exist, 
+			the test classes check the extracted content against these 
+			files.</p>
+		<h3>Example</h3>
+		<p>
+			<ul>
+				<li>PowerPoint file to test: <code>test.ppt</code></li>
+				<li>Reference file with content: 
+					<code>test.ppt.content</code></li>
+				<li> Reference file with title and properties: 
+					<code>test.ppt.meta</code></li>
+			</ul>
+		</p>
+	</body>
+</html>
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html
------------------------------------------------------------------------------
    svn:eol-style = native