You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/09/02 17:56:39 UTC
svn commit: r267226 - in /lucene/nutch/trunk/src/plugin: ./
parse-mspowerpoint/ parse-mspowerpoint/lib/ parse-mspowerpoint/sample/
parse-mspowerpoint/src/ parse-mspowerpoint/src/java/
parse-mspowerpoint/src/java/org/ parse-mspowerpoint/src/java/org/apa...
Author: jerome
Date: Fri Sep 2 08:55:47 2005
New Revision: 267226
URL: http://svn.apache.org/viewcvs?rev=267226&view=rev
Log:
NUTCH-21, Added parser plugin for MS PowerPoint slides (Stephan Strittmatter)
Added:
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-2.5.1-final-20040804.jar (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-scratchpad-2.5.1-final-20040804.jar (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (with props)
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html (with props)
Modified:
lucene/nutch/trunk/src/plugin/build.xml
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=267226&r1=267225&r2=267226&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Sep 2 08:55:47 2005
@@ -17,6 +17,7 @@
<ant dir="parse-pdf" target="deploy"/>
<ant dir="parse-rss" target="deploy"/>
<ant dir="parse-msword" target="deploy"/>
+ <ant dir="parse-mspowerpoint" target="deploy"/>
<!-- <ant dir="parse-mp3" target="deploy"/> -->
<!-- <ant dir="parse-rtf" target="deploy"/> -->
<ant dir="parse-ext" target="deploy"/>
@@ -43,6 +44,7 @@
<ant dir="parse-pdf" target="test"/>
<ant dir="parse-rss" target="test"/>
<ant dir="parse-msword" target="test"/>
+ <ant dir="parse-mspowerpoint" target="test"/>
<!-- <ant dir="parse-mp3" target="test"/> -->
<!-- <ant dir="parse-rtf" target="test"/> -->
<ant dir="parse-ext" target="test"/>
@@ -66,6 +68,7 @@
<ant dir="parse-pdf" target="clean"/>
<ant dir="parse-rss" target="clean"/>
<ant dir="parse-msword" target="clean"/>
+ <ant dir="parse-mspowerpoint" target="clean"/>
<ant dir="parse-mp3" target="clean"/>
<ant dir="parse-rtf" target="clean"/>
<ant dir="parse-ext" target="clean"/>
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt Fri Sep 2 08:55:47 2005
@@ -0,0 +1,8 @@
+Plugin to support parsing of MS PowerPoint files.
+
+Contributed by Stephan Strittmatter <St...@sybit.de>.
+
+Note:
+======
+Parsing MS PowerPoint files requires the complete file stream.
+Please check the property <protocol>.content.limit in nutch-default.xml.
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/README.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml Fri Sep 2 08:55:47 2005
@@ -0,0 +1,42 @@
+<?xml version="1.0"?>
+
+<project name="parse-mspowerpoint" default="jar">
+
+ <import file="../build-plugin.xml" />
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data" />
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.ppt" />
+ <include name="*.content" />
+ <include name="*.meta" />
+ </fileset>
+ </copy>
+
+ <!-- ================================================================== -->
+ <!-- Run unit tests -->
+ <!-- ================================================================== -->
+ <target name="test" depends="compile-test, deploy" if="test.available">
+ <echo message="Testing plugin: ${name}"/>
+
+ <junit printsummary="yes" haltonfailure="no" fork="yes"
+ errorProperty="tests.failed" failureProperty="tests.failed">
+ <sysproperty key="test.data" value="${build.test}/data"/>
+ <sysproperty key="test.input" value="${root}/data"/>
+ <classpath refid="test.classpath"/>
+ <formatter type="plain" />
+ <batchtest todir="${build.test}" unless="testcase">
+ <fileset dir="${src.test}"
+ includes="**/AllTests.java" excludes="**/${test.exclude}.java" />
+ </batchtest>
+ <batchtest todir="${build.test}" if="testcase">
+ <fileset dir="${src.test}" includes="**/${testcase}.java"/>
+ </batchtest>
+ </junit>
+
+ <fail if="tests.failed">Tests failed!</fail>
+
+ </target>
+
+</project>
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-2.5.1-final-20040804.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-2.5.1-final-20040804.jar?rev=267226&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-2.5.1-final-20040804.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-scratchpad-2.5.1-final-20040804.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-scratchpad-2.5.1-final-20040804.jar?rev=267226&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/lib/poi-scratchpad-2.5.1-final-20040804.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml Fri Sep 2 08:55:47 2005
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-mspowerpoint"
+ name="MSPowerPoint Parse Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="parse-mspowerpoint.jar">
+ <export name="*"/>
+ </library>
+ <library name="poi-2.5.1-final-20040804.jar"/>
+ <library name="poi-scratchpad-2.5.1-final-20040804.jar"/>
+ </runtime>
+
+ <extension id="net.nutch.parse.mspowerpoint"
+ name="MSPowerPointParse"
+ point="org.apache.nutch.parse.Parser">
+ <implementation id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser"
+ class="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser"
+ contentType="application/vnd.ms-powerpoint"
+ pathSuffix=""/>
+ </extension>
+
+</plugin>
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt Fri Sep 2 08:55:47 2005
@@ -0,0 +1,6 @@
+Directory to store PowerPoint samples.
+If there is a file "test.ppt", the test case searches for a reference file "test.ppt.content"
+to compare the extracted content.
+Additionally, a file "test.ppt.meta" can be stored here to compare the meta information.
+
+-- Stephan Strittmatter
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/README.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt?rev=267226&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.content Fri Sep 2 08:55:47 2005
@@ -0,0 +1,19 @@
+
+Nutch Parser Test
+My initial test file for the PowerPoint parser of nutch
+Second page
+Test
+Of
+PowerPoint
+Extraction
+Some Unicode
+I do not know the content and I can not read it, just gathered from other ppt-files:
+Презентация PowerPoint
+
+Stephan Strittmatter │ï¾ï¿¨ï¾ï¿¨ï¾ï¿¨ï¾ Some Notes
+│ï¾ï¿¨ï¾Textmasterformate durch Klicken bearbeiten
+Zweite Ebene
+Dritte Ebene
+Vierte Ebene
+Fnfte Ebene
+│ï¾ï¿¨ï¾ Notes of second page
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/sample/test.ppt.meta Fri Sep 2 08:55:47 2005
@@ -0,0 +1,2 @@
+Title: Test
+Outlinks: 0
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,416 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Vector;
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.poi.hdf.extractor.Utils;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
+
+/**
+ * Listener to read the content of a PowerPoint file and transfer it to the
+ * passed <code>StringBuffer</code>.
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ *
+ */
+class ContentReaderListener implements POIFSReaderListener {
+
+ private static final Logger LOG = LogFormatter
+ .getLogger(ContentReaderListener.class.getName());
+
+ /** Buffer holding the content of the file */
+ protected final transient StringBuffer buf;
+
+ /**
+ * Constructs Listener to get content of PowerPoint file.
+ *
+ * @param content
+ * StringBuffer receiving the content of the PowerPoint file; the reference is stored, not copied.
+ */
+ public ContentReaderListener(final StringBuffer content) {
+ this.buf = content;
+ }
+
+ /**
+  * Reads the internal PowerPoint document stream and appends the text of all
+  * slides found in it to the buffer passed to the constructor.
+  * <p>
+  * Events that are <code>null</code>, unnamed, or whose name does not start
+  * with <code>PPTConstants.POWERPOINT_DOCUMENT</code> are logged and skipped.
+  * Any <code>Throwable</code> raised while extracting is logged and swallowed
+  * so that one unreadable file cannot abort the surrounding crawl.
+  *
+  * @param event
+  *          reader event carrying the name and the stream of the document
+  * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
+  */
+ public void processPOIFSReaderEvent(final POIFSReaderEvent event) {
+
+   // A null event is handled on its own: dereferencing it just to build the
+   // log message would throw a NullPointerException.
+   if (event == null) {
+     LOG.warning("Stream not processed. No reader event was passed.");
+     return;
+   }
+   if (event.getName() == null
+       || !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
+     LOG.warning("Stream not processed. It is not a PowerPoint document: : "
+         + event.getName());
+     return;
+   }
+
+   try {
+     final DocumentInputStream dis = event.getStream();
+     final byte pptdata[] = new byte[dis.available()];
+
+     // read() may deliver fewer bytes than requested, so keep reading until
+     // the array is full or the stream ends.
+     int filled = 0;
+     while (filled < pptdata.length) {
+       final int count = dis.read(pptdata, filled, pptdata.length - filled);
+       if (count <= 0) {
+         break;
+       }
+       filled += count;
+     }
+
+     int offset = 0;
+     long offsetPD = 0;
+
+     // Text boxes found so far, keyed by group id (Long -> TextBox).
+     Hashtable/* <Long, TextBox> */containerTextBox = new Hashtable/* <Long, TextBox> */();
+
+     // Look at every byte position for a UserEditAtom header. The last
+     // field read is a 4 byte value at i + 20, so the scan stops 24 bytes
+     // before the end instead of running past the array.
+     final int lastStart = pptdata.length - 24;
+     for (int i = 0; i <= lastStart; i++) {
+       final long type = LittleEndian.getUShort(pptdata, i + 2);
+       if (PPTConstants.PPT_ATOM_USEREDIT != type) {
+         continue;
+       }
+
+       // Remember the offsets of this UserEditAtom; the values of the last
+       // atom found are also used for the slide extraction below.
+       offset = (int) LittleEndian.getUInt(pptdata, i + 16);
+       offsetPD = LittleEndian.getUInt(pptdata, i + 20);
+
+       // Extract the ClientTextBox text belonging to this UserEditAtom.
+       containerTextBox = extractTextBoxes(containerTextBox, offset, pptdata,
+           offsetPD);
+     }
+
+     final List/* <Slide> */slides = extractSlides(offset, pptdata, offsetPD);
+
+     if (slides.size() == 0) {
+       LOG.info("No slides extracted!");
+       return;
+     }
+
+     // All text box content is attached to the last slide found.
+     final Slide lastSlide = (Slide) slides.get(slides.size() - 1);
+     for (Enumeration boxes = containerTextBox.elements(); boxes
+         .hasMoreElements();) {
+       final TextBox textBox = (TextBox) boxes.nextElement();
+       lastSlide.addContent(textBox.getContent());
+     }
+
+     // Append the text of every slide, in order, to the result buffer.
+     for (int i = 0; i < slides.size(); i++) {
+       final List scontent = ((Slide) slides.get(i)).getContent();
+
+       for (int j = 0; j < scontent.size(); j++) {
+         final String contentText = scontent.get(j).toString();
+         this.buf.append(contentText);
+
+         // A blank keeps the words of neighbouring entries apart unless the
+         // entry already ends with a line break.
+         if (contentText.length() > 0
+             && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) {
+           this.buf.append(" ");
+         }
+       }
+     }
+   } catch (Throwable ex) {
+     // Everything is caught on purpose so that a broken file does not kill
+     // the complete crawl. The warning makes the failure visible at the
+     // default log level; throwing() alone only logs at FINER.
+     LOG.warning("Could not extract content of PowerPoint document: " + ex);
+     LOG.throwing(this.getClass().getName(), "processPOIFSReaderEvent", ex);
+   }
+ }
+
+ /**
+ * Extracts the client text boxes of a slide.
+ * <p>
+ * Every position from <code>offset</code> up to <code>offsetPD - 20</code> is
+ * checked for a drawing group record. For each one that is not the master
+ * slide, the <code>TextBox</code> stored under its group id is reset to empty
+ * content, or a new one is created and stored. Text atoms following the
+ * record are appended to that text box only when the record size equals
+ * <code>offsetPD - 20</code>; otherwise the record is logged and skipped.
+ * Any <code>Throwable</code> raised inside either loop is logged and ends
+ * that loop.
+ *
+ * @param containerTextBox
+ * table of text boxes keyed by group id (<code>Long</code>); it is modified in place
+ * @param offset
+ * position in <code>pptdata</code> at which the scan starts
+ * @param pptdata
+ * bytes of the PowerPoint document stream
+ * @param offsetPD
+ * position of the persist directory; the scan ends 20 bytes before it
+ * @return Hashtable the same instance that was passed as <code>containerTextBox</code>
+ * @see TextBox
+ */
+ protected Hashtable/* <Long, TextBox> */extractTextBoxes(
+ final Hashtable/* <Long, TextBox> */containerTextBox, final int offset,
+ final byte[] pptdata, final long offsetPD) {
+
+ // To hold temporary data
+ FilteredStringWriter outStream = new FilteredStringWriter();
+
+ TextBox textBox;
+
+ // Traversing the byte array up to the persist directory position
+ for (int i = offset; i < offsetPD - 20; i++) {
+ try {
+ // Record info
+ // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
+ // Record Type
+ final long recordType = LittleEndian.getUShort(pptdata, i + 2);
+ // Record Size
+ final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
+
+ if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+ /*
+ * Record type is of Drawing Group
+ */
+
+ // Total number of objects
+ // final long objectCount = LittleEndian.getUInt(pptdata, (int) i +
+ // 8);
+ // currentID = Group ID+number of objects; the statement after the read truncates it to a multiple of 1024
+ long currentID = LittleEndian.getInt(pptdata, i + 12);
+ currentID = ((int) (currentID / 1024)) * 1024;
+
+ if (currentID == PPTConstants.PPT_MASTERSLIDE) {
+ // Ignore Master Slide objects (i++ plus the loop increment moves on by two positions)
+ LOG.finest("Ignore master slide.");
+ i++;
+ continue;
+ }
+
+ // Check for the ClientTextBox GroupID existence
+ if (containerTextBox.containsKey(new Long(currentID))) {
+ // If exists get Client Textbox Group and discard its previous content
+ textBox = (TextBox) containerTextBox.get(new Long(currentID));
+ textBox.setContent("");
+
+ } else {
+ textBox = new TextBox(currentID);
+ containerTextBox.put(new Long(currentID), textBox);
+ }
+
+ /*
+ * Iterating the bytearray for TextCharAtoms and TextBytesAtom
+ * (only done when the record size equals offsetPD - 20)
+ */
+ if ((offsetPD - 20) != recordSize) {
+ // TODO something wrong? Probably an OLE-Object, which we ignore.
+ LOG.finer("offsetPD - 20=" + (offsetPD - 20) + " recordsize="
+ + recordSize);
+ } else {
+ for (int startPos = i + 8; startPos < offsetPD - 20
+ && startPos < recordSize; startPos++) { // && startPos <
+ // recordSize??
+ try {
+
+ // Record info
+ // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j);
+
+ // Record Type
+ final long ntype = LittleEndian
+ .getUShort(pptdata, startPos + 2);
+
+ // Record size
+ // Note that the size doesn't include the 8 byte atom header
+ final long nsize = LittleEndian.getUInt(pptdata, startPos + 4);
+
+ if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+ /*
+ * Break the loop if next GroupID found; the outer loop then
+ * continues at startPos after its own increment
+ */
+ i = startPos - 1;
+ break;
+ } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
+ // TextByteAtom record
+ outStream = new FilteredStringWriter();
+ long ii = 0;
+ for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) {
+ // For loop to changed to a function; passes bytes startPos + 8 .. startPos + 8 + nsize (inclusive) to the filter
+ // if ((ii + 2) >= pptdata.length)
+ // break; // FIXME
+ outStream.write((char) (pptdata[(int) ii + 2]));
+ }
+
+ // Setting the identified text for Current
+ // groupID
+ textBox.setContent(textBox.getContent()
+ + outStream.toString());
+
+ } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
+ // TextCharAtom record (the bytes are round-tripped through a String using the platform default charset)
+
+ final String strTempContent = new String(pptdata,
+ (int) startPos + 6, (int) (nsize) + 2);
+ final byte bytes[] = strTempContent.getBytes();
+ if (true) {
+ // FIXME my version. NOTE(review): pptdata[ii + 2] is indexed from the start of the array, not from startPos, and bytes is only used for its length - confirm this is intended
+ outStream = new FilteredStringWriter();
+ for (int ii = 0; ii < bytes.length - 1; ii += 2) {
+ // For loop to changed to a function
+ outStream.write((char) (pptdata[ii + 2]));
+ }
+ textBox.setContent(textBox.getContent()
+ + outStream.toString());
+ } else {
+ // this version is used within POI (never executed while the condition above is the constant true)
+ String text = StringUtil.getFromCompressedUnicode(bytes, 0,
+ bytes.length);
+ textBox.setContent(textBox.getContent() + text);
+ }
+
+ } else {
+ // ignored
+ // LOG.finest("Ignored atom type: " + type);
+ }
+ } catch (Throwable e) {
+ LOG.throwing(this.getClass().getName(), "extractTextBoxes", e);
+ break;
+ }
+ }
+ }
+ } else {
+ /*
+ * Record type is ignored
+ */
+ // LOG.finest("Ignored record type: " + type);
+ }
+ } catch (Throwable ee) {
+ LOG.throwing(this.getClass().getName(), "extractClientTextBoxes", ee);
+ break;
+ }
+ }
+ return containerTextBox;
+ }
+
+ /**
+ * Returns the PowerPoint <code>Slide</code>s of the document as a list.
+ * <p>
+ * Every position from <code>offset</code> up to 20 bytes before the end of
+ * <code>pptdata</code> is checked for an atom header. The first slide persist
+ * atom found creates no slide; each later one creates a <code>Slide</code>
+ * and makes it the current slide. Text from text byte and text char atoms is
+ * added to the current slide and dropped while there is none. The scan ends
+ * at the first drawing group record.
+ *
+ * @param offset
+ * position in <code>pptdata</code> at which the scan starts
+ * @param pptdata
+ * bytes of the PowerPoint document stream
+ * @param offsetPD
+ * not used by this method
+ * @return list (a <code>Vector</code>) of the slides in the order found. Contains
+ * <code>{@link Slide Slide}</code>
+ * @see Slide
+ */
+ protected List /* <Slide> */extractSlides(final long offset,
+ final byte[] pptdata, final long offsetPD) {
+
+ int sNum = 0;
+
+ // List of all slides found
+ final List/* <Slide> */slides = new Vector/* <Slide> */();
+
+ // current slide data; stays null until the second slide persist atom is seen
+ Slide currentSlide = null;
+
+ // To store data found in TextCharAtoms and TextBytesAtoms
+ FilteredStringWriter outStream;
+
+ for (long i = offset; i < pptdata.length - 20; i++) {
+
+ final long recordInfo = LittleEndian.getUShort(pptdata, (int) i);
+ final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
+ final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
+
+ if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
+ /*
+ * TextByteAtom record: bytes i + 8 .. i + 8 + atomSize (inclusive,
+ * limited to the array length) are passed to the filter; each byte
+ * reaches write(int) as a signed value
+ */
+ outStream = new FilteredStringWriter();
+
+ for (long ii = i + 6; (ii <= i + 6 + atomSize)
+ && (ii + 2 < pptdata.length); ii++) {
+ try {
+ // if(ii+2 >= pptdata.length) break; //FIXME
+ byte value = pptdata[(int) ii + 2];
+ outStream.write(value);
+ } catch (ArrayIndexOutOfBoundsException ex) {
+ LOG.finest("size=" + pptdata.length);
+ LOG.throwing(this.getClass().getName(), "extractSlides", ex);
+ }
+ }
+
+ // Setting the identified text for Current Slide
+ if (currentSlide != null) {
+ currentSlide.addContent(outStream.toString());
+ }
+
+ } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
+ /*
+ * TextCharAtom record: the bytes are round-tripped through a String
+ * using the platform default charset and then read two at a time
+ */
+ outStream = new FilteredStringWriter();
+ final String strTempContent = new String(pptdata, (int) i + 6,
+ (int) (atomSize) + 2);
+ final byte bytes[] = strTempContent.getBytes();
+
+ for (int ii = 0; ii < bytes.length - 1; ii += 2) {
+ outStream.write(Utils.getUnicodeCharacter(bytes, ii));
+ }
+
+ // Setting the identified text for Current Slide
+ if (currentSlide != null) {
+ currentSlide.addContent(outStream.toString());
+ }
+
+ } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
+ /*
+ * SlidePersistAtom Record; the first one found (sNum == 0) creates no
+ * slide. NOTE(review): i can be as large as pptdata.length - 21 while
+ * the slide id is read at i + 20 - confirm this cannot run past the
+ * array. The writer created in this branch is never written to.
+ */
+ if (sNum != 0) {
+ outStream = new FilteredStringWriter();
+
+ final long slideID = LittleEndian.getUInt(pptdata, (int) i + 20);
+
+ currentSlide = new Slide(slideID);
+ // currentSlide.addContent(outStream.toString());
+ slides.add(currentSlide);
+ }
+ sNum++;
+ } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+ /*
+ * Diagram records are ignored; the scan ends at the first one
+ */
+ LOG.finest("Drawing Groups are ignored.");
+ break;
+ } else {
+ // ignored
+ // LOG.finest("Unhandled atomType: " + atomType);
+ }
+ }
+
+ return slides;
+ }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.StringWriter;
+
+/**
+ * Writes to optimize ASCII output. Not needed chars are filtered (ignored).
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @create 19.01.2005
+ */
+public class FilteredStringWriter extends StringWriter {
+
+ /**
+ * @see StringWriter#StringWriter()
+ */
+ public FilteredStringWriter() {
+ super();
+ }
+
+ /**
+ * @param initialSize
+ * @see StringWriter#StringWriter(int)
+ */
+ public FilteredStringWriter(final int initialSize) {
+ super(initialSize);
+ }
+
+ /**
+ * Chars which are not useful for Nutch indexing are filtered (ignored) on
+ * writing to the writer.
+ *
+ * @see java.io.Writer#write(int)
+ */
+ public void write(final int ch) {
+ if (ch == '\r') {
+ // PowerPoint seems to store files with \r as the line break
+ // -> unify to platform specific format
+ super.write(System.getProperty("line.separator"));
+ } else if (ch == 0) {
+ super.write(System.getProperty("line.separator"));
+ } else if (ch == '\b') {
+ // ignore it
+ } else if (Character.isISOControl((char) ch)) {
+ // replace by blank
+ // super.write(' ');
+ } else if (Character.isWhitespace((char) ch)) {
+ // unify to blank
+ super.write(' ');
+ } else {
+ super.write(ch);
+ }
+ }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,189 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+
+/**
+ * Nutch-Parser for parsing MS PowerPoint slides ( mime type:
+ * application/vnd.ms-powerpoint).
+ * <p>
+ * It is based on org.apache.poi.*.
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @see <a href="http://jakarta.apache.org/poi">Jakarta POI</a>
+ * @version 1.0
+ */
+public class MSPowerPointParser implements Parser {
+
+  /** associated Mime type for PowerPoint files (application/vnd.ms-powerpoint) */
+  public static final String MIME_TYPE = "application/vnd.ms-powerpoint";
+
+  private static final Logger LOG = LogFormatter
+      .getLogger(MSPowerPointParser.class.getName());
+
+  /**
+   * Creates a parser. The parser keeps no per-instance state.
+   */
+  public MSPowerPointParser() {
+  }
+
+  /**
+   * Creates a parser.
+   *
+   * @param fileName
+   *          currently ignored; the document to parse is always handed to
+   *          {@link #getParse(Content)}
+   */
+  public MSPowerPointParser(String fileName) {
+
+  }
+
+  /**
+   * Main for testing. Pass a ppt-file as argument; its extracted text is
+   * printed to standard out.
+   *
+   * @param args
+   *          command line arguments, <code>args[0]</code> is the file to parse
+   */
+  public static void main(String args[]) {
+    if (args.length < 1) {
+      System.err.println("Useage:");
+      System.err.println("\tMSPowerPointParser <file>");
+      System.exit(1);
+    }
+
+    final String file = args[0];
+    final byte[] raw = getRawBytes(new File(file));
+
+    if (raw == null) {
+      // getRawBytes() reports a missing or unreadable file with null
+      System.err.println("Unable to read file: " + file);
+      System.exit(1);
+    }
+
+    final Properties prop = new Properties();
+    prop.setProperty("Content-Length", "" + raw.length);
+
+    final Content content = new Content(file, file, raw, MIME_TYPE, prop);
+    final MSPowerPointParser ppe = new MSPowerPointParser();
+
+    System.out.println(ppe.getParse(content).getText());
+  }
+
+  /**
+   * Parses the MS PowerPoint file.
+   * <p>
+   * An empty parse with a failure status is returned if the content type is
+   * set but does not start with {@link #MIME_TYPE}, if the number of content
+   * bytes differs from the <code>Content-Length</code> meta data, or if the
+   * extraction throws an exception.
+   *
+   * @param content
+   *          the fetched content to parse
+   * @return the parse holding text, title, outlinks and meta data, or an empty
+   *         parse carrying the failure status
+   * @see org.apache.nutch.parse.Parser#getParse(Content)
+   */
+  public Parse getParse(final Content content) {
+
+    // check that contentType is one we can handle
+    final String contentType = content.getContentType();
+
+    if (contentType != null && !contentType.startsWith(MIME_TYPE)) {
+      return new ParseStatus(ParseStatus.FAILED,
+          ParseStatus.FAILED_INVALID_FORMAT, "Content-Type is not ["
+              + MIME_TYPE + "] was: " + contentType).getEmptyParse();
+    }
+
+    String plainText = null;
+    String title = null;
+    Outlink[] outlinks = null;
+    Properties properties = null;
+
+    try {
+      final String contentLen = content.get("Content-Length");
+      final byte[] raw = content.getContent();
+
+      // a length mismatch means the fetcher cut the document off
+      if (contentLen != null && raw.length != Integer.parseInt(contentLen)) {
+        return new ParseStatus(
+            ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED,
+            "Content truncated at "
+                + raw.length
+                + " bytes. Please increase <protocol>.content.limit at nutch-default.xml. "
+                + "Parser can't handle incomplete PowerPoint files.")
+            .getEmptyParse();
+      }
+
+      final PPTExtractor extractor = new PPTExtractor(new ByteArrayInputStream(
+          raw));
+
+      plainText = extractor.getText();
+      properties = extractor.getProperties();
+      outlinks = this.getOutlinks(plainText, content.getUrl());
+
+    } catch (Exception e) {
+      LOG.throwing(this.getClass().getName(), "getParse", e);
+      return new ParseStatus(e).getEmptyParse();
+    }
+
+    // collect meta data
+    final Properties metadata = new Properties();
+    metadata.putAll(content.getMetadata()); // copy through
+
+    if (properties != null) {
+      // the title is reported separately, all other properties are meta data
+      title = properties.getProperty("Title");
+      properties.remove("Title");
+      metadata.putAll(properties);
+    }
+
+    if (plainText == null) {
+      plainText = "";
+    }
+
+    if (title == null) {
+      title = "";
+    }
+
+    final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+    final ParseData parseData = new ParseData(status, title, outlinks, metadata);
+
+    LOG.finest("PowerPoint file parsed sucessful.");
+    return new ParseImpl(plainText, parseData);
+  }
+
+  /**
+   * Collect outlinks of document.
+   *
+   * @param plainText
+   *          the extracted text that is searched for URLs
+   * @param anchor
+   *          the anchor text attached to every extracted outlink
+   *
+   * @return Array of links within the PowerPoint file
+   */
+  protected Outlink[] getOutlinks(String plainText, String anchor) {
+    return OutlinkExtractor.getOutlinks(plainText, anchor);
+  }
+
+  /**
+   * Reads the complete file into memory.
+   *
+   * @param f
+   *          the file to read
+   * @return the bytes of the file, or <code>null</code> if the file does not
+   *         exist or could not be read
+   */
+  private final static byte[] getRawBytes(File f) {
+    FileInputStream fin = null;
+    try {
+      if (!f.exists()) {
+        return null;
+      }
+      fin = new FileInputStream(f);
+      final byte[] buffer = new byte[(int) f.length()];
+
+      // a single read() may deliver fewer bytes than requested, so keep
+      // reading until the buffer is full or the stream ends
+      int offset = 0;
+      while (offset < buffer.length) {
+        final int count = fin.read(buffer, offset, buffer.length - offset);
+        if (count < 0) {
+          break;
+        }
+        offset += count;
+      }
+      return buffer;
+    } catch (Exception err) {
+      err.printStackTrace();
+      return null;
+    } finally {
+      // release the file handle on every path, including failures
+      if (fin != null) {
+        try {
+          fin.close();
+        } catch (Exception ignored) {
+          // nothing sensible can be done if closing fails
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,228 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+
+/**
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s
+ * / URLs from plain text using Regular Expressions.
+ *
+ * @see <a
+ * href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
+ * of different regexp-Implementations </a>
+ * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
+ * </a>
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @since 0.7
+ */
+public class OutlinkExtractor {
+  private static final Logger LOG = LogFormatter
+      .getLogger(OutlinkExtractor.class.getName());
+
+  /**
+   * Regex pattern to get URLs within a plain text.
+   *
+   * @see <a
+   *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+   *      </a>
+   */
+  private static final String URL_PATTERN =
+      "([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?)";
+
+  /**
+   * Compiled form of {@link #URL_PATTERN}. It is compiled once with
+   * <code>READ_ONLY_MASK</code> so that it can be reused by every call; it is
+   * <code>null</code> if compilation failed.
+   */
+  private static final Pattern COMPILED_URL_PATTERN = compileUrlPattern();
+
+  /**
+   * Compiles {@link #URL_PATTERN}.
+   *
+   * @return the compiled pattern, or <code>null</code> if it could not be
+   *         compiled (the failure is logged)
+   */
+  private static Pattern compileUrlPattern() {
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      return cp.compile(URL_PATTERN, Perl5Compiler.CASE_INSENSITIVE_MASK
+          | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK);
+    } catch (Exception ex) {
+      LOG.throwing(OutlinkExtractor.class.getName(), "compileUrlPattern", ex);
+      return null;
+    }
+  }
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text.
+   *
+   * @param plainText
+   *          the plain text from which URLs should be extracted.
+   *
+   * @return Array of <code>Outlink</code>s found in plainText, never
+   *         <code>null</code>
+   */
+  public static Outlink[] getOutlinks(final String plainText) {
+    return OutlinkExtractor.getOutlinks(plainText, "");
+  }
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text and adds anchor to the
+   * extracted <code>Outlink</code>s.
+   * <p>
+   * A URL for which no <code>Outlink</code> can be created is logged and
+   * skipped; the remaining matches are still extracted.
+   *
+   * @param plainText
+   *          the plain text from which URLs should be extracted.
+   * @param anchor
+   *          the anchor of the url
+   *
+   * @return Array of <code>Outlink</code>s found in plainText, never
+   *         <code>null</code>
+   */
+  public static Outlink[] getOutlinks(final String plainText, String anchor) {
+
+    final List outlinks = new ArrayList();
+
+    if (plainText != null && COMPILED_URL_PATTERN != null) {
+      try {
+        // the matcher carries match state, so every call uses its own
+        final PatternMatcher matcher = new Perl5Matcher();
+        final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+        // loop the matches
+        while (matcher.contains(input, COMPILED_URL_PATTERN)) {
+          final MatchResult result = matcher.getMatch();
+          final String url = result.group(0);
+          try {
+            outlinks.add(new Outlink(url, anchor));
+          } catch (Exception ex) {
+            // if it is a malformed URL we just throw it away and continue
+            // with extraction.
+            LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
+          }
+        }
+      } catch (Exception ex) {
+        // unexpected matcher failure: keep what has been collected so far
+        LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
+      }
+    }
+
+    // create array of the Outlinks
+    return (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+/**
+ * Package protected class for the required internal MS PowerPoint constants.
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+class PPTConstants {
+
+  /** ID of master slide */
+  public static final long PPT_MASTERSLIDE = 1024L;
+
+  /** ATOM ID of slide */
+  public static final long PPT_ATOM_SLIDE = 1007L;
+
+  /** ATOM ID of notes */
+  public static final long PPT_ATOM_NOTES = 1009L;
+
+  /** ATOM ID of persistent slide */
+  public static final long PPT_ATOM_SLIDEPERSISTANT = 1011L;
+
+  /** ATOM ID of text char area. Holds text in byte swapped unicode form. */
+  public static final long PPT_ATOM_TEXTCHAR = 4000L;
+
+  /** ATOM ID of text byte area. Holds text in ascii form */
+  public static final long PPT_ATOM_TEXTBYTE = 4008L;
+
+  /** ATOM ID of user edit area */
+  public static final long PPT_ATOM_USEREDIT = 4085L;
+
+  /** ATOM ID of drawing group area */
+  public static final long PPT_ATOM_DRAWINGGROUP = 61448L;
+
+  /** Name for PowerPoint Documents within the file */
+  public static final String POWERPOINT_DOCUMENT = "PowerPoint Document";
+
+  /**
+   * Protected constructor to prevent instantiation.
+   */
+  protected PPTConstants() {
+    // nothing
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,153 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+/**
+ * Converts the Powerpoint document content to plain text.
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+
+public class PPTExtractor {
+
+  private static final Logger LOG = LogFormatter.getLogger(PPTExtractor.class
+      .getName());
+
+  /** Parsed plain Powerpoint Text */
+  private final transient StringBuffer contentBuf;
+
+  /** Hands the document properties over from the reader listener */
+  private final PropertiesBroker propertiesBroker;
+
+  /** Event based POI reader that feeds the registered listeners */
+  private final POIFSReader poireader;
+
+  /**
+   * Constructor that takes a PowerPoint file as <code>InputStream</code> to
+   * parse it.
+   *
+   * @param in
+   *          <code>InputStream</code> containing the PowerPoint file
+   * @throws PowerPointDocumentException
+   *           thrown if parsing failed
+   */
+  public PPTExtractor(final InputStream in) throws PowerPointDocumentException {
+    this.poireader = new POIFSReader();
+    this.propertiesBroker = new PropertiesBroker();
+    this.contentBuf = new StringBuffer();
+
+    this.init(in);
+  }
+
+  /**
+   * Get the PowerPoint content text as plain text
+   *
+   * @return String the content text
+   */
+  public String getText() {
+    return this.contentBuf.toString();
+  }
+
+  /**
+   * Get the <code>Properties</code> of the PowerPoint document.
+   *
+   * @return the properties of the document, or <code>null</code> if none
+   *         were delivered within the broker's timeout
+   */
+  public Properties getProperties() {
+    return this.propertiesBroker.getProperties();
+  }
+
+  /**
+   * Registers the listeners for the summary information and the document
+   * content and then lets the reader process the stream.
+   *
+   * @param input
+   *          stream containing the PowerPoint file
+   * @throws PowerPointDocumentException
+   *           if reading the stream fails
+   */
+  private void init(final InputStream input) throws PowerPointDocumentException {
+    // register listener for SummaryInformation
+    this.poireader.registerListener(new PropertiesReaderListener(
+        this.propertiesBroker), SummaryInformation.DEFAULT_STREAM_NAME);
+
+    // register listener for PPT-document content
+    this.poireader.registerListener(new ContentReaderListener(this.contentBuf),
+        PPTConstants.POWERPOINT_DOCUMENT);
+
+    try {
+      // rewind only where the stream supports it; reset() on a stream
+      // without mark support fails with an IOException
+      if (input.markSupported()) {
+        input.reset();
+      }
+
+      final int available = input.available();
+      if (available > 0) {
+        this.poireader.read(input);
+      } else {
+        LOG.warning("Input <=0 :" + available);
+      }
+    } catch (IOException e) {
+      throw new PowerPointDocumentException(e);
+    }
+  }
+
+  /**
+   * The PropertiesBroker passes the properties collected by the reader
+   * listener on to the extractor.
+   *
+   * @author Stephan Strittmatter
+   * @version 1.0
+   */
+  static class PropertiesBroker {
+
+    /** Maximum time in milliseconds to wait for the properties */
+    private final static int TIMEOUT = 2 * 1000;
+
+    private Properties properties = null;
+
+    /**
+     * Get the collected properties. Waits up to {@link #TIMEOUT} milliseconds
+     * for them to be set; an interrupt ends the wait early and leaves the
+     * interrupt flag of the calling thread set.
+     * <p>
+     * NOTE(review): if the reader has already finished without delivering
+     * properties, this call still waits the full timeout - confirm whether
+     * that delay is acceptable.
+     *
+     * @return properties of the PowerPoint file, or <code>null</code> if
+     *         they were not set in time
+     */
+    public synchronized Properties getProperties() {
+
+      final long start = System.currentTimeMillis();
+      long now = start;
+
+      while (this.properties == null && now - start < TIMEOUT) {
+        try {
+          wait(TIMEOUT / 10);
+        } catch (InterruptedException e) {
+          // keep the interrupt visible to the caller and stop waiting
+          Thread.currentThread().interrupt();
+          break;
+        }
+        now = System.currentTimeMillis();
+      }
+
+      return this.properties;
+    }
+
+    /**
+     * Stores the properties and wakes up all threads waiting for them.
+     *
+     * @param properties
+     *          the properties of the PowerPoint file
+     */
+    public synchronized void setProperties(Properties properties) {
+      this.properties = properties;
+      notifyAll();
+    }
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.IOException;
+
+/**
+ * Exception class used for catching the runtime exceptions for the Powerpoint
+ * slides.
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+
+public class PowerPointDocumentException extends Exception {
+
+  /** Version id used for serialization */
+  private static final long serialVersionUID = 3256438093031487028L;
+
+  /**
+   * Creates the exception as a wrapper around another exception.
+   *
+   * @param e
+   *          the exception that caused this one
+   */
+  public PowerPointDocumentException(Exception e) {
+    super(e);
+  }
+
+  /**
+   * Creates the exception with a detail message only.
+   *
+   * @param message
+   *          the detail message
+   */
+  public PowerPointDocumentException(String message) {
+    super(message);
+  }
+
+  /**
+   * Creates the exception with a detail message and its cause.
+   *
+   * @param message
+   *          the detail message
+   * @param cause
+   *          the throwable that caused this exception
+   */
+  public PowerPointDocumentException(String message, Throwable cause) {
+    super(message, cause);
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,130 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Properties;
+import java.util.TimeZone;
+import java.util.logging.Logger;
+
+import org.apache.nutch.parse.mspowerpoint.PPTExtractor.PropertiesBroker;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+
+/**
+ * Listener for retrieving the properties of document.
+ *
+ * @author Stephan Strittmatter
+ *
+ * @version 1.0
+ */
+class PropertiesReaderListener implements POIFSReaderListener {
+  private static final Logger LOG = LogFormatter
+      .getLogger(PropertiesReaderListener.class.getName());
+
+  /**
+   * Time zone used for formatting dates. The former misspelled id "GTM" only
+   * worked because <code>TimeZone.getTimeZone</code> falls back to GMT for
+   * ids it does not understand.
+   */
+  private static final String TIME_ZONE_ID = "GMT";
+
+  private final transient PropertiesBroker propertiesBroker;
+
+  /** DateFormatter for converting dates to strings. */
+  private final transient SimpleDateFormat dateFormatter = new SimpleDateFormat();
+
+  /** Properties of the powerpoint Document */
+  private final transient Properties properties;
+
+  /**
+   * Listener for retrieving the properties of document.
+   *
+   * @param propertiesBroker
+   *          receives the collected properties once an event was processed
+   */
+  public PropertiesReaderListener(final PropertiesBroker propertiesBroker) {
+    this.propertiesBroker = propertiesBroker;
+    this.dateFormatter.setTimeZone(TimeZone.getTimeZone(TIME_ZONE_ID));
+    this.properties = new Properties();
+  }
+
+  /**
+   * Process the properties of the document and adds them to property object.
+   * The properties collected so far are handed to the broker in every case,
+   * also when the stream was not the summary information or reading it
+   * failed.
+   *
+   * @param event
+   *          contains the document to be parsed
+   */
+  public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+
+    if (event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
+
+      try {
+        final SummaryInformation sInfo = (SummaryInformation) PropertySetFactory
+            .create(event.getStream());
+
+        addProperty("Title", sInfo.getTitle());
+        addProperty("Subject", sInfo.getSubject());
+        addProperty("Keywords", sInfo.getKeywords());
+        addProperty("Comments", sInfo.getComments());
+        addProperty("Author", sInfo.getAuthor());
+        addProperty("Last-Author", sInfo.getLastAuthor());
+
+        // Deliberately not collected: the saved date is already provided by
+        // nutch, and word count, page count, character count, revision
+        // number, creation date, edit time, last printed date, template,
+        // security and application name are not required for
+        // indexing/searching.
+      } catch (Exception ex) {
+        LOG.throwing(this.getClass().getName(), "processPOIFSReaderEvent", ex);
+      }
+
+    } else {
+      LOG.warning("Wrong stream not processed: " + event.getName());
+    }
+
+    this.propertiesBroker.setProperties(this.properties);
+  }
+
+  /**
+   * Adds a numeric property; a value of 0 is skipped.
+   *
+   * @param name
+   *          name of the property
+   * @param value
+   *          value of the property
+   */
+  protected void addProperty(final String name, final long value) {
+    if (value != 0) {
+      this.properties.setProperty(name, String.valueOf(value));
+    }
+  }
+
+  /**
+   * Adds a text property; a <code>null</code> value is skipped.
+   *
+   * @param name
+   *          name of the property
+   * @param value
+   *          value of the property
+   */
+  protected void addProperty(final String name, final String value) {
+    if (value != null) {
+      this.properties.setProperty(name, value);
+    }
+  }
+
+  /**
+   * Adds a date property formatted with the date formatter of this listener;
+   * a <code>null</code> value is skipped.
+   *
+   * @param name
+   *          name of the property
+   * @param value
+   *          value of the property
+   */
+  protected void addProperty(final String name, final Date value) {
+    if (value != null) {
+      this.properties.setProperty(name, this.dateFormatter.format(value));
+    }
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,73 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.util.List;
+import java.util.Vector;
+
+/**
+ * Package protected class for a MS Powerpoint slide.
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+class Slide {
+
+  /** Number of this slide */
+  protected transient final long slideNumber;
+
+  /** Text contents of this slide in the order they were added */
+  protected transient final List/* <String> */contents = new Vector/* <String> */();
+
+  /**
+   * Creates an empty slide with the given number.
+   *
+   * @param number
+   *          the number of the slide
+   */
+  public Slide(long number) {
+    this.slideNumber = number;
+  }
+
+  /**
+   * Appends a piece of content to this slide.
+   *
+   * @param content
+   *          the content to append
+   */
+  public void addContent(String content) {
+    this.contents.add(content);
+  }
+
+  /**
+   * Returns the contents of this slide. The returned list is the list used
+   * internally, not a copy.
+   *
+   * @return list of the contents added so far
+   */
+  public List getContent() {
+    return this.contents;
+  }
+
+  /**
+   * Returns the number of this slide.
+   *
+   * @return the slide number
+   */
+  public long getSlideNumber() {
+    return this.slideNumber;
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/Slide.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.mspowerpoint;
+
+/**
+ * Package protected class for the MS Powerpoint TextBox content
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+class TextBox {
+
+  /** Identifier of this text box; fixed at construction time. */
+  protected transient final long currentID;
+
+  /** Text currently held by this text box. */
+  protected String content;
+
+  /**
+   * Creates a text box with the given id and an empty string as content.
+   *
+   * @param textBoxId
+   *          id of text box
+   */
+  public TextBox(final long textBoxId) {
+    this(textBoxId, "");
+  }
+
+  /**
+   * Creates a text box with the given id and content.
+   *
+   * @param textBoxId
+   *          id of text box
+   * @param content
+   *          content of text box
+   */
+  public TextBox(final long textBoxId, final String content) {
+    this.currentID = textBoxId;
+    this.content = content;
+  }
+
+  /**
+   * Returns the id this text box was created with.
+   *
+   * @return id of text box
+   */
+  public long getCurrentId() {
+    return currentID;
+  }
+
+  /**
+   * Returns the content currently stored in this text box.
+   *
+   * @return content of text box
+   */
+  public String getContent() {
+    return content;
+  }
+
+  /**
+   * Replaces the content of this text box.
+   *
+   * @param content
+   *          new content of text box
+   */
+  public void setContent(final String content) {
+    this.content = content;
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/TextBox.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html Fri Sep 2 08:55:47 2005
@@ -0,0 +1,35 @@
+<!--
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+-->
+<html>
+ <head>
+ <title>Microsoft PowerPoint document parsing plugin.</title>
+ </head>
+ <body>
+ <p>A Microsoft © PowerPoint document parsing plugin.</p>
+ <p>This package relies on <a
+ href="http://www.apache.org/poi/index.html">POI</a>.</p>
+ <p> Implementation based on sources found at <a
+ href="http://groups.google.com/groups?selm=a4f8800541bc694d5af7dabb35e83b72%40localhost.talkaboutsoftware.com">Google
+ Groups </a>. It can also be found at <a
+ href="http://www.mail-archive.com/poi-user@jakarta.apache.org/msg04809.html">http://www.mail-archive.com/poi-user@jakarta.apache.org/msg04809.html</a>
+ written by Hari Shanker and Sudhakar Chavali. Thanks for the basic
+ work!</p>
+ <p>I changed these classes to also support Unicode content and
+ optimized them for Nutch.</p>
+ </body>
+</html>
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.File;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+public class AllTests {
+
+  /** This system property is defined in ./src/plugin/build-plugin.xml */
+  private final static String SAMPLE_DIR = System.getProperty("test.data",
+      "build/parse-mspowerpoint/test/data");
+
+  /**
+   * Main to run the test
+   *
+   * @param args
+   *          not required
+   */
+  public static void main(String[] args) {
+    junit.textui.TestRunner.run(AllTests.suite());
+  }
+
+  /**
+   * Builds a suite holding one nested suite per ppt-file found in
+   * <code>SAMPLE_DIR</code>. Each nested suite runs
+   * <code>testContent</code> first and <code>testMeta</code> second, each on
+   * its own <code>TestMSPowerPointParser</code> instance.
+   *
+   * @return Test for the PowerPoint plugin
+   * @throws IllegalArgumentException
+   *           if <code>SAMPLE_DIR</code> cannot be listed
+   */
+  public static Test suite() {
+    final TestSuite suite = new TestSuite(
+        "Test for org.apache.nutch.parse.mspowerpoint");
+
+    System.out.println("Testing with ppt-files of dir: " + SAMPLE_DIR);
+
+    final File sampleDir = new File(SAMPLE_DIR);
+
+    // find all ppt-files in the test-directory
+    final FileExtensionFilter pptFilter = new FileExtensionFilter(".ppt");
+    final String[] pptFiles = sampleDir.list(pptFilter);
+
+    // File.list() yields null when the path is not a directory or cannot be
+    // read; a directory without matching files yields an empty array instead.
+    if (pptFiles == null) {
+      throw new IllegalArgumentException(SAMPLE_DIR
+          + " is not a readable directory, cannot look for files: "
+          + pptFilter);
+    }
+
+    // iterate over all ppt-files which are found and test against them
+    for (int i = 0; i < pptFiles.length; i++) {
+      // list() returns bare names, so resolve them against the sample dir
+      final File pptFile = new File(sampleDir, pptFiles[i]);
+      final TestSuite suiteForFile = new TestSuite("Testing file ["
+          + pptFiles[i] + "]");
+
+      // test the content...
+      suiteForFile.addTest(createTest(pptFile, "testContent"));
+      // ..then the properties
+      suiteForFile.addTest(createTest(pptFile, "testMeta"));
+
+      suite.addTest(suiteForFile);
+    }
+
+    return suite;
+  }
+
+  /**
+   * Creates a fresh parser test case for the given file which runs the named
+   * test method.
+   *
+   * @param pptFile
+   *          ppt-file to test
+   * @param methodName
+   *          name of the test method to run
+   * @return the configured test case
+   */
+  private static TestCase createTest(final File pptFile,
+      final String methodName) {
+    final TestCase test = new TestMSPowerPointParser(pptFile);
+    test.setName(methodName);
+    return test;
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/AllTests.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,55 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.File;
+import java.io.FilenameFilter;
+
+/**
+ * Helper class to filter for specific files to test them.
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+/* package protected */class FileExtensionFilter implements FilenameFilter {
+
+  /** Suffix a file name has to end with to be accepted. */
+  private final String ext;
+
+  /**
+   * Creates a filter accepting file names that end with the given suffix.
+   *
+   * @param ext
+   *          suffix to match, e.g. ".ppt"
+   */
+  public FileExtensionFilter(String ext) {
+    this.ext = ext;
+  }
+
+  /**
+   * Accepts a file when its name ends with the configured suffix; the
+   * directory is not looked at.
+   *
+   * @see java.io.FilenameFilter#accept(java.io.File, java.lang.String)
+   */
+  public boolean accept(File dir, String name) {
+    return name.endsWith(ext);
+  }
+
+  /**
+   * Returns the configured suffix.
+   *
+   * @see java.lang.Object#toString()
+   */
+  public String toString() {
+    return ext;
+  }
+}
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/FileExtensionFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Fri Sep 2 08:55:47 2005
@@ -0,0 +1,259 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.mspowerpoint;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.logging.Logger;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.LogFormatter;
+
+/**
+ * <p>
+ * Unit tests for MSPowerPointParser.
+ * </p>
+ * <p>
+ * Make sure sample files are copied to "test.data" as specified in
+ * ./src/plugin/parse-mspowerpoint/build.xml during plugin compilation. Check
+ * ./src/plugin/parse-mspowerpoint/sample/README.txt for what they are.
+ * </p>
+ *
+ * @author Stephan Strittmatter - http://www.sybit.de
+ *
+ * @version 1.0
+ */
+public class TestMSPowerPointParser extends TestCase {
+ private static final Logger LOG = LogFormatter
+ .getLogger(TestMSPowerPointParser.class.getName());
+
+ private static final String CHARSET = "UTF-8";
+
+ private final static String LINE_SEPARATOR = System.getProperty("line.separator");
+
+ /** This system property is defined in ./src/plugin/build-plugin.xml */
+ private final static String SAMPLE_DIR = System.getProperty("test.data",
+ "build/parse-mspowerpoint/test/data");
+
+ private final File sampleDir = new File(SAMPLE_DIR);
+
+ /**
+ * Wether dumping the extracted data to file for visual checks.
+ */
+ private final static boolean DUMP_TO_FILE = true;
+
+ private final File testFile;
+
+ private String urlString;
+
+ private Protocol protocol;
+
+ private Content content;
+
+ /**
+ *
+ * @param name
+ */
+ public TestMSPowerPointParser(String name) {
+ super(name);
+ this.testFile = new File(this.sampleDir, "test.ppt");
+ }
+
+ /**
+ * @param file
+ */
+ public TestMSPowerPointParser(File file) {
+ super();
+ this.testFile = file;
+ }
+
+ /**
+ * @see TestCase#setUp()
+ */
+ protected void setUp() throws Exception {
+ super.setUp();
+
+ this.urlString = createUrl(this.testFile.getName());
+
+ System.out.println("Testing file: " + this.urlString + "...");
+ this.protocol = ProtocolFactory.getProtocol(this.urlString);
+ this.content = this.protocol.getProtocolOutput(this.urlString).getContent();
+ }
+
+ /**
+ * @see TestCase#tearDown()
+ */
+ protected void tearDown() throws Exception {
+ super.tearDown();
+ }
+
+ /**
+ * Testing all available ppt-docs stored in dir <code>SAMPLE_DIR</code> if
+ * parsable without exceptions.
+ *
+ * @see #SAMPLE_DIR
+ * @throws Exception
+ */
+ public void testContent() throws Exception {
+
+ Parser parser = ParserFactory.getParser(this.content.getContentType(),
+ this.urlString);
+ Parse parse = parser.getParse(this.content);
+
+ ParseData data = parse.getData();
+ String text = parse.getText();
+
+ assertTrue("No content extracted length ==0", text.length() > 0);
+
+ this.dumpToFile(this.testFile.getName(), data, text);
+
+ final FileExtensionFilter contentFilter = new FileExtensionFilter(
+ this.testFile.getName() + ".content");
+ final File[] contentFiles = this.sampleDir.listFiles(contentFilter);
+
+ if (contentFiles.length > 0) {
+ String testContent = this.fileToString(contentFiles[0]);
+
+ for (int i = 0; i < text.length(); i++) {
+ char parsedChar = text.charAt(i);
+ char testChar = testContent.charAt(i);
+ assertEquals("Wrong char at position [" + i + "]", "" + testChar, ""
+ + parsedChar);
+ }
+ } else {
+ LOG.info("Comparison file for Content not available: "
+ + this.testFile.getName() + ".content");
+ }
+ }
+
+ /**
+ * Testing all available ppt-docs stored in dir <code>SAMPLE_DIR</code> if
+ * parsable without exceptions.
+ *
+ * @see #SAMPLE_DIR
+ * @throws Exception
+ */
+ public void testMeta() throws Exception {
+
+ Parser parser = ParserFactory.getParser(this.content.getContentType(),
+ this.urlString);
+ Parse parse = parser.getParse(this.content);
+
+ ParseData data = parse.getData();
+
+ final FileExtensionFilter titleFilter = new FileExtensionFilter(
+ this.testFile.getName() + ".meta");
+ final File[] titleFiles = this.sampleDir.listFiles(titleFilter);
+
+ if (titleFiles.length > 0) {
+ assertEquals("Document Title", this.fileToString(titleFiles[0]),
+ "Title: " + data.getTitle() + LINE_SEPARATOR +
+ "Outlinks: " + data.getOutlinks().length + LINE_SEPARATOR);
+ } else {
+ assertTrue("Document Title length ==0", data.getTitle().length() > 0);
+ LOG.info("Comparison file for Title not available: "
+ + this.testFile.getName() + ".meta");
+ }
+ }
+
+ /**
+ * create complete url
+ *
+ * @param fileName
+ * name of the file
+ * @return complete url.
+ */
+ private String createUrl(final String fileName) {
+ return "file:" + SAMPLE_DIR + "/" + fileName;
+ }
+
+ /**
+ * Dump the parsed data to a UTF-8 formatted file for visual checks.
+ *
+ * @param data
+ * @param text
+ * @param fileName
+ * @throws IOException
+ */
+ private void dumpToFile(final String fileName, final ParseData data,
+ final String text) throws IOException {
+ if (TestMSPowerPointParser.DUMP_TO_FILE) {
+
+ final File file = new File(fileName + ".txt");
+
+ final FileOutputStream fos = new FileOutputStream(file);
+ final OutputStreamWriter osw = new OutputStreamWriter(fos, CHARSET);
+
+ osw.write(data.toString());
+ osw.write(text);
+
+ osw.close();
+ fos.close();
+ }
+ }
+
+ /**
+ * Load the testfiles for comparison.
+ *
+ * @param file
+ * file to load
+ * @return UNF-8 encoded String content of file.
+ * @throws IOException
+ */
+ private String fileToString(final File file) throws IOException {
+ FileInputStream fis = null;
+ //InputStreamReader isr = null;
+ BufferedReader br = null;
+ final StringBuffer buf = new StringBuffer();
+
+ try {
+ fis = new FileInputStream(file);
+ br = new BufferedReader(new InputStreamReader(fis, CHARSET));
+
+ String line = br.readLine();
+ while (line != null) {
+ buf.append(line).append(LINE_SEPARATOR);
+ line = br.readLine();
+ }
+ } finally {
+ if (br != null) {
+ br.close();
+ }
+ if (fis != null) {
+ fis.close();
+ }
+ }
+
+ String val = buf.toString();
+
+ return val;
+ }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html?rev=267226&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html (added)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html Fri Sep 2 08:55:47 2005
@@ -0,0 +1,44 @@
+<!--
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+-->
+<html>
+ <head>
+ <title>Testing Package of Microsoft PowerPoint document parsing plugin.</title>
+ </head>
+ <body>
+ <h1>JUnit Testpackage for Microsoft © PowerPoint document parsing
+ plugin.</h1>
+ <p>The example ppt-files are located in the subdirectory
+ <code>src/plugin/parse-mspowerpoint/sample</code>. They are
+ copied by the ant-task to the test data directory (by default
+ <code>build/parse-mspowerpoint/test/data</code>) for testing.
+ In addition to the ppt-files, files with the same name and the
+ postfix ".content" or ".meta" can be stored there. If they exist,
+ the test classes check the extracted content against these
+ files.</p>
+ <h3>Example</h3>
+ <p>
+ <ul>
+ <li>PowerPoint file to test: <code>test.ppt</code></li>
+ <li>Reference file with content:
+ <code>test.ppt.content</code></li>
+ <li> Reference file with title and properties:
+ <code>test.ppt.meta</code></li>
+ </ul>
+ </p>
+ </body>
+</html>
\ No newline at end of file
Propchange: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/package.html
------------------------------------------------------------------------------
svn:eol-style = native