You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/09/06 11:17:12 UTC

svn commit: r278951 - /lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/

Author: jerome
Date: Tue Sep  6 02:17:05 2005
New Revision: 278951

URL: http://svn.apache.org/viewcvs?rev=278951&view=rev
Log:
The parse-mspowerpoint plugin now uses the generic outlink extractor instead of its own copy (contributed by Stephan Strittmatter).

Removed:
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java
Modified:
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java Tue Sep  6 02:17:05 2005
@@ -25,7 +25,6 @@
 import org.apache.poi.hdf.extractor.Utils;
 import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
 import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.StringUtil;
@@ -272,10 +271,9 @@
                   // TextCharAtom record
 
                   final String strTempContent = new String(pptdata,
-                      (int) startPos + 6, (int) (nsize) + 2);
+                      startPos + 6, (int) (nsize) + 2);
                   final byte bytes[] = strTempContent.getBytes();
                   if (true) {
-                    // FIXME my version
                     outStream = new FilteredStringWriter();
                     for (int ii = 0; ii < bytes.length - 1; ii += 2) {
                       // For loop to changed to a function

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Tue Sep  6 02:17:05 2005
@@ -23,6 +23,7 @@
 import java.util.logging.Logger;
 
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
@@ -56,13 +57,6 @@
   }
 
   /**
-   * 
-   */
-  public MSPowerPointParser(String fileName) {
-
-  }
-
-  /**
    * Main for testing. Pass a ppt-file as argument
    * 
    * @param args
@@ -128,7 +122,7 @@
 
       plainText = extractor.getText();
       properties = extractor.getProperties();
-      outlinks = this.getOutlinks(plainText, content.getUrl());
+      outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl());
 
     } catch (Exception e) {
       LOG.throwing(this.getClass().getName(), "getParse", e);
@@ -158,17 +152,6 @@
 
     LOG.finest("PowerPoint file parsed sucessful.");
     return new ParseImpl(plainText, parseData);
-  }
-
-  /**
-   * Collect outlinks of document.
-   * 
-   * @param plainText
-   * 
-   * @return Array of links within the PowerPoint file
-   */
-  protected Outlink[] getOutlinks(String plainText, String anchor) {
-    return OutlinkExtractor.getOutlinks(plainText, anchor);
   }
   
   private final static byte[] getRawBytes(File f) {

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java Tue Sep  6 02:17:05 2005
@@ -27,6 +27,7 @@
   /** ID of master slide */
   public static final long PPT_MASTERSLIDE = 1024L;
 
+  /** ATOM ID of slide */
   public static long PPT_ATOM_SLIDE = 1007l;
 
   /** ATOM ID of notes */

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java Tue Sep  6 02:17:05 2005
@@ -15,7 +15,6 @@
  */
 package org.apache.nutch.parse.mspowerpoint;
 
-import java.io.IOException;
 
 /**
  * Exception class used for catching the runtime exceptions for the Powerpoint

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java Tue Sep  6 02:17:05 2005
@@ -23,7 +23,6 @@
 
 import org.apache.nutch.parse.mspowerpoint.PPTExtractor.PropertiesBroker;
 import org.apache.nutch.util.LogFormatter;
-import org.apache.poi.hpsf.PropertySet;
 import org.apache.poi.hpsf.PropertySetFactory;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;