You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/09/06 11:17:12 UTC
svn commit: r278951 -
/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/
Author: jerome
Date: Tue Sep 6 02:17:05 2005
New Revision: 278951
URL: http://svn.apache.org/viewcvs?rev=278951&view=rev
Log:
The parse-mspowerpoint plugin now uses the generic org.apache.nutch.parse.OutlinkExtractor instead of its own plugin-local OutlinkExtractor copy (Stephan Strittmatter)
Removed:
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/OutlinkExtractor.java
Modified:
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java Tue Sep 6 02:17:05 2005
@@ -25,7 +25,6 @@
import org.apache.poi.hdf.extractor.Utils;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
@@ -272,10 +271,9 @@
// TextCharAtom record
final String strTempContent = new String(pptdata,
- (int) startPos + 6, (int) (nsize) + 2);
+ startPos + 6, (int) (nsize) + 2);
final byte bytes[] = strTempContent.getBytes();
if (true) {
- // FIXME my version
outStream = new FilteredStringWriter();
for (int ii = 0; ii < bytes.length - 1; ii += 2) {
// For loop to changed to a function
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Tue Sep 6 02:17:05 2005
@@ -23,6 +23,7 @@
import java.util.logging.Logger;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
@@ -56,13 +57,6 @@
}
/**
- *
- */
- public MSPowerPointParser(String fileName) {
-
- }
-
- /**
* Main for testing. Pass a ppt-file as argument
*
* @param args
@@ -128,7 +122,7 @@
plainText = extractor.getText();
properties = extractor.getProperties();
- outlinks = this.getOutlinks(plainText, content.getUrl());
+ outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl());
} catch (Exception e) {
LOG.throwing(this.getClass().getName(), "getParse", e);
@@ -158,17 +152,6 @@
LOG.finest("PowerPoint file parsed sucessful.");
return new ParseImpl(plainText, parseData);
- }
-
- /**
- * Collect outlinks of document.
- *
- * @param plainText
- *
- * @return Array of links within the PowerPoint file
- */
- protected Outlink[] getOutlinks(String plainText, String anchor) {
- return OutlinkExtractor.getOutlinks(plainText, anchor);
}
private final static byte[] getRawBytes(File f) {
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTConstants.java Tue Sep 6 02:17:05 2005
@@ -27,6 +27,7 @@
/** ID of master slide */
public static final long PPT_MASTERSLIDE = 1024L;
+ /** ATOM ID of slide */
public static long PPT_ATOM_SLIDE = 1007l;
/** ATOM ID of notes */
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java Tue Sep 6 02:17:05 2005
@@ -15,7 +15,6 @@
*/
package org.apache.nutch.parse.mspowerpoint;
-import java.io.IOException;
/**
* Exception class used for catching the runtime exceptions for the Powerpoint
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java?rev=278951&r1=278950&r2=278951&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java Tue Sep 6 02:17:05 2005
@@ -23,7 +23,6 @@
import org.apache.nutch.parse.mspowerpoint.PPTExtractor.PropertiesBroker;
import org.apache.nutch.util.LogFormatter;
-import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;