You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/02/13 22:26:17 UTC
svn commit: r377493 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/ plugin/ plugin/lib-parsems/ plugin/lib-parsems/src/ plugin/lib-parsems/src/java/ plugin/lib-parsems/src/java/org/ plugin/lib-parsems/src/java/org/apache/ plugin/lib-parsems/s...

Author: jerome
Date: Mon Feb 13 13:26:15 2006
New Revision: 377493

URL: http://svn.apache.org/viewcvs?rev=377493&view=rev
Log:
Add a mini framework for microsoft documents parsing

Added:
    lucene/nutch/trunk/src/plugin/lib-parsems/
    lucene/nutch/trunk/src/plugin/lib-parsems/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/lib-parsems/src/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java   (with props)
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java   (with props)
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=377493&r1=377492&r2=377493&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Mon Feb 13 13:26:15 2006
@@ -149,6 +149,14 @@
 
   }
 
+  /**
+   * Tells whether a string is <code>null</code> or has no characters.
+   */
+  public static boolean isEmpty(String str) {
+    return (str == null) || (str.length() == 0);
+  }
+  
+  
   private static HashMap encodingAliases = new HashMap();
 
   /** 

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=377493&r1=377492&r2=377493&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon Feb 13 13:26:15 2006
@@ -14,6 +14,7 @@
      <ant dir="lib-http" target="deploy"/>
      <ant dir="lib-jakarta-poi" target="deploy"/>
      <ant dir="lib-lucene-analyzers" target="deploy"/>
+     <ant dir="lib-parsems" target="deploy"/>
      <ant dir="nutch-extensionpoints" target="deploy"/>
      <ant dir="ontology" target="deploy"/>
      <ant dir="protocol-file" target="deploy"/>
@@ -78,6 +79,7 @@
     <ant dir="lib-http" target="clean"/>
     <ant dir="lib-jakarta-poi" target="clean"/>
     <ant dir="lib-lucene-analyzers" target="clean"/>
+    <ant dir="lib-parsems" target="clean"/>
     <ant dir="nutch-extensionpoints" target="clean"/>
     <ant dir="ontology" target="clean"/>
     <ant dir="protocol-file" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/build.xml?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/build.xml Mon Feb 13 13:26:15 2006
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+
+<project name="lib-parsems" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+  <path id="plugin.deps">
+    <fileset dir="../lib-jakarta-poi/lib">
+      <include name="*.jar" />
+    </fileset>
+  </path>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml Mon Feb 13 13:26:15 2006
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ! A common framework for Microsoft document parser implementations
+ !-->
+<plugin
+   id="lib-parsems"
+   name="Parse MS Documents Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-parsems.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+   <requires>
+      <import plugin="lib-jakarta-poi"/>
+   </requires>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Mon Feb 13 13:26:15 2006
@@ -0,0 +1,218 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.ms;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+
+/**
+ * A generic Microsoft document parser.
+ * <p>
+ * Concrete parsers supply an {@link MSExtractor} for their document type and
+ * delegate to {@link #getParse(MSExtractor, Content)}, which turns the
+ * extracted text and properties into a {@link Parse}.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class MSBaseParser implements Parser {
+
+  private Configuration conf;
+
+  protected static final Logger LOG =
+          LogFormatter.getLogger(MSBaseParser.class.getName());
+
+
+  /**
+   * Parses a Content with a specific {@link MSExtractor Microsoft document
+   * extractor}.
+   *
+   * @param extractor the extractor used to pull text and properties out of
+   *        the raw document bytes
+   * @param content the content to parse
+   * @return the resulting parse; an empty parse carrying a failed status if
+   *         the content is truncated or the extraction throws
+   */
+  protected Parse getParse(MSExtractor extractor, Content content) {
+
+    String text = null;
+    Properties properties = null;
+    Outlink[] outlinks = null;
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
+      if ((contentLength != null) &&
+          (raw.length != Integer.parseInt(contentLength))) {
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_TRUNCATED,
+                               "Content truncated at " + raw.length +" bytes. " +
+                               "Parser can't handle incomplete file.")
+                               .getEmptyParse(getConf());
+      }
+      extractor.extract(new ByteArrayInputStream(raw));
+      text = extractor.getText();
+      properties = extractor.getProperties();
+      outlinks = OutlinkExtractor.getOutlinks(text, content.getUrl(), getConf());
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+                             "Can't be handled as microsoft document. " + e)
+                             .getEmptyParse(getConf());
+    }
+
+    // collect meta data; guard against an extractor that hands back no
+    // properties at all instead of failing with a NullPointerException
+    Metadata metadata = new Metadata();
+    String title = null;
+    if (properties != null) {
+      title = properties.getProperty(DublinCore.TITLE);
+      properties.remove(DublinCore.TITLE);
+      metadata.setAll(properties);
+    }
+
+    if (text == null) { text = ""; }
+    if (title == null) { title = ""; }
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+                                        outlinks, content.getMetadata(),
+                                        metadata);
+    parseData.setConf(getConf());
+    return new ParseImpl(text, parseData);
+  }
+
+
+  /**
+   * Main for testing. Pass a ms document as argument
+   *
+   * @param mime the content type to declare for the document
+   * @param parser the parser to run on the document
+   * @param args command line arguments; <code>args[0]</code> is the path of
+   *        the document to parse
+   */
+  public static void main(String mime, MSBaseParser parser, String args[]) {
+    if (args.length < 1) {
+      System.err.println("Usage:");
+      System.err.println("\t" + parser.getClass().getName() + " <file>");
+      System.exit(1);
+    }
+
+    String file = args[0];
+    byte[] raw = getRawBytes(new File(file));
+    if (raw == null) {
+      // missing or unreadable file: fail cleanly instead of with an NPE below
+      System.err.println("Can't read file: " + file);
+      System.exit(1);
+    }
+
+    Metadata meta = new Metadata();
+    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
+    Content content = new Content(file, file, raw, mime, meta,
+                                  NutchConfiguration.create());
+
+    System.out.println(parser.getParse(content).getText());
+  }
+
+  /**
+   * Reads the whole content of a file into memory.
+   *
+   * @param f the file to read
+   * @return the bytes of the file, or <code>null</code> if the file does not
+   *         exist or cannot be read
+   */
+  private final static byte[] getRawBytes(File f) {
+    FileInputStream fin = null;
+    try {
+      if (!f.exists()) {
+        return null;
+      }
+      fin = new FileInputStream(f);
+      byte[] buffer = new byte[(int) f.length()];
+      // a single read() may deliver fewer bytes than asked for, so loop
+      // until the buffer is full or the end of the file is reached
+      int filled = 0;
+      while (filled < buffer.length) {
+        int count = fin.read(buffer, filled, buffer.length - filled);
+        if (count < 0) {
+          break;
+        }
+        filled += count;
+      }
+      if (filled < buffer.length) {
+        // the file was shorter than announced: keep only what was read
+        byte[] trimmed = new byte[filled];
+        System.arraycopy(buffer, 0, trimmed, 0, filled);
+        return trimmed;
+      }
+      return buffer;
+    } catch (Exception err) {
+      err.printStackTrace();
+      return null;
+    } finally {
+      // always release the file handle, even when reading failed
+      if (fin != null) {
+        try {
+          fin.close();
+        } catch (IOException ignored) {
+          // nothing sensible can be done if close() fails
+        }
+      }
+    }
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  /** Sets the configuration used by this parser. */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /** Returns the configuration used by this parser. */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}

Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java Mon Feb 13 13:26:15 2006
@@ -0,0 +1,252 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.ms;
+
+// JDK imports
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Office;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.util.StringUtil;
+
+// Jakarta POI imports
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+
+
+/**
+ * Defines a Microsoft document content extractor.
+ * <p>
+ * Subclasses provide the text extraction through
+ * {@link #extractText(InputStream)}; the document properties are read from
+ * the summary information stream.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class MSExtractor {
+
+  protected final static Logger LOG =
+          LogFormatter.getLogger(MSExtractor.class.getName());
+
+  private String text = null;
+  private POIFSReader reader = null;
+  private PropertiesBroker properties = null;
+
+
+  /** Constructs a new Microsoft document extractor. */
+  protected MSExtractor() { }
+
+
+  /**
+   * Extracts properties and text from an MS Document input stream.
+   * <p>
+   * The stream is consumed twice and {@link InputStream#reset() reset} before
+   * each pass, so it has to support <code>reset()</code>.
+   *
+   * @param input the document stream
+   * @throws Exception if the stream cannot be reset or read, or if the text
+   *         extraction fails
+   */
+  protected void extract(InputStream input) throws Exception {
+    // First, extract properties
+    this.reader = new POIFSReader();
+    this.properties = new PropertiesBroker();
+    this.reader.registerListener(
+            new PropertiesReaderListener(this.properties),
+            SummaryInformation.DEFAULT_STREAM_NAME);
+    input.reset();
+    if (input.available() > 0) {
+      this.reader.read(input);
+    }
+    // Then, extract text
+    input.reset();
+    this.text = extractText(input);
+  }
+
+  /**
+   * Extracts the text content from a Microsoft document input stream.
+   *
+   * @param input the document stream
+   * @return the text content of the document
+   * @throws Exception if the text cannot be extracted
+   */
+  protected abstract String extractText(InputStream input) throws Exception;
+
+
+  /**
+   * Get the content text of the Microsoft document.
+   * @return the content text of the document
+   */
+  protected String getText() {
+    return this.text;
+  }
+
+
+  /**
+   * Get the <code>Properties</code> of the Microsoft document.
+   * @return the properties of the document; never <code>null</code>,
+   *         but empty when no properties could be read
+   */
+  protected Properties getProperties() {
+    if (this.properties == null) {
+      // extract() has not been run yet
+      return new Properties();
+    }
+    Properties extracted = this.properties.getProperties();
+    // the broker yields null when no properties were published in time
+    return (extracted != null) ? extracted : new Properties();
+  }
+
+
+  /**
+   * Hands the properties over from the POI listener to the extractor,
+   * waiting a bounded time for them to become available.
+   */
+  private final static class PropertiesBroker {
+
+    /** Maximum time to wait for the properties, in milliseconds. */
+    private final static int TIMEOUT = 2 * 1000;
+    private Properties properties = null;
+
+    /**
+     * Returns the properties, waiting up to <code>TIMEOUT</code> milliseconds
+     * for them to be set.
+     *
+     * @return the properties, or <code>null</code> if none were set in time
+     *         or the waiting thread was interrupted
+     */
+    public synchronized Properties getProperties() {
+
+      final long start = System.currentTimeMillis();
+      long now = start;
+
+      while (this.properties == null && now - start < TIMEOUT) {
+        try {
+          wait(TIMEOUT / 10);
+        } catch (InterruptedException e) {
+          // keep the interrupt visible to the caller and stop waiting
+          Thread.currentThread().interrupt();
+          break;
+        }
+        now = System.currentTimeMillis();
+      }
+      return this.properties;
+    }
+
+    /** Publishes the properties and wakes up any waiting reader. */
+    public synchronized void setProperties(Properties properties) {
+      this.properties = properties;
+      notifyAll();
+    }
+  }
+
+
+  /**
+   * POI listener that turns the summary information stream of a document
+   * into <code>Properties</code> and publishes them through a broker.
+   */
+  private final static class PropertiesReaderListener
+          implements POIFSReaderListener {
+
+    private final PropertiesBroker propertiesBroker;
+    private final Properties metadata = new Properties();
+
+    PropertiesReaderListener(PropertiesBroker propertiesBroker) {
+      this.propertiesBroker = propertiesBroker;
+    }
+
+    /**
+     * Collects the summary information of the document and publishes it.
+     * Events whose name does not start with the summary information stream
+     * name are ignored.
+     */
+    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+      if (!event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
+        return;
+      }
+
+      try {
+        SummaryInformation si = (SummaryInformation)
+                                  PropertySetFactory.create(event.getStream());
+        setProperty(DublinCore.TITLE, si.getTitle());
+        setProperty(Office.APPLICATION_NAME, si.getApplicationName());
+        setProperty(Office.AUTHOR, si.getAuthor());
+        setProperty(Office.CHARACTER_COUNT, si.getCharCount());
+        setProperty(Office.COMMENTS, si.getComments());
+        setProperty(DublinCore.DATE, si.getCreateDateTime());
+//        setProperty(Office.EDIT_TIME, si.getEditTime());
+        setProperty(HttpHeaders.LAST_MODIFIED, si.getLastSaveDateTime());
+        setProperty(Office.KEYWORDS, si.getKeywords());
+        setProperty(Office.LAST_AUTHOR, si.getLastAuthor());
+        setProperty(Office.LAST_PRINTED, si.getLastPrinted());
+        setProperty(Office.LAST_SAVED, si.getLastSaveDateTime());
+        setProperty(Office.PAGE_COUNT, si.getPageCount());
+        setProperty(Office.REVISION_NUMBER, si.getRevNumber());
+        setProperty(DublinCore.RIGHTS, si.getSecurity());
+        setProperty(DublinCore.SUBJECT, si.getSubject());
+        setProperty(Office.TEMPLATE, si.getTemplate());
+        setProperty(Office.WORD_COUNT, si.getWordCount());
+      } catch (Exception ex) {
+        // best effort: keep what was collected so far, but leave a trace
+        LOG.warning("Can't read summary information: " + ex);
+      }
+      propertiesBroker.setProperties(metadata);
+    }
+
+    /** Stores a property, skipping empty names and empty values. */
+    private final void setProperty(String name, String value) {
+      if (!StringUtil.isEmpty(name) && !StringUtil.isEmpty(value)) {
+        metadata.setProperty(name, value);
+      }
+    }
+
+    /** Stores an int property, skipping the value zero. */
+    private final void setProperty(String name, int value) {
+      if (value != 0) {
+        setProperty(name, String.valueOf(value));
+      }
+    }
+
+    /** Stores a long property, skipping the value zero. */
+    private final void setProperty(String name, long value) {
+      if (value != 0) {
+        setProperty(name, String.valueOf(value));
+      }
+    }
+
+    /** Stores a date property in HTTP date format, skipping null dates. */
+    private final void setProperty(String name, Date date) {
+      if (date != null) {
+        setProperty(name, HttpDateFormat.toString(date));
+      }
+    }
+
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native