You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/02/13 22:26:17 UTC
svn commit: r377493 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/util/ plugin/ plugin/lib-parsems/
plugin/lib-parsems/src/ plugin/lib-parsems/src/java/
plugin/lib-parsems/src/java/org/ plugin/lib-parsems/src/java/org/apache/
plugin/lib-parsems/s...
Author: jerome
Date: Mon Feb 13 13:26:15 2006
New Revision: 377493
URL: http://svn.apache.org/viewcvs?rev=377493&view=rev
Log:
Add a mini framework for parsing Microsoft documents
Added:
lucene/nutch/trunk/src/plugin/lib-parsems/
lucene/nutch/trunk/src/plugin/lib-parsems/build.xml (with props)
lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml (with props)
lucene/nutch/trunk/src/plugin/lib-parsems/src/
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (with props)
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java (with props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
lucene/nutch/trunk/src/plugin/build.xml
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=377493&r1=377492&r2=377493&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Mon Feb 13 13:26:15 2006
@@ -149,6 +149,14 @@
}
+  /**
+   * Tells whether a string is empty, i.e. is <code>null</code> or has no characters.
+   */
+  public static boolean isEmpty(String str) {
+    return (str == null) || (str.length() == 0);
+  }
+
+
private static HashMap encodingAliases = new HashMap();
/**
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=377493&r1=377492&r2=377493&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon Feb 13 13:26:15 2006
@@ -14,6 +14,7 @@
<ant dir="lib-http" target="deploy"/>
<ant dir="lib-jakarta-poi" target="deploy"/>
<ant dir="lib-lucene-analyzers" target="deploy"/>
+ <ant dir="lib-parsems" target="deploy"/>
<ant dir="nutch-extensionpoints" target="deploy"/>
<ant dir="ontology" target="deploy"/>
<ant dir="protocol-file" target="deploy"/>
@@ -78,6 +79,7 @@
<ant dir="lib-http" target="clean"/>
<ant dir="lib-jakarta-poi" target="clean"/>
<ant dir="lib-lucene-analyzers" target="clean"/>
+ <ant dir="lib-parsems" target="clean"/>
<ant dir="nutch-extensionpoints" target="clean"/>
<ant dir="ontology" target="clean"/>
<ant dir="protocol-file" target="clean"/>
Added: lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/build.xml?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/build.xml Mon Feb 13 13:26:15 2006
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<!-- Ant build file of the lib-parsems plugin; "jar" is the default target. -->
+<project name="lib-parsems" default="jar">
+
+ <import file="../build-plugin.xml"/>
+ <!-- All jars of ../lib-jakarta-poi/lib; presumably consumed by build-plugin.xml as extra classpath (to be confirmed there). -->
+ <path id="plugin.deps">
+ <fileset dir="../lib-jakarta-poi/lib">
+ <include name="*.jar" />
+ </fileset>
+ </path>
+
+</project>
Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml Mon Feb 13 13:26:15 2006
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ! Descriptor of the lib-parsems plugin: a common framework for Microsoft document parser implementations.
+ !-->
+<plugin
+ id="lib-parsems"
+ name="Parse MS Documents Framework"
+ version="1.0"
+ provider-name="org.apache.nutch">
+ <!-- Runtime library of the plugin; export name="*" presumably exposes everything in the jar to other plugins (confirm against the plugin loader). -->
+ <runtime>
+ <library name="lib-parsems.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+ <!-- Declares that this plugin requires the lib-jakarta-poi plugin. -->
+ <requires>
+ <import plugin="lib-jakarta-poi"/>
+ </requires>
+
+</plugin>
Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Mon Feb 13 13:26:15 2006
@@ -0,0 +1,197 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.ms;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+
+/**
+ * A generic Microsoft document parser.
+ *
+ * @author Jérôme Charron
+ */
+public abstract class MSBaseParser implements Parser {
+
+  private Configuration conf;
+
+  protected static final Logger LOG =
+    LogFormatter.getLogger(MSBaseParser.class.getName());
+
+
+  /**
+   * Parses a Content with a specific {@link MSExtractor Microsoft document
+   * extractor}. Content whose size differs from its declared content length
+   * is rejected as truncated, and any error raised while extracting yields
+   * a failed, empty parse rather than an exception.
+   *
+   * @param extractor the extractor that reads the text and the properties
+   * @param content the content to parse
+   * @return the parse of the content, or an empty parse with a failed status
+   */
+  protected Parse getParse(MSExtractor extractor, Content content) {
+
+    String text = null;
+    String title = null;
+    Outlink[] outlinks = null;
+    Properties properties = null;
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
+      if ((contentLength != null) &&
+          (raw.length != Integer.parseInt(contentLength))) {
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_TRUNCATED,
+                               "Content truncated at " + raw.length +" bytes. " +
+                               "Parser can't handle incomplete file.")
+                               .getEmptyParse(this.conf);
+      }
+      extractor.extract(new ByteArrayInputStream(raw));
+      text = extractor.getText();
+      properties = extractor.getProperties();
+      outlinks = OutlinkExtractor.getOutlinks(text, content.getUrl(), getConf());
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+                             "Can't be handled as Microsoft document. " + e)
+                             .getEmptyParse(this.conf);
+    }
+
+    // collect meta data; getProperties() may have returned null (no document
+    // properties available), which must not end in a NullPointerException
+    Metadata metadata = new Metadata();
+    if (properties != null) {
+      title = properties.getProperty(DublinCore.TITLE);
+      properties.remove(DublinCore.TITLE);
+      metadata.setAll(properties);
+    }
+
+    if (text == null) { text = ""; }
+    if (title == null) { title = ""; }
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+                                        outlinks, content.getMetadata(),
+                                        metadata);
+    parseData.setConf(this.conf);
+    return new ParseImpl(text, parseData);
+  }
+
+
+  /**
+   * Main for testing. Pass a ms document as argument.
+   *
+   * @param mime the content type to declare for the document
+   * @param parser the parser to run on the document
+   * @param args the command line; args[0] is the path of the document
+   */
+  public static void main(String mime, MSBaseParser parser, String args[]) {
+    if (args.length < 1) {
+      System.err.println("Usage:");
+      System.err.println("\t" + parser.getClass().getName() + " <file>");
+      System.exit(1);
+    }
+
+    String file = args[0];
+    byte[] raw = getRawBytes(new File(file));
+    if (raw == null) {
+      // getRawBytes() signals a missing or unreadable file with null
+      System.err.println("Can't read " + file);
+      System.exit(1);
+    }
+
+    Metadata meta = new Metadata();
+    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
+    Content content = new Content(file, file, raw, mime, meta,
+                                  NutchConfiguration.create());
+
+    System.out.println(parser.getParse(content).getText());
+  }
+
+  /**
+   * Reads the whole content of a file.
+   *
+   * @param f the file to read
+   * @return the bytes of the file, or null if it is missing or unreadable
+   */
+  private final static byte[] getRawBytes(File f) {
+    FileInputStream fin = null;
+    try {
+      if (!f.exists())
+        return null;
+      fin = new FileInputStream(f);
+      byte[] buffer = new byte[(int) f.length()];
+      // a single read() may return fewer bytes than requested, so loop
+      int offset = 0;
+      while (offset < buffer.length) {
+        int count = fin.read(buffer, offset, buffer.length - offset);
+        if (count < 0) {
+          throw new java.io.EOFException("Unexpected end of " + f);
+        }
+        offset += count;
+      }
+      return buffer;
+    } catch (Exception err) {
+      err.printStackTrace();
+      return null;
+    } finally {
+      // release the file handle on every path, including failures
+      if (fin != null) {
+        try { fin.close(); } catch (Exception ignored) { }
+      }
+    }
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}
Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java Mon Feb 13 13:26:15 2006
@@ -0,0 +1,248 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.ms;
+
+// JDK imports
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Office;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.util.StringUtil;
+
+// Jakarta POI imports
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+
+
+/**
+ * Defines a Microsoft document content extractor.
+ *
+ * @author Jérôme Charron
+ */
+public abstract class MSExtractor {
+
+  protected final static Logger LOG =
+    LogFormatter.getLogger(MSExtractor.class.getName());
+
+  private String text = null;
+  private POIFSReader reader = null;
+  private PropertiesBroker properties = null;
+
+
+  /** Constructs a new Microsoft document extractor. */
+  protected MSExtractor() { }
+
+
+  /**
+   * Extracts properties and text from an MS Document input stream.
+   * The stream is read twice, once for the properties and once for the
+   * text, and is rewound with <code>reset()</code> before each pass.
+   *
+   * @param input the document stream; it has to support <code>reset()</code>
+   * @throws Exception if the stream can't be rewound or read, or if the
+   *         extraction of the text fails
+   */
+  protected void extract(InputStream input) throws Exception {
+    // First, extract properties
+    this.reader = new POIFSReader();
+    this.properties = new PropertiesBroker();
+    this.reader.registerListener(
+        new PropertiesReaderListener(this.properties),
+        SummaryInformation.DEFAULT_STREAM_NAME);
+    input.reset();
+    if (input.available() > 0) {
+      reader.read(input);
+    }
+    // POIFSReader.read() calls its listeners before it returns: if nothing
+    // was published by now, nothing will be. Publish an empty set so that
+    // getProperties() neither waits for its timeout nor returns null.
+    this.properties.setPropertiesIfAbsent(new Properties());
+    // Then, extract text
+    input.reset();
+    this.text = extractText(input);
+  }
+
+  /**
+   * Extracts the text content from a Microsoft document input stream.
+   *
+   * @param input the document stream, rewound by
+   *        {@link #extract(InputStream)} just before this call
+   * @return the text of the document
+   * @throws Exception if the text can't be extracted
+   */
+  protected abstract String extractText(InputStream input) throws Exception;
+
+
+  /**
+   * Get the content text of the Microsoft document.
+   * @return the content text of the document
+   */
+  protected String getText() {
+    return this.text;
+  }
+
+
+  /**
+   * Get the <code>Properties</code> of the Microsoft document.
+   * @return the properties of the document, or <code>null</code> if none
+   *         are available
+   */
+  protected Properties getProperties() {
+    // the broker only exists once extract() has been called
+    if (this.properties == null) {
+      return null;
+    }
+    return this.properties.getProperties();
+  }
+
+
+  /**
+   * Hands the properties over from the listener that collects them to the
+   * code that asks for them, waiting a bounded time for them to show up.
+   */
+  private final static class PropertiesBroker {
+
+    /** Longest time to wait for the properties, in milliseconds. */
+    private final static int TIMEOUT = 2 * 1000;
+    private Properties properties = null;
+
+    /**
+     * Returns the published properties, waiting at most {@link #TIMEOUT}
+     * milliseconds for them.
+     * @return the properties, or <code>null</code> if none were published
+     *         in time or if the waiting thread was interrupted
+     */
+    public synchronized Properties getProperties() {
+      final long deadline = System.currentTimeMillis() + TIMEOUT;
+      long remaining = TIMEOUT;
+      while (this.properties == null && remaining > 0) {
+        try {
+          wait(remaining);
+        } catch (InterruptedException e) {
+          // stop waiting and keep the interruption visible to the caller
+          Thread.currentThread().interrupt();
+          break;
+        }
+        remaining = deadline - System.currentTimeMillis();
+      }
+      return this.properties;
+    }
+
+    /** Publishes the properties and wakes up the threads waiting for them. */
+    public synchronized void setProperties(Properties properties) {
+      this.properties = properties;
+      notifyAll();
+    }
+
+    /** Publishes the properties unless some were published already. */
+    public synchronized void setPropertiesIfAbsent(Properties properties) {
+      if (this.properties == null) {
+        setProperties(properties);
+      }
+    }
+  }
+
+
+  /**
+   * Copies the summary information of a document into properties and
+   * publishes them through a {@link PropertiesBroker}.
+   */
+  private static class PropertiesReaderListener implements POIFSReaderListener {
+
+    private PropertiesBroker propertiesBroker;
+    private Properties metadata = new Properties();
+
+    PropertiesReaderListener(PropertiesBroker propertiesBroker) {
+      this.propertiesBroker = propertiesBroker;
+    }
+
+    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+      if (!event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
+        return;
+      }
+
+      try {
+        SummaryInformation si = (SummaryInformation)
+          PropertySetFactory.create(event.getStream());
+        setProperty(DublinCore.TITLE, si.getTitle());
+        setProperty(Office.APPLICATION_NAME, si.getApplicationName());
+        setProperty(Office.AUTHOR, si.getAuthor());
+        setProperty(Office.CHARACTER_COUNT, si.getCharCount());
+        setProperty(Office.COMMENTS, si.getComments());
+        setProperty(DublinCore.DATE, si.getCreateDateTime());
+//      setProperty(Office.EDIT_TIME, si.getEditTime());
+        setProperty(HttpHeaders.LAST_MODIFIED, si.getLastSaveDateTime());
+        setProperty(Office.KEYWORDS, si.getKeywords());
+        setProperty(Office.LAST_AUTHOR, si.getLastAuthor());
+        setProperty(Office.LAST_PRINTED, si.getLastPrinted());
+        setProperty(Office.LAST_SAVED, si.getLastSaveDateTime());
+        setProperty(Office.PAGE_COUNT, si.getPageCount());
+        setProperty(Office.REVISION_NUMBER, si.getRevNumber());
+        setProperty(DublinCore.RIGHTS, si.getSecurity());
+        setProperty(DublinCore.SUBJECT, si.getSubject());
+        setProperty(Office.TEMPLATE, si.getTemplate());
+        setProperty(Office.WORD_COUNT, si.getWordCount());
+      } catch (Exception ex) {
+        // best effort: keep what was read so far, but leave a trace of it
+        LOG.warning("Can't read the document summary information: " + ex);
+      }
+      propertiesBroker.setProperties(metadata);
+    }
+
+    /** Stores a property, unless its name or its value is null or empty. */
+    private final void setProperty(String name, String value) {
+      if (!StringUtil.isEmpty(name) && !StringUtil.isEmpty(value)) {
+        metadata.setProperty(name, value);
+      }
+    }
+
+    /** Stores an int property, unless its value is 0. */
+    private final void setProperty(String name, int value) {
+      if (value != 0) {
+        setProperty(name, String.valueOf(value));
+      }
+    }
+
+    /** Stores a long property, unless its value is 0. */
+    private final void setProperty(String name, long value) {
+      if (value != 0) {
+        setProperty(name, String.valueOf(value));
+      }
+    }
+
+    /** Stores a date property formatted by HttpDateFormat, unless it is null. */
+    private final void setProperty(String name, Date date) {
+      if (date != null) {
+        setProperty(name, HttpDateFormat.toString(date));
+      }
+    }
+
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native