You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:58 UTC
[42/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
new file mode 100644
index 0000000..daf96e0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
@@ -0,0 +1,278 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+// SLF4J logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * A reader to load the information stored in the
+ * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
+ * <p>
+ * The file is read either from the URL set via
+ * {@link #setFParsePluginsFile(String)} or, when no URL has been set, from the
+ * configuration resource named by the <code>parse.plugin.file</code> property.
+ * </p>
+ * 
+ * @author mattmann
+ * @version 1.0
+ */
+class ParsePluginsReader {
+
+  /* our log stream */
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ParsePluginsReader.class);
+
+  /** The property name of the parse-plugins location */
+  private static final String PP_FILE_PROP = "parse.plugin.file";
+
+  /** Order value of a plugin whose "order" attribute is absent or not a number */
+  private static final int NO_ORDER = -1;
+
+  /** the parse-plugins file (a URL), or null to use the configured resource */
+  private String fParsePluginsFile = null;
+
+  /**
+   * Constructs a new ParsePluginsReader
+   */
+  public ParsePluginsReader() {
+  }
+
+  /**
+   * Reads the <code>parse-plugins.xml</code> file and returns the
+   * {@link ParsePluginList} defined by it.
+   * 
+   * @param conf
+   *          configuration used to locate and open the file when no explicit
+   *          URL has been set on this reader
+   * @return the {@link ParsePluginList} specified by the file; an empty list
+   *         if the explicitly set URL cannot be opened; <code>null</code> if
+   *         the file cannot be parsed as XML
+   */
+  public ParsePluginList parse(Configuration conf) {
+
+    ParsePluginList pList = new ParsePluginList();
+
+    // remember what is being read so that log messages name the right source
+    String sourceName;
+    InputStream ppInputStream = null;
+    if (fParsePluginsFile != null) {
+      sourceName = fParsePluginsFile;
+      try {
+        ppInputStream = new URL(fParsePluginsFile).openStream();
+      } catch (Exception e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Unable to load parse plugins file from URL " + "["
+              + fParsePluginsFile + "]. Reason is [" + e + "]");
+        }
+        return pList;
+      }
+    } else {
+      sourceName = conf.get(PP_FILE_PROP);
+      ppInputStream = conf.getConfResourceAsInputStream(sourceName);
+    }
+
+    Document document = null;
+    try {
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder parser = factory.newDocumentBuilder();
+      document = parser.parse(new InputSource(ppInputStream));
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Unable to parse [" + sourceName + "]. Reason is [" + e
+            + "]");
+      }
+      // null (not an empty list) is kept for backward compatibility: existing
+      // callers may test for it to detect an unreadable file
+      return null;
+    } finally {
+      // the stream was never closed before; release it on every path
+      closeQuietly(ppInputStream);
+    }
+
+    Element parsePlugins = document.getDocumentElement();
+
+    // build up the alias hash map
+    Map<String, String> aliases = getAliases(parsePlugins);
+    // And store it on the parse plugin list
+    pList.setAliases(aliases);
+
+    // get all the mime type nodes
+    NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
+
+    // iterate through the mime types
+    for (int i = 0; i < mimeTypes.getLength(); i++) {
+      Element mimeType = (Element) mimeTypes.item(i);
+      String mimeTypeStr = mimeType.getAttribute("name");
+
+      // for each mimeType, get the plugin list
+      NodeList pluginList = mimeType.getElementsByTagName("plugin");
+
+      if (pluginList != null && pluginList.getLength() > 0) {
+        List<String> plugList = new ArrayList<String>(pluginList.getLength());
+
+        // plugins are added in the order read, unless they carry an
+        // order="" attribute, in which case they are inserted at that
+        // (1-based) position of the list built so far
+        for (int j = 0; j < pluginList.getLength(); j++) {
+          Element plugin = (Element) pluginList.item(j);
+          String pluginId = plugin.getAttribute("id");
+          String extId = aliases.get(pluginId);
+          if (extId == null) {
+            // Assume an extension id is directly specified
+            extId = pluginId;
+          }
+
+          int order = parseOrder(plugin.getAttribute("order"));
+          if (order == NO_ORDER) {
+            plugList.add(extId);
+          } else if (order >= 1 && order <= plugList.size() + 1) {
+            plugList.add(order - 1, extId);
+          } else {
+            // List.add(int, E) would throw for this index; keep the plugin
+            // instead of aborting the whole parse
+            if (LOG.isWarnEnabled()) {
+              LOG.warn("Ignoring out-of-range order [" + order
+                  + "] of plugin [" + pluginId + "] for mime type ["
+                  + mimeTypeStr + "], appending it instead");
+            }
+            plugList.add(extId);
+          }
+        }
+
+        // now add the plugin list and map it to this mimeType
+        pList.setPluginList(mimeTypeStr, plugList);
+
+      } else if (LOG.isWarnEnabled()) {
+        LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: "
+            + mimeTypeStr + ", continuing parse");
+      }
+    }
+    return pList;
+  }
+
+  /**
+   * Converts the value of an <code>order</code> attribute to a number.
+   * 
+   * @param orderStr
+   *          raw attribute value
+   * @return the parsed number, or {@link #NO_ORDER} if the value is not a
+   *         valid integer (which includes an absent, i.e. empty, attribute)
+   */
+  private static int parseOrder(String orderStr) {
+    try {
+      return Integer.parseInt(orderStr);
+    } catch (NumberFormatException ignored) {
+      // no usable order given: the plugin is simply appended
+      return NO_ORDER;
+    }
+  }
+
+  /**
+   * Closes the given stream, logging instead of propagating a failure.
+   * 
+   * @param in
+   *          stream to close, may be <code>null</code>
+   */
+  private static void closeQuietly(InputStream in) {
+    if (in == null) {
+      return;
+    }
+    try {
+      in.close();
+    } catch (IOException e) {
+      LOG.debug("Failed to close parse plugins stream: " + e);
+    }
+  }
+
+  /**
+   * Tests parsing of the parse-plugins.xml file. An alternative location for
+   * the file can be specified via the <code>--file</code> option; the value is
+   * opened as a URL.
+   * 
+   * @param args
+   *          Currently only the --file argument to specify an alternative
+   *          location for the parse-plugins.xml file is supported.
+   */
+  public static void main(String[] args) throws Exception {
+    String parsePluginFile = null;
+    String usage = "ParsePluginsReader [--file <parse plugin file location>]";
+
+    if ((args.length != 0 && args.length != 2)
+        || (args.length == 2 && !"--file".equals(args[0]))) {
+      System.err.println(usage);
+      System.exit(1);
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("--file")) {
+        parsePluginFile = args[++i];
+      }
+    }
+
+    ParsePluginsReader reader = new ParsePluginsReader();
+
+    if (parsePluginFile != null) {
+      reader.setFParsePluginsFile(parsePluginFile);
+    }
+
+    ParsePluginList prefs = reader.parse(NutchConfiguration.create());
+    if (prefs == null) {
+      // parse() returns null when the file is not well-formed XML
+      System.err.println("Unable to parse the parse plugins file");
+      System.exit(1);
+    }
+
+    for (String mimeType : prefs.getSupportedMimeTypes()) {
+
+      System.out.println("MIMETYPE: " + mimeType);
+      List<String> plugList = prefs.getPluginList(mimeType);
+
+      System.out.println("EXTENSION IDs:");
+
+      for (String j : plugList) {
+        System.out.println(j);
+      }
+    }
+
+  }
+
+  /**
+   * @return Returns the fParsePluginsFile.
+   */
+  public String getFParsePluginsFile() {
+    return fParsePluginsFile;
+  }
+
+  /**
+   * @param parsePluginsFile
+   *          The fParsePluginsFile to set; it is opened as a URL by
+   *          {@link #parse(Configuration)}.
+   */
+  public void setFParsePluginsFile(String parsePluginsFile) {
+    fParsePluginsFile = parsePluginsFile;
+  }
+
+  /**
+   * Collects the plugin-id to extension-id aliases declared below the first
+   * <code>aliases</code> element.
+   * 
+   * @param parsePluginsRoot
+   *          root element of the parse-plugins document
+   * @return map from alias name to extension id, empty if none are declared
+   */
+  private Map<String, String> getAliases(Element parsePluginsRoot) {
+
+    Map<String, String> aliases = new HashMap<String, String>();
+    NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");
+
+    if (aliasRoot == null || aliasRoot.getLength() == 0) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("No aliases defined in parse-plugins.xml!");
+      }
+      return aliases;
+    }
+
+    if (aliasRoot.getLength() > 1) {
+      // log a warning, but try and continue processing
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("There should only be one \"aliases\" tag in parse-plugins.xml");
+      }
+    }
+
+    Element aliasRootElem = (Element) aliasRoot.item(0);
+    NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");
+
+    if (aliasElements != null && aliasElements.getLength() > 0) {
+      for (int i = 0; i < aliasElements.getLength(); i++) {
+        Element aliasElem = (Element) aliasElements.item(i);
+        String parsePluginId = aliasElem.getAttribute("name");
+        String extensionId = aliasElem.getAttribute("extension-id");
+        if (LOG.isTraceEnabled()) {
+          LOG.trace("Found alias: plugin-id: " + parsePluginId
+              + ", extension-id: " + extensionId);
+        }
+        if (parsePluginId != null && extensionId != null) {
+          aliases.put(parsePluginId, extensionId);
+        }
+      }
+    }
+    return aliases;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
new file mode 100644
index 0000000..92d8871
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.Text;
+
+/**
+ * A utility class that stores result of a parse. Internally a ParseResult
+ * stores &lt;{@link Text}, {@link Parse}&gt; pairs.
+ * <p>
+ * Parsers may return multiple results, which correspond to parts or other
+ * associated documents related to the original URL.
+ * </p>
+ * <p>
+ * There will be usually one parse result that corresponds directly to the
+ * original URL, and possibly many (or none) results that correspond to derived
+ * URLs (or sub-URLs).
+ */
+public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {
+  private Map<Text, Parse> parseMap;
+  private String originalUrl;
+
+  public static final Logger LOG = LoggerFactory.getLogger(ParseResult.class);
+
+  /**
+   * Create a container for parse results.
+   * 
+   * @param originalUrl
+   *          the original url from which all parse results have been obtained.
+   */
+  public ParseResult(String originalUrl) {
+    parseMap = new HashMap<Text, Parse>();
+    this.originalUrl = originalUrl;
+  }
+
+  /**
+   * Convenience method for obtaining {@link ParseResult} from a single
+   * <code>Parse</code> output.
+   * 
+   * @param url
+   *          canonical url.
+   * @param parse
+   *          single parse output.
+   * @return result containing the single parse output.
+   */
+  public static ParseResult createParseResult(String url, Parse parse) {
+    ParseResult parseResult = new ParseResult(url);
+    parseResult.put(new Text(url), new ParseText(parse.getText()),
+        parse.getData());
+    return parseResult;
+  }
+
+  /**
+   * Checks whether the result is empty.
+   * 
+   * @return
+   */
+  public boolean isEmpty() {
+    return parseMap.isEmpty();
+  }
+
+  /**
+   * Return the number of parse outputs (both successful and failed)
+   */
+  public int size() {
+    return parseMap.size();
+  }
+
+  /**
+   * Retrieve a single parse output.
+   * 
+   * @param key
+   *          sub-url under which the parse output is stored.
+   * @return parse output corresponding to this sub-url, or null.
+   */
+  public Parse get(String key) {
+    return get(new Text(key));
+  }
+
+  /**
+   * Retrieve a single parse output.
+   * 
+   * @param key
+   *          sub-url under which the parse output is stored.
+   * @return parse output corresponding to this sub-url, or null.
+   */
+  public Parse get(Text key) {
+    return parseMap.get(key);
+  }
+
+  /**
+   * Store a result of parsing.
+   * 
+   * @param key
+   *          URL or sub-url of this parse result
+   * @param text
+   *          plain text result
+   * @param data
+   *          corresponding parse metadata of this result
+   */
+  public void put(Text key, ParseText text, ParseData data) {
+    put(key.toString(), text, data);
+  }
+
+  /**
+   * Store a result of parsing.
+   * 
+   * @param key
+   *          URL or sub-url of this parse result
+   * @param text
+   *          plain text result
+   * @param data
+   *          corresponding parse metadata of this result
+   */
+  public void put(String key, ParseText text, ParseData data) {
+    parseMap.put(new Text(key),
+        new ParseImpl(text, data, key.equals(originalUrl)));
+  }
+
+  /**
+   * Iterate over all entries in the &lt;url, Parse&gt; map.
+   */
+  public Iterator<Entry<Text, Parse>> iterator() {
+    return parseMap.entrySet().iterator();
+  }
+
+  /**
+   * Remove all results where status is not successful (as determined by
+   * </code>ParseStatus#isSuccess()</code>). Note that effects of this operation
+   * cannot be reversed.
+   */
+  public void filter() {
+    for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
+      Entry<Text, Parse> entry = i.next();
+      if (!entry.getValue().getData().getStatus().isSuccess()) {
+        LOG.warn(entry.getKey() + " is not parsed successfully, filtering");
+        i.remove();
+      }
+    }
+
+  }
+
+  /**
+   * A convenience method which returns true only if all parses are successful.
+   * Parse success is determined by <code>ParseStatus#isSuccess()</code>.
+   */
+  public boolean isSuccess() {
+    for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
+      Entry<Text, Parse> entry = i.next();
+      if (!entry.getValue().getData().getStatus().isSuccess()) {
+        return false;
+      }
+    }
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
new file mode 100644
index 0000000..b008bed
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
@@ -0,0 +1,309 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.segment.SegmentChecker;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.*;
+import org.apache.hadoop.fs.Path;
+
+import java.io.*;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.Map.Entry;
+
+/**
+ * Parse content in a segment.
+ * <p>
+ * The class is at once the command-line tool, the mapper and the reducer of
+ * the parse job that {@link #parse(Path)} configures and runs.
+ * </p>
+ */
+public class ParseSegment extends NutchTool implements Tool,
+    Mapper<WritableComparable<?>, Content, Text, ParseImpl>,
+    Reducer<Text, Writable, Text, Writable> {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);
+
+  /** Boolean property: skip content detected as truncated (default true). */
+  public static final String SKIP_TRUNCATED = "parser.skip.truncated";
+
+  private ScoringFilters scfilters;
+
+  /** Created lazily on the first call of map(). */
+  private ParseUtil parseUtil;
+
+  private boolean skipTruncated;
+
+  public ParseSegment() {
+    this(null);
+  }
+
+  public ParseSegment(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * Takes over the job configuration and reads the settings used by the
+   * mapper.
+   */
+  public void configure(JobConf job) {
+    setConf(job);
+    this.scfilters = new ScoringFilters(job);
+    skipTruncated = job.getBoolean(SKIP_TRUNCATED, true);
+  }
+
+  public void close() {
+  }
+
+  private Text newKey = new Text();
+
+  /**
+   * Parses one fetched document and emits one {@link ParseImpl} per parse
+   * result. Documents without a successful fetch status, truncated documents
+   * (when skipping is enabled) and documents whose parsing throws are skipped.
+   * 
+   * @param key
+   *          key of the content record, used for log messages only
+   * @param content
+   *          fetched content to parse
+   * @param output
+   *          receives &lt;url, parse&gt; pairs
+   * @param reporter
+   *          receives the per-status counters of group "ParserStatus"
+   */
+  public void map(WritableComparable<?> key, Content content,
+      OutputCollector<Text, ParseImpl> output, Reporter reporter)
+      throws IOException {
+    // convert on the fly from old UTF8 keys
+    if (key instanceof Text) {
+      newKey.set(key.toString());
+      key = newKey;
+    }
+
+    String fetchStatus = content.getMetadata().get(Nutch.FETCH_STATUS_KEY);
+    if (fetchStatus == null) {
+      // Integer.parseInt(null) would throw and fail the whole task
+      LOG.debug("Skipping " + key + " as content has no fetch status");
+      return;
+    }
+    int status = Integer.parseInt(fetchStatus);
+    if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
+      // content not fetched successfully, skip document
+      LOG.debug("Skipping " + key + " as content is not fetched successfully");
+      return;
+    }
+
+    if (skipTruncated && isTruncated(content)) {
+      return;
+    }
+
+    long start = System.currentTimeMillis();
+    ParseResult parseResult = null;
+    try {
+      if (parseUtil == null)
+        parseUtil = new ParseUtil(getConf());
+      parseResult = parseUtil.parse(content);
+    } catch (Exception e) {
+      LOG.warn("Error parsing: " + key + ": "
+          + StringUtils.stringifyException(e));
+      return;
+    }
+
+    for (Entry<Text, Parse> entry : parseResult) {
+      Text url = entry.getKey();
+      Parse parse = entry.getValue();
+      ParseStatus parseStatus = parse.getData().getStatus();
+
+      reporter.incrCounter("ParserStatus",
+          ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
+
+      if (!parseStatus.isSuccess()) {
+        LOG.warn("Error parsing: " + key + ": " + parseStatus);
+        // a failed parse is still emitted, as an empty parse with its status
+        parse = parseStatus.getEmptyParse(getConf());
+      }
+
+      // pass segment name to parse data
+      parse.getData().getContentMeta()
+          .set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY));
+
+      // compute the new signature
+      byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+          content, parse);
+      parse.getData().getContentMeta()
+          .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
+
+      try {
+        scfilters.passScoreAfterParsing(url, content, parse);
+      } catch (ScoringFilterException e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Error passing score: " + url + ": " + e.getMessage());
+        }
+      }
+
+      long end = System.currentTimeMillis();
+      LOG.info("Parsed (" + Long.toString(end - start) + "ms):" + url);
+
+      output.collect(
+          url,
+          new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse
+              .isCanonical()));
+    }
+  }
+
+  /**
+   * Checks if the page's content is truncated, by comparing the length
+   * announced in the content-length metadata with the number of bytes
+   * actually stored.
+   * 
+   * @param content
+   *          content to check
+   * @return If the page is truncated <code>true</code>. When it is not, or when
+   *         it could not be determined (no content bytes, no metadata, no or
+   *         unparsable content length), <code>false</code>.
+   */
+  public static boolean isTruncated(Content content) {
+    byte[] contentBytes = content.getContent();
+    if (contentBytes == null)
+      return false;
+    Metadata metadata = content.getMetadata();
+    if (metadata == null)
+      return false;
+
+    String lengthStr = metadata.get(Response.CONTENT_LENGTH);
+    if (lengthStr != null)
+      lengthStr = lengthStr.trim();
+    if (StringUtil.isEmpty(lengthStr)) {
+      return false;
+    }
+    int inHeaderSize;
+    String url = content.getUrl();
+    try {
+      inHeaderSize = Integer.parseInt(lengthStr);
+    } catch (NumberFormatException e) {
+      LOG.warn("Wrong contentlength format for " + url, e);
+      return false;
+    }
+    int actualSize = contentBytes.length;
+    if (inHeaderSize > actualSize) {
+      LOG.info(url + " skipped. Content of size " + inHeaderSize
+          + " was truncated to " + actualSize);
+      return true;
+    }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
+          + inHeaderSize);
+    }
+    return false;
+  }
+
+  /**
+   * Emits only the first value of each key.
+   */
+  public void reduce(Text key, Iterator<Writable> values,
+      OutputCollector<Text, Writable> output, Reporter reporter)
+      throws IOException {
+    output.collect(key, values.next()); // collect first value
+  }
+
+  /**
+   * Configures and runs the parse job for one segment. A segment that
+   * {@link SegmentChecker} reports as already parsed is skipped with a
+   * warning.
+   * 
+   * @param segment
+   *          segment directory; its content is the job input and the segment
+   *          itself is the job output path
+   * @throws IOException
+   *           if checking the segment or running the job fails
+   */
+  public void parse(Path segment) throws IOException {
+    if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
+      LOG.warn("Segment: " + segment
+          + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854
+      return;
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    if (LOG.isInfoEnabled()) {
+      LOG.info("ParseSegment: starting at " + sdf.format(start));
+      LOG.info("ParseSegment: segment: " + segment);
+    }
+
+    JobConf job = new NutchJob(getConf());
+    job.setJobName("parse " + segment);
+
+    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setMapperClass(ParseSegment.class);
+    job.setReducerClass(ParseSegment.class);
+
+    FileOutputFormat.setOutputPath(job, segment);
+    job.setOutputFormat(ParseOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(ParseImpl.class);
+
+    JobClient.runJob(job);
+    long end = System.currentTimeMillis();
+    LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(),
+        args);
+    System.exit(res);
+  }
+
+  /**
+   * Command-line entry point of the tool.
+   * 
+   * @param args
+   *          the segment to parse, optionally followed by
+   *          <code>-noFilter</code> and/or <code>-noNormalize</code>
+   *          (case-insensitive); other options are ignored
+   * @return 0 after the segment has been parsed, -1 if no segment was given
+   */
+  public int run(String[] args) throws Exception {
+    Path segment;
+
+    String usage = "Usage: ParseSegment segment [-noFilter] [-noNormalize]";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      // report the failure to the caller (main() exits with this code)
+      // rather than terminating the JVM from inside the tool
+      return -1;
+    }
+
+    for (int i = 1; i < args.length; i++) {
+      String param = args[i];
+
+      if ("-nofilter".equalsIgnoreCase(param)) {
+        getConf().setBoolean("parse.filter.urls", false);
+      } else if ("-nonormalize".equalsIgnoreCase(param)) {
+        getConf().setBoolean("parse.normalize.urls", false);
+      }
+    }
+
+    segment = new Path(args[0]);
+    parse(segment);
+    return 0;
+  }
+
+  /**
+   * Used for Nutch REST service.
+   * 
+   * @param args
+   *          may hold the segment under {@link Nutch#ARG_SEGMENT} (a
+   *          {@link Path} or anything whose string form is a path); the
+   *          presence of the keys "nofilter" and "nonormalize" disables URL
+   *          filtering and normalizing
+   * @param crawlId
+   *          when no segment is given, the most recently modified entry of
+   *          the local directory <code>crawlId/segments</code> is parsed
+   * @return map holding "0" under {@link Nutch#VAL_RESULT}
+   * @throws FileNotFoundException
+   *           if no segment is given and the segments directory is missing or
+   *           empty
+   */
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+    Path segment;
+    Object seg = args.get(Nutch.ARG_SEGMENT);
+    if (seg instanceof Path) {
+      segment = (Path) seg;
+    } else if (seg != null) {
+      segment = new Path(seg.toString());
+    } else {
+      segment = newestSegment(crawlId + "/segments");
+    }
+
+    if (args.containsKey("nofilter")) {
+      getConf().setBoolean("parse.filter.urls", false);
+    }
+    if (args.containsKey("nonormalize")) {
+      getConf().setBoolean("parse.normalize.urls", false);
+    }
+    parse(segment);
+    results.put(Nutch.VAL_RESULT, Integer.toString(0));
+    return results;
+  }
+
+  /**
+   * Finds the most recently modified entry of a local segments directory.
+   * 
+   * @param segmentsDirName
+   *          name of the local directory holding the segments
+   * @return path of the entry with the latest modification time
+   * @throws FileNotFoundException
+   *           if the directory cannot be listed or has no entries
+   */
+  private static Path newestSegment(String segmentsDirName)
+      throws FileNotFoundException {
+    // listFiles() yields null for a missing or unreadable directory
+    File[] candidates = new File(segmentsDirName).listFiles();
+    if (candidates == null || candidates.length == 0) {
+      throw new FileNotFoundException("No segments found in "
+          + segmentsDirName);
+    }
+    // a single scan replaces sorting with a comparator that never returned a
+    // positive value and therefore broke the Comparator contract
+    File newest = candidates[0];
+    long newestTime = newest.lastModified();
+    for (File candidate : candidates) {
+      long candidateTime = candidate.lastModified();
+      if (candidateTime > newestTime) {
+        newest = candidate;
+        newestTime = candidateTime;
+      }
+    }
+    return new Path(newest.getPath());
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
new file mode 100644
index 0000000..b9d5959
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Created on Apr 28, 2005
+ * Author: Andrzej Bialecki &lt;ab@getopt.org&gt;
+ *
+ */
+package org.apache.nutch.parse;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class ParseStatus implements Writable {
+
+  /**
+   * Format version written as the first byte by {@link #write(DataOutput)};
+   * {@link #readFields(DataInput)} accepts versions 1 and 2.
+   */
+  private final static byte VERSION = 2;
+
+  // Primary status codes:
+
+  /** Parsing was not performed. */
+  public static final byte NOTPARSED = 0;
+  /** Parsing succeeded. */
+  public static final byte SUCCESS = 1;
+  /** General failure. There may be a more specific error message in arguments. */
+  public static final byte FAILED = 2;
+
+  /** Names of the primary status codes, indexed by the code value. */
+  public static final String[] majorCodes = { "notparsed", "success", "failed" };
+
+  // Secondary success codes go here:
+
+  /**
+   * Parsed content contains a directive to redirect to another URL. The target
+   * URL can be retrieved from the arguments.
+   */
+  public static final short SUCCESS_REDIRECT = 100;
+
+  // Secondary failure codes go here:
+
+  /**
+   * Parsing failed. An Exception occurred (which may be retrieved from the
+   * arguments).
+   */
+  public static final short FAILED_EXCEPTION = 200;
+  /**
+   * Parsing failed. Content was truncated, but the parser cannot handle
+   * incomplete content.
+   */
+  public static final short FAILED_TRUNCATED = 202;
+  /**
+   * Parsing failed. Invalid format - the content may be corrupted or of wrong
+   * type.
+   */
+  public static final short FAILED_INVALID_FORMAT = 203;
+  /**
+   * Parsing failed. Other related parts of the content are needed to complete
+   * parsing. The list of URLs to missing parts may be provided in arguments.
+   * The Fetcher may decide to fetch these parts at once, then put them into
+   * Content.metadata, and supply them for re-parsing.
+   */
+  public static final short FAILED_MISSING_PARTS = 204;
+  /**
+   * Parsing failed. There was no content to be parsed - probably caused by
+   * errors at protocol stage.
+   */
+  public static final short FAILED_MISSING_CONTENT = 205;
+
+  /** Shared status with primary code {@link #NOTPARSED} and no arguments. */
+  public static final ParseStatus STATUS_NOTPARSED = new ParseStatus(NOTPARSED);
+  /** Shared status with primary code {@link #SUCCESS} and no arguments. */
+  public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
+  /** Shared status with primary code {@link #FAILED} and no arguments. */
+  public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
+
+  /** Primary status code; defaults to 0, i.e. {@link #NOTPARSED}. */
+  private byte majorCode = 0;
+  /** Secondary status code; 0 when none is given. */
+  private short minorCode = 0;
+  /** Optional arguments of the status (e.g. a message); may be null. */
+  private String[] args = null;
+
+  /**
+   * @return the format version this class writes.
+   */
+  public byte getVersion() {
+    return VERSION;
+  }
+
+  /** Creates a status with all fields at their defaults. */
+  public ParseStatus() {
+
+  }
+
+  /**
+   * Creates a status from its three parts. The codes are narrowed to
+   * <code>byte</code> and <code>short</code> respectively.
+   * 
+   * @param majorCode
+   *          primary status code
+   * @param minorCode
+   *          secondary status code
+   * @param args
+   *          arguments of the status, may be null
+   */
+  public ParseStatus(int majorCode, int minorCode, String[] args) {
+    this.majorCode = (byte) majorCode;
+    this.minorCode = (short) minorCode;
+    this.args = args;
+  }
+
+  /** Creates a status with the given primary code, no secondary code and no arguments. */
+  public ParseStatus(int majorCode) {
+    this(majorCode, 0, (String[]) null);
+  }
+
+  /** Creates a status with the given primary code and arguments, and no secondary code. */
+  public ParseStatus(int majorCode, String[] args) {
+    this(majorCode, 0, args);
+  }
+
+  /** Creates a status with the given primary and secondary codes and no arguments. */
+  public ParseStatus(int majorCode, int minorCode) {
+    this(majorCode, minorCode, (String[]) null);
+  }
+
+  /** Simplified constructor: the text message becomes the only argument. */
+  public ParseStatus(int majorCode, int minorCode, String message) {
+    this(majorCode, minorCode, new String[] { message });
+  }
+
+  /** Simplified constructor: the text message becomes the only argument; no secondary code. */
+  public ParseStatus(int majorCode, String message) {
+    this(majorCode, 0, new String[] { message });
+  }
+
+  /**
+   * Creates a {@link #FAILED} / {@link #FAILED_EXCEPTION} status whose only
+   * argument is the string form of the given throwable.
+   */
+  public ParseStatus(Throwable t) {
+    this(FAILED, FAILED_EXCEPTION, new String[] { t.toString() });
+  }
+
+  /**
+   * Creates a new status and fills it from the given input.
+   * 
+   * @param in
+   *          input to read the serialized status from
+   * @return the status read
+   * @throws IOException
+   *           if reading fails
+   */
+  public static ParseStatus read(DataInput in) throws IOException {
+    ParseStatus status = new ParseStatus();
+    status.readFields(in);
+    return status;
+  }
+
+  /**
+   * Reads this status from the given input. Versions 1 and 2 differ only in
+   * how the arguments are stored: version 1 uses a compressed string array,
+   * version 2 a plain one.
+   * 
+   * @throws VersionMismatchException
+   *           if the leading version byte is neither 1 nor 2; nothing else
+   *           has been read at that point
+   */
+  public void readFields(DataInput in) throws IOException {
+    byte version = in.readByte();
+    if (version != 1 && version != 2) {
+      throw new VersionMismatchException(VERSION, version);
+    }
+    majorCode = in.readByte();
+    minorCode = in.readShort();
+    if (version == 1) {
+      args = WritableUtils.readCompressedStringArray(in);
+    } else {
+      args = WritableUtils.readStringArray(in);
+    }
+  }
+
+  /**
+   * Writes the version byte, both status codes and the arguments. Absent
+   * arguments are written as the single int -1.
+   */
+  public void write(DataOutput out) throws IOException {
+    out.writeByte(VERSION);
+    out.writeByte(majorCode);
+    out.writeShort(minorCode);
+    if (args != null) {
+      WritableUtils.writeStringArray(out, args);
+    } else {
+      out.writeInt(-1);
+    }
+  }
+
+  /**
+   * A convenience method.
+   *
+   * @return {@code true} if the major code equals {@code SUCCESS},
+   *         {@code false} otherwise
+   */
+  public boolean isSuccess() {
+    return majorCode == SUCCESS;
+  }
+
+  /**
+   * A convenience method. Return a String representation of the first argument,
+   * or null.
+   */
+  public String getMessage() {
+    if (args != null && args.length > 0 && args[0] != null)
+      return args[0];
+    return null;
+  }
+
+  /**
+   * Returns the argument array itself (not a copy), or {@code null} if no
+   * arguments are set.
+   */
+  public String[] getArgs() {
+    return args;
+  }
+
+  /** Returns the major code, widened from its byte storage to an int. */
+  public int getMajorCode() {
+    return majorCode;
+  }
+
+  /** Returns the minor code, widened from its short storage to an int. */
+  public int getMinorCode() {
+    return minorCode;
+  }
+
+  /**
+   * A convenience method. Creates an empty Parse instance, which returns this
+   * status.
+   *
+   * @param conf
+   *          passed through to the empty parse implementation
+   * @return a new empty {@link Parse} whose data carries this status
+   */
+  public Parse getEmptyParse(Configuration conf) {
+    return new EmptyParseImpl(this, conf);
+  }
+
+  /**
+   * A convenience method. Creates an empty ParseResult, which contains this
+   * status.
+   *
+   * @param url
+   *          URL handed to {@code ParseResult.createParseResult}
+   * @param conf
+   *          passed through to {@link #getEmptyParse(Configuration)}
+   * @return the result of {@code ParseResult.createParseResult} for
+   *         {@code url} and the empty parse of this status
+   */
+  public ParseResult getEmptyParseResult(String url, Configuration conf) {
+    return ParseResult.createParseResult(url, getEmptyParse(conf));
+  }
+
+  /**
+   * Renders this status as {@code NAME(major,minor)} followed by its
+   * arguments.
+   * <p>
+   * {@code NAME} comes from {@code majorCodes}, or is {@code UNKNOWN!} when the
+   * major code is outside that table. A single argument is appended as
+   * {@code ": value"} (a {@code null} value prints as {@code null}); with any
+   * other number of arguments each non-null one is appended as
+   * {@code ", args[i]=value"}.
+   */
+  @Override
+  public String toString() {
+    // StringBuilder: the buffer never leaves this method, so the
+    // synchronization of StringBuffer buys nothing.
+    StringBuilder res = new StringBuilder();
+    String name;
+    if (majorCode >= 0 && majorCode < majorCodes.length)
+      name = majorCodes[majorCode];
+    else
+      name = "UNKNOWN!";
+    res.append(name).append('(').append(majorCode).append(',')
+        .append(minorCode).append(')');
+    if (args != null) {
+      if (args.length == 1) {
+        res.append(": ").append(args[0]);
+      } else {
+        for (int i = 0; i < args.length; i++) {
+          if (args[i] != null)
+            res.append(", args[").append(i).append("]=").append(args[i]);
+        }
+      }
+    }
+    return res.toString();
+  }
+
+  /**
+   * Replaces the argument array. The given array is stored as-is (not copied)
+   * and may be {@code null}.
+   */
+  public void setArgs(String[] args) {
+    this.args = args;
+  }
+
+  public void setMessage(String msg) {
+    if (args == null || args.length == 0) {
+      args = new String[1];
+    }
+    args[0] = msg;
+  }
+
+  /** Sets the major status code. */
+  public void setMajorCode(byte majorCode) {
+    this.majorCode = majorCode;
+  }
+
+  /** Sets the minor status code. */
+  public void setMinorCode(short minorCode) {
+    this.minorCode = minorCode;
+  }
+
+  public boolean equals(Object o) {
+    if (o == null)
+      return false;
+    if (!(o instanceof ParseStatus))
+      return false;
+    boolean res = true;
+    ParseStatus other = (ParseStatus) o;
+    res = res && (this.majorCode == other.majorCode)
+        && (this.minorCode == other.minorCode);
+    if (!res)
+      return res;
+    if (this.args == null) {
+      if (other.args == null)
+        return true;
+      else
+        return false;
+    } else {
+      if (other.args == null)
+        return false;
+      if (other.args.length != this.args.length)
+        return false;
+      for (int i = 0; i < this.args.length; i++) {
+        if (!this.args[i].equals(other.args[i]))
+          return false;
+      }
+    }
+    return true;
+  }
+
+  private static class EmptyParseImpl implements Parse {
+
+    private ParseData data = null;
+
+    public EmptyParseImpl(ParseStatus status, Configuration conf) {
+      data = new ParseData(status, "", new Outlink[0], new Metadata(),
+          new Metadata());
+    }
+
+    public ParseData getData() {
+      return data;
+    }
+
+    public String getText() {
+      return "";
+    }
+
+    public boolean isCanonical() {
+      return true;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
new file mode 100644
index 0000000..13416cf
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.commons.cli.Options;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * The text conversion of a page's content.
+ * <p>
+ * The serialized form starts with a version byte. Version 1 records hold the
+ * text as a compressed string; records written by this class (version
+ * {@value #VERSION}) hold it as a plain Hadoop string.
+ *
+ * @see Parse#getText()
+ */
+public final class ParseText implements Writable {
+  /** Name of the segment sub-directory read by {@link #main(String[])}. */
+  public static final String DIR_NAME = "parse_text";
+
+  /** Version byte written in front of every record. */
+  private final static byte VERSION = 2;
+
+  /** The page text; {@code null} until set by a constructor or readFields. */
+  private String text;
+
+  /** Creates an instance without text, to be filled by {@link #readFields}. */
+  public ParseText() {
+  }
+
+  /** Creates an instance holding the given text. */
+  public ParseText(String text) {
+    this.text = text;
+  }
+
+  /**
+   * Reads a record written in either the version 1 (compressed) or the
+   * current (plain) layout.
+   *
+   * @throws VersionMismatchException
+   *           if the leading version byte is not a known version
+   * @throws IOException
+   *           if reading from {@code in} fails
+   */
+  public void readFields(DataInput in) throws IOException {
+    byte version = in.readByte();
+    switch (version) {
+    case 1:
+      text = WritableUtils.readCompressedString(in);
+      break;
+    case VERSION:
+      text = Text.readString(in);
+      break;
+    default:
+      throw new VersionMismatchException(VERSION, version);
+    }
+  }
+
+  /**
+   * Writes the version byte followed by the text as a plain Hadoop string.
+   *
+   * @throws IOException
+   *           if writing to {@code out} fails
+   */
+  public final void write(DataOutput out) throws IOException {
+    // writeByte mirrors the readByte in readFields; it emits the same single
+    // byte as write(int).
+    out.writeByte(VERSION);
+    Text.writeString(out, text);
+  }
+
+  /** Creates a new instance and populates it from {@code in}. */
+  public final static ParseText read(DataInput in) throws IOException {
+    ParseText parseText = new ParseText();
+    parseText.readFields(in);
+    return parseText;
+  }
+
+  //
+  // Accessor methods
+  //
+  /** Returns the text, or {@code null} if none has been set. */
+  public String getText() {
+    return text;
+  }
+
+  /**
+   * Two instances are equal when their texts are equal. An instance created
+   * with the no-argument constructor has a {@code null} text, so the
+   * comparison is null-safe.
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof ParseText))
+      return false;
+    ParseText other = (ParseText) o;
+    return this.text == null ? other.text == null : this.text
+        .equals(other.text);
+  }
+
+  /** Hash code consistent with {@link #equals(Object)}. */
+  @Override
+  public int hashCode() {
+    return text == null ? 0 : text.hashCode();
+  }
+
+  /** Returns the text itself (which may be {@code null}). */
+  @Override
+  public String toString() {
+    return text;
+  }
+
+  /**
+   * Command-line entry point: prints one record of a segment's
+   * {@code parse_text} directory.
+   * <p>
+   * After the generic Hadoop options are consumed, the first remaining
+   * argument is the record number and the second the segment path.
+   */
+  public static void main(String argv[]) throws Exception {
+    String usage = "ParseText (-local | -dfs <namenode:port>) recno segment";
+
+    if (argv.length < 3) {
+      System.out.println("usage:" + usage);
+      return;
+    }
+    Options opts = new Options();
+    Configuration conf = NutchConfiguration.create();
+
+    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
+    String[] remainingArgs = parser.getRemainingArgs();
+    // The option parser may have consumed arguments; make sure the two
+    // positional ones are still there before indexing into the array.
+    if (remainingArgs.length < 2) {
+      System.out.println("usage:" + usage);
+      return;
+    }
+
+    FileSystem fs = FileSystem.get(conf);
+    try {
+      int recno = Integer.parseInt(remainingArgs[0]);
+      String segment = remainingArgs[1];
+      String filename = new Path(segment, ParseText.DIR_NAME).toString();
+
+      ParseText parseText = new ParseText();
+      ArrayFile.Reader parseTexts = new ArrayFile.Reader(fs, filename, conf);
+      try {
+        parseTexts.get(recno, parseText);
+        System.out.println("Retrieved " + recno + " from file " + filename);
+        System.out.println(parseText);
+      } finally {
+        // Close the reader even when the lookup fails.
+        parseTexts.close();
+      }
+    } finally {
+      fs.close();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
new file mode 100644
index 0000000..39024dc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// Commons Logging imports
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.protocol.Content;
+
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
+/**
+ * A Utility class containing methods to simply perform parsing utilities such
+ * as iterating through a preferred list of {@link Parser}s to obtain
+ * {@link Parse} objects.
+ * 
+ * @author mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ * @author S&eacute;bastien Le Callonnec
+ */
+public class ParseUtil {
+
+  /* our log stream */
+  public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
+  private final ParserFactory parserFactory;
+  /** Parser timeout in seconds, 30 by default. -1 deactivates the timeout. */
+  private final int maxParseTime;
+  /** Pool of daemon threads on which time-limited parses run. */
+  private final ExecutorService executorService;
+
+  /**
+   * Creates the parser factory and the parsing thread pool, and reads the
+   * timeout from the <code>parser.timeout</code> property.
+   * 
+   * @param conf
+   *          the configuration to read settings from
+   */
+  public ParseUtil(Configuration conf) {
+    this.parserFactory = new ParserFactory(conf);
+    maxParseTime = conf.getInt("parser.timeout", 30);
+    executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
+        .setNameFormat("parse-%d").setDaemon(true).build());
+  }
+
+  /**
+   * Performs a parse by iterating through a List of preferred {@link Parser}s
+   * until a successful parse is performed and a {@link Parse} object is
+   * returned. If the parse is unsuccessful, a message is logged to the
+   * <code>WARNING</code> level, and an empty parse is returned.
+   * 
+   * @param content
+   *          The content to try and parse.
+   * @return &lt;key, {@link Parse}&gt; pairs.
+   * @throws ParseException
+   *           If no suitable parser is found to perform the parse.
+   */
+  public ParseResult parse(Content content) throws ParseException {
+    Parser[] parsers = null;
+
+    try {
+      parsers = this.parserFactory.getParsers(content.getContentType(),
+          content.getUrl() != null ? content.getUrl() : "");
+    } catch (ParserNotFound e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("No suitable parser found when trying to parse content "
+            + content.getUrl() + " of type " + content.getContentType());
+      }
+      throw toParseException(e);
+    }
+
+    for (int i = 0; i < parsers.length; i++) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i]
+            + "]");
+      }
+      ParseResult parseResult = runWithOptionalTimeout(parsers[i], content);
+
+      if (parseResult != null && !parseResult.isEmpty())
+        return parseResult;
+    }
+
+    return failedParse(content);
+  }
+
+  /**
+   * Method parses a {@link Content} object using the {@link Parser} specified
+   * by the parameter <code>extId</code>, i.e., the Parser's extension ID. If a
+   * suitable {@link Parser} is not found, then a <code>WARNING</code> level
+   * message is logged, and a ParseException is thrown. If the parse is
+   * unsuccessful for any other reason, then a <code>WARNING</code> level
+   * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
+   * returned.
+   * 
+   * @param extId
+   *          The extension implementation ID of the {@link Parser} to use to
+   *          parse the specified content.
+   * @param content
+   *          The content to parse.
+   * 
+   * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful,
+   *         otherwise, a single &lt;key,
+   *         <code>ParseStatus.getEmptyParse()</code>&gt; pair.
+   * 
+   * @throws ParseException
+   *           If there is no suitable {@link Parser} found to perform the
+   *           parse.
+   */
+  public ParseResult parseByExtensionId(String extId, Content content)
+      throws ParseException {
+    Parser p = null;
+
+    try {
+      p = this.parserFactory.getParserById(extId);
+    } catch (ParserNotFound e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("No suitable parser found when trying to parse content "
+            + content.getUrl() + " of type " + content.getContentType());
+      }
+      throw toParseException(e);
+    }
+
+    ParseResult parseResult = runWithOptionalTimeout(p, content);
+    if (parseResult != null && !parseResult.isEmpty()) {
+      return parseResult;
+    }
+    return failedParse(content);
+  }
+
+  /**
+   * Wraps a lookup failure in a ParseException with the same message, keeping
+   * the original exception as the cause so its stack trace is not lost.
+   */
+  private static ParseException toParseException(ParserNotFound e) {
+    ParseException pe = new ParseException(e.getMessage());
+    // NOTE(review): assumes ParseException(String) leaves the cause unset, as
+    // a plain super(message) call does - confirm.
+    pe.initCause(e);
+    return pe;
+  }
+
+  /** Runs the parser under the timeout unless the timeout is disabled (-1). */
+  private ParseResult runWithOptionalTimeout(Parser p, Content content) {
+    if (maxParseTime != -1)
+      return runParser(p, content);
+    return p.getParse(content);
+  }
+
+  /** Logs the failure and builds the empty result carrying a failed status. */
+  private ParseResult failedParse(Content content) {
+    if (LOG.isWarnEnabled()) {
+      LOG.warn("Unable to successfully parse content " + content.getUrl()
+          + " of type " + content.getContentType());
+    }
+    return new ParseStatus(new ParseException(
+        "Unable to successfully parse content")).getEmptyParseResult(
+        content.getUrl(), null);
+  }
+
+  /**
+   * Runs the parser on the thread pool and waits at most
+   * <code>maxParseTime</code> seconds for its result.
+   * 
+   * @return the parse result, or <code>null</code> if the parse timed out,
+   *         failed or was interrupted (the task is cancelled in those cases)
+   */
+  private ParseResult runParser(Parser p, Content content) {
+    Future<ParseResult> task = executorService.submit(new ParseCallable(p,
+        content));
+    try {
+      return task.get(maxParseTime, TimeUnit.SECONDS);
+    } catch (InterruptedException e) {
+      LOG.warn("Error parsing " + content.getUrl() + " with " + p, e);
+      task.cancel(true);
+      // Restore the interrupt flag so callers can still see the interruption.
+      Thread.currentThread().interrupt();
+    } catch (Exception e) {
+      LOG.warn("Error parsing " + content.getUrl() + " with " + p, e);
+      task.cancel(true);
+    }
+    return null;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java b/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
new file mode 100644
index 0000000..d101453
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Extension point for parsing the content produced by a
+ * {@link org.apache.nutch.protocol.Protocol} implementation. Nutch's core
+ * contains no page parsing code; implementations are supplied by extensions.
+ */
+public interface Parser extends Pluggable, Configurable {
+  /** The name of the extension point. */
+  String X_POINT_ID = Parser.class.getName();
+
+  /**
+   * <p>
+   * Parses the given content into a {@link ParseResult} of &lt;key,
+   * parse&gt; pairs. Each {@link Parse} will be persisted under its key.
+   * </p>
+   * <p>
+   * Note: Meta-redirects should be followed only when they are coming from the
+   * original URL. That is: <br>
+   * Assume fetcher is in parsing mode and is currently processing
+   * foo.bar.com/redirect.html. If this url contains a meta redirect to another
+   * url, fetcher should only follow the redirect if the result contains an
+   * entry of the form &lt;"foo.bar.com/redirect.html", {@link Parse} with a
+   * {@link ParseStatus} indicating the redirect&gt;.
+   * </p>
+   * 
+   * @param c
+   *          Content to be parsed
+   * @return the &lt;key, parse&gt; pairs produced for the content
+   * @since NUTCH-443
+   */
+  ParseResult getParse(Content c);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
new file mode 100644
index 0000000..7e5b146
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
@@ -0,0 +1,270 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.StringUtil;
+
+/**
+ * Parser checker, useful for testing parser. It also accurately reports
+ * possible fetching and parsing failures and presents protocol status signals
+ * to aid debugging. The tool enables us to retrieve the following data from any
+ * url:
+ * <ol>
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
+ * type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and
+ * is used to remove duplicates during the dedup procedure. It is calculated
+ * using {@link org.apache.nutch.crawl.MD5Signature} or
+ * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+ * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Title</tt>: of the URL</li>
+ * <li><tt>Outlinks</tt>: associated with the URL</li>
+ * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
+ * <i>Cache-Control</i>, etc.</li>
+ * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
+ * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length depending
+ * on <code>content.length</code> configuration.</li>
+ * </ol>
+ * 
+ * @author John Xing
+ */
+
+public class ParserChecker implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ParserChecker.class);
+  private Configuration conf;
+
+  public ParserChecker() {
+  }
+
+  /**
+   * Fetches the URL given on the command line, parses the fetched content and
+   * reports the parse data (and optionally the parse text).
+   * 
+   * @param args
+   *          <code>[-dumpText] [-forceAs mimeType] [-md key=value] url</code>;
+   *          the URL must be the last argument
+   * @return 0 on success, -1 on a usage error or when fetching or parsing
+   *         fails
+   */
+  public int run(String[] args) throws Exception {
+    boolean dumpText = false;
+    boolean force = false;
+    String contentType = null;
+    String url = null;
+
+    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
+
+    if (args.length == 0) {
+      LOG.error(usage);
+      return (-1);
+    }
+
+    // used to simulate the metadata propagated from injection
+    HashMap<String, String> metadata = new HashMap<String, String>();
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-forceAs")) {
+        // The option needs a value; report usage instead of running off the
+        // end of the array.
+        if (i + 1 >= args.length) {
+          LOG.error(usage);
+          return (-1);
+        }
+        force = true;
+        contentType = args[++i];
+      } else if (args[i].equals("-dumpText")) {
+        dumpText = true;
+      } else if (args[i].equals("-md")) {
+        if (i + 1 >= args.length) {
+          LOG.error(usage);
+          return (-1);
+        }
+        String k = null, v = null;
+        String nextOne = args[++i];
+        int firstEquals = nextOne.indexOf("=");
+        if (firstEquals != -1) {
+          k = nextOne.substring(0, firstEquals);
+          v = nextOne.substring(firstEquals + 1);
+        } else
+          k = nextOne;
+        metadata.put(k, v);
+      } else if (i != args.length - 1) {
+        // Return rather than System.exit so that callers running this Tool
+        // in-process keep control; main() still exits with this code.
+        LOG.error(usage);
+        return (-1);
+      } else {
+        url = URLUtil.toASCII(args[i]);
+      }
+    }
+
+    // No URL argument was given (or it could not be converted).
+    if (url == null) {
+      LOG.error(usage);
+      return (-1);
+    }
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("fetching: " + url);
+    }
+
+    CrawlDatum cd = new CrawlDatum();
+
+    for (Map.Entry<String, String> md : metadata.entrySet()) {
+      String value = md.getValue();
+      if (value == null)
+        value = "";
+      cd.getMetaData().put(new Text(md.getKey()), new Text(value));
+    }
+
+    ProtocolFactory factory = new ProtocolFactory(conf);
+    Protocol protocol = factory.getProtocol(url);
+    Text turl = new Text(url);
+    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
+
+    // If the configuration permits, handle redirects until we either run
+    // out of allowed redirects or we stop getting redirect statuses.
+    int maxRedirects = conf.getInt("http.redirect.max", 0);
+    int numRedirects = 0;
+    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
+        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
+        LOG.info("Handling redirect to " + newURL);
+
+        protocol = factory.getProtocol(newURL);
+        turl = new Text(newURL);
+        output = protocol.getProtocolOutput(turl, cd);
+
+        numRedirects++;
+    }
+
+    if (!output.getStatus().isSuccess()) {
+      System.err.println("Fetch failed with protocol status: "
+          + output.getStatus());
+
+      if (output.getStatus().isRedirect()) {
+          System.err.println("Redirect(s) not handled due to configuration.");
+          System.err.println("Max Redirects to handle per config: " + maxRedirects);
+          System.err.println("Number of Redirects handled: " + numRedirects);
+      }
+      return (-1);
+    }
+
+    Content content = output.getContent();
+
+    if (content == null) {
+      LOG.error("No content for " + url);
+      return (-1);
+    }
+
+    if (force) {
+      content.setContentType(contentType);
+    } else {
+      contentType = content.getContentType();
+    }
+
+    if (contentType == null) {
+      LOG.error("Failed to determine content type!");
+      return (-1);
+    }
+
+    if (ParseSegment.isTruncated(content)) {
+      LOG.warn("Content is truncated, parse may fail!");
+    }
+
+    ScoringFilters scfilters = new ScoringFilters(conf);
+    // call the scoring filters
+    try {
+      scfilters.passScoreBeforeParsing(turl, cd, content);
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e
+            + ")");
+        LOG.warn(StringUtils.stringifyException(e));
+      }
+    }
+
+    ParseResult parseResult = new ParseUtil(conf).parse(content);
+
+    if (parseResult == null) {
+      LOG.error("Parsing content failed!");
+      return (-1);
+    }
+
+    // Look the parse up by the URL that was actually fetched (turl follows
+    // redirects) and verify it exists before anything uses it.
+    Parse parse = parseResult.get(turl);
+    if (parse == null) {
+      LOG.error("Failed to get parse from parse result");
+      LOG.error("Available parses in parse result (by URL key):");
+      for (Map.Entry<Text, Parse> entry : parseResult) {
+        LOG.error("  " + entry.getKey());
+      }
+      LOG.error("Parse result does not contain a parse for URL to be checked:");
+      LOG.error("  " + turl);
+      return -1;
+    }
+
+    // Calculate the signature
+    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+        content, parse);
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("parsing: " + url);
+      LOG.info("contentType: " + contentType);
+      LOG.info("signature: " + StringUtil.toHexString(signature));
+    }
+
+    // call the scoring filters
+    try {
+      scfilters.passScoreAfterParsing(turl, content, parse);
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e
+            + ")");
+        LOG.warn(StringUtils.stringifyException(e));
+      }
+    }
+
+    for (Map.Entry<Text, Parse> entry : parseResult) {
+      parse = entry.getValue();
+      LOG.info("---------\nUrl\n---------------\n");
+      System.out.print(entry.getKey());
+      LOG.info("\n---------\nParseData\n---------\n");
+      System.out.print(parse.getData().toString());
+      if (dumpText) {
+        LOG.info("---------\nParseText\n---------\n");
+        System.out.print(parse.getText());
+      }
+    }
+
+    return 0;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration c) {
+    conf = c;
+  }
+
+  /** Runs the checker through {@link ToolRunner} and exits with its result. */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new ParserChecker(),
+        args);
+    System.exit(res);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
new file mode 100644
index 0000000..0982de4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
@@ -0,0 +1,428 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Vector;
+
+// SLF4J logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.ObjectCache;
+
+/** Creates and caches {@link Parser} plugins. */
+public final class ParserFactory {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ParserFactory.class);
+
+  /** Wildcard for default plugins. */
+  public static final String DEFAULT_PLUGIN = "*";
+
+  /**
+   * Sentinel stored in the object cache for content types that have no usable
+   * extension. It is recognised by reference comparison, so it is declared
+   * <code>static</code>: every factory then shares one instance explicitly
+   * instead of depending on {@link Collections#emptyList()} handing back the
+   * very same object on each call.
+   */
+  private static final List<Extension> EMPTY_EXTENSION_LIST = Collections
+      .<Extension> emptyList();
+
+  /** Configuration used to look up the object cache and plugin repository. */
+  private Configuration conf;
+
+  /** Extension point obtained for {@link Parser#X_POINT_ID}. */
+  private ExtensionPoint extensionPoint;
+
+  /** Source of the per-content-type plugin id lists and of plugin aliases. */
+  private ParsePluginList parsePluginList;
+
+  public ParserFactory(Configuration conf) {
+    this.conf = conf;
+    ObjectCache objectCache = ObjectCache.get(conf);
+    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
+        Parser.X_POINT_ID);
+    this.parsePluginList = (ParsePluginList) objectCache
+        .getObject(ParsePluginList.class.getName());
+
+    if (this.parsePluginList == null) {
+      this.parsePluginList = new ParsePluginsReader().parse(conf);
+      objectCache.setObject(ParsePluginList.class.getName(),
+          this.parsePluginList);
+    }
+
+    if (this.extensionPoint == null) {
+      throw new RuntimeException("x point " + Parser.X_POINT_ID + " not found.");
+    }
+    if (this.parsePluginList == null) {
+      throw new RuntimeException(
+          "Parse Plugins preferences could not be loaded.");
+    }
+  }
+
+  /**
+   * Returns the {@link Parser}s available for a given content type.
+   * <p>
+   * The extensions for the content type are resolved through
+   * {@link #getExtensions(String)}. Each one is then turned into a
+   * {@link Parser}, reusing the instance stored in the {@link ObjectCache}
+   * under the extension id when there is one and instantiating and caching it
+   * otherwise.
+   * <p>
+   * Plugins mapped to a content type in <code>parse-plugins.xml</code> but not
+   * enabled via <code>plugin.includes</code> are skipped. So, if the ordered
+   * list for <code>text/plain</code> was
+   * <code>[parse-text, parse-html, parse-rtf]</code> and only
+   * <code>parse-html</code> and <code>parse-rtf</code> were enabled, the
+   * result would hold two parsers, <code>[parse-html, parse-rtf]</code>.
+   * Extensions whose instantiation fails with a
+   * {@link PluginRuntimeException} are logged and left out as well.
+   *
+   * @param contentType
+   *          The content type to return the parsers for.
+   * @param url
+   *          The url of the content; only used to describe the failure when no
+   *          parser is found.
+   * @return The parsers for the given content type, in the order of the
+   *         resolved extensions. The array may be empty if every
+   *         instantiation failed.
+   * @throws ParserNotFound
+   *           If no extension is available for the content type.
+   */
+  public Parser[] getParsers(String contentType, String url)
+      throws ParserNotFound {
+
+    ObjectCache objectCache = ObjectCache.get(conf);
+
+    // TODO once MimeTypes is available: also try the mapped content type and,
+    // as a last chance, the type guessed from the url.
+    List<Extension> parserExts = getExtensions(contentType);
+    if (parserExts == null) {
+      throw new ParserNotFound(url, contentType);
+    }
+
+    // The list never leaves this call, so a plain ArrayList is enough.
+    List<Parser> parsers = new ArrayList<Parser>(parserExts.size());
+    for (Extension ext : parserExts) {
+      try {
+        // check to see if we've cached this parser instance yet
+        Parser p = (Parser) objectCache.getObject(ext.getId());
+        if (p == null) {
+          // go ahead and instantiate it and then cache it
+          p = (Parser) ext.getExtensionInstance();
+          objectCache.setObject(ext.getId(), p);
+        }
+        parsers.add(p);
+      } catch (PluginRuntimeException e) {
+        if (LOG.isWarnEnabled()) {
+          // Pass the exception along so the reason for the failure is logged.
+          LOG.warn("ParserFactory:PluginRuntimeException when "
+              + "initializing parser plugin "
+              + ext.getDescriptor().getPluginId() + " instance in getParsers "
+              + "function: attempting to continue instantiating parsers", e);
+        }
+      }
+    }
+    return parsers.toArray(new Parser[parsers.size()]);
+  }
+
+  /**
+   * Returns the {@link Parser} registered under the extension id
+   * <code>id</code>.
+   * <p>
+   * The id is first looked up directly among the extensions of the parser
+   * extension point, then through the aliases of the parse plugin list. The
+   * parser instance is taken from the {@link ObjectCache} when one is stored
+   * under the extension id; otherwise it is instantiated and stored there.
+   *
+   * @param id
+   *          The string extension ID (e.g.,
+   *          "org.apache.nutch.parse.rss.RSSParser",
+   *          "org.apache.nutch.parse.rtf.RTFParseFactory") of the
+   *          {@link Parser} implementation to return.
+   * @return A {@link Parser} implementation specified by the parameter
+   *         <code>id</code>.
+   * @throws ParserNotFound
+   *           If no extension is resolved for <code>id</code>, or if a
+   *           {@link PluginRuntimeException} occurs while instantiating the
+   *           {@link Parser}.
+   */
+  public Parser getParserById(String id) throws ParserNotFound {
+
+    Extension[] extensions = this.extensionPoint.getExtensions();
+    Extension parserExt = null;
+
+    ObjectCache objectCache = ObjectCache.get(conf);
+
+    if (id != null) {
+      parserExt = getExtension(extensions, id);
+    }
+    if (parserExt == null) {
+      parserExt = getExtensionFromAlias(extensions, id);
+    }
+
+    if (parserExt == null) {
+      throw new ParserNotFound("No Parser Found for id [" + id + "]");
+    }
+
+    // Look the cache up once, so the null check and the returned value are
+    // guaranteed to refer to the same object.
+    Parser cached = (Parser) objectCache.getObject(parserExt.getId());
+    if (cached != null) {
+      return cached;
+    }
+
+    // Not cached yet: instantiate the Parser and remember it.
+    try {
+      Parser p = (Parser) parserExt.getExtensionInstance();
+      objectCache.setObject(parserExt.getId(), p);
+      return p;
+    } catch (PluginRuntimeException e) {
+      if (LOG.isWarnEnabled()) {
+        // Hand the exception to the logger so its stack trace is kept.
+        LOG.warn("Cannot initialize parser "
+            + parserExt.getDescriptor().getPluginId() + " (cause: "
+            + e.toString() + ")", e);
+      }
+      throw new ParserNotFound("Cannot init parser for id [" + id + "]");
+    }
+  }
+
+  /**
+   * Finds the best-suited parse plugin for a given contentType.
+   * 
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return a list of extensions to be used for this contentType. If none,
+   *         returns <code>null</code>.
+   */
+  @SuppressWarnings("unchecked")
+  protected List<Extension> getExtensions(String contentType) {
+
+    ObjectCache objectCache = ObjectCache.get(conf);
+    // First of all, tries to clean the content-type
+    String type = null;
+    type = MimeUtil.cleanMimeType(contentType);
+
+    List<Extension> extensions = (List<Extension>) objectCache.getObject(type);
+
+    // Just compare the reference:
+    // if this is the empty list, we know we will find no extension.
+    if (extensions == EMPTY_EXTENSION_LIST) {
+      return null;
+    }
+
+    if (extensions == null) {
+      extensions = findExtensions(type);
+      if (extensions != null) {
+        objectCache.setObject(type, extensions);
+      } else {
+        // Put the empty extension list into cache
+        // to remember we don't know any related extension.
+        objectCache.setObject(type, EMPTY_EXTENSION_LIST);
+      }
+    }
+    return extensions;
+  }
+
+  /**
+   * Searches the suitable parse plugins for the given contentType.
+   * <p>
+   * The plugin ids listed for the content type in the parse plugin list are
+   * tried first. When they yield nothing, the ids listed for
+   * {@link #DEFAULT_PLUGIN} are used instead.
+   *
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return List - List of extensions to be used for this contentType. If none,
+   *         returns null.
+   */
+  private List<Extension> findExtensions(String contentType) {
+
+    Extension[] loaded = this.extensionPoint.getExtensions();
+
+    // Preferred plugins for exactly this content type.
+    List<Extension> preferred = matchExtensions(
+        this.parsePluginList.getPluginList(contentType), loaded, contentType);
+    if (preferred != null) {
+      return preferred;
+    }
+
+    // Otherwise fall back to the plugins registered as defaults.
+    return matchExtensions(this.parsePluginList.getPluginList(DEFAULT_PLUGIN),
+        loaded, DEFAULT_PLUGIN);
+  }
+
+  /**
+   * Selects, among the loaded extensions, those to use for a content type.
+   * <ol>
+   * <li>When <code>plugins</code> is not null, each plugin id is resolved to
+   * an extension: first by id and content type, then by id alone. The second
+   * case is kept but reported with a warning, as is an id that matches no
+   * loaded extension.</li>
+   * <li>When <code>plugins</code> is null, every loaded extension whose
+   * <code>contentType</code> attribute is <code>*</code> or matches the
+   * content type is selected, and the fact that they are not mapped is
+   * logged.</li>
+   * </ol>
+   *
+   * @param plugins
+   *          List of candidate plugins.
+   * @param extensions
+   *          Array of loaded extensions.
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return List - List of extensions to be used for this contentType. If none,
+   *         returns null.
+   */
+  private List<Extension> matchExtensions(List<String> plugins,
+      Extension[] extensions, String contentType) {
+
+    List<Extension> matched = new ArrayList<Extension>();
+
+    if (plugins == null) {
+      // No plugin list is defined for this type. Extensions enabled via
+      // plugin.includes may still claim it in their plugin.xml, so collect
+      // those; wildcard extensions are moved to the front of the list.
+      for (Extension candidate : extensions) {
+        String declaredType = candidate.getAttribute("contentType");
+        if ("*".equals(declaredType)) {
+          matched.add(0, candidate);
+        } else if (declaredType != null
+            && contentType.matches(escapeContentType(declaredType))) {
+          matched.add(candidate);
+        }
+      }
+
+      if (!matched.isEmpty()) {
+        if (LOG.isInfoEnabled()) {
+          // Render the ids as "[a - b - c]".
+          StringBuilder ids = new StringBuilder("[");
+          String separator = "";
+          for (Extension ext : matched) {
+            ids.append(separator).append(ext.getId());
+            separator = " - ";
+          }
+          ids.append("]");
+          LOG.info("The parsing plugins: " + ids
+              + " are enabled via the plugin.includes system "
+              + "property, and all claim to support the content type "
+              + contentType + ", but they are not mapped to it  in the "
+              + "parse-plugins.xml file");
+        }
+      } else if (LOG.isDebugEnabled()) {
+        LOG.debug("ParserFactory:No parse plugins mapped or enabled for "
+            + "contentType " + contentType);
+      }
+
+      return matched.isEmpty() ? null : matched;
+    }
+
+    for (String parsePluginId : plugins) {
+
+      Extension ext = getExtension(extensions, parsePluginId, contentType);
+
+      if (ext == null) {
+        // Either the plugin is not enabled in plugin.includes, or it is
+        // enabled but its plugin.xml does not claim this content type.
+        // Retry by plugin id alone to tell the two cases apart.
+        ext = getExtension(extensions, parsePluginId);
+
+        if (LOG.isWarnEnabled()) {
+          if (ext == null) {
+            // plugin wasn't enabled via plugin.includes
+            LOG.warn("ParserFactory: Plugin: " + parsePluginId
+                + " mapped to contentType " + contentType
+                + " via parse-plugins.xml, but not enabled via "
+                + "plugin.includes in nutch-default.xml");
+          } else {
+            // enabled, but plugin.xml does not list this content type
+            LOG.warn("ParserFactory:Plugin: " + parsePluginId
+                + " mapped to contentType " + contentType
+                + " via parse-plugins.xml, but its plugin.xml "
+                + "file does not claim to support contentType: "
+                + contentType);
+          }
+        }
+      }
+
+      if (ext != null) {
+        matched.add(ext);
+      }
+    }
+
+    return matched.isEmpty() ? null : matched;
+  }
+
+  /**
+   * Prepares a declared content type for use as a regular expression by
+   * putting a backslash in front of every <code>+</code> and <code>.</code>.
+   * Other characters are left alone, which keeps plain content types working
+   * (backwards compatibility) while letting one declaration accept several
+   * types.
+   *
+   * @param contentType
+   *          the content type declared by an extension
+   * @return the content type with <code>+</code> and <code>.</code> escaped
+   */
+  private String escapeContentType(String contentType) {
+    String plusEscaped = contentType.replace("+", "\\+");
+    return plusEscaped.replace(".", "\\.");
+  }
+
+  /**
+   * Tells whether an extension has the given id and accepts the given type.
+   * <p>
+   * The type is accepted when the extension's <code>contentType</code>
+   * attribute is <code>*</code>, when the type matches that attribute used as
+   * a regular expression (see {@link #escapeContentType(String)}), or when the
+   * type is {@link #DEFAULT_PLUGIN}. An extension without a
+   * <code>contentType</code> attribute only accepts {@link #DEFAULT_PLUGIN}.
+   *
+   * @param extension
+   *          the extension to test
+   * @param id
+   *          the extension id that is looked for
+   * @param type
+   *          the content type that is looked for
+   * @return <code>true</code> if both the id and the type match
+   */
+  private boolean match(Extension extension, String id, String type) {
+    if (!id.equals(extension.getId())) {
+      return false;
+    }
+
+    String supported = extension.getAttribute("contentType");
+    if (supported == null) {
+      // No declared content type: there is nothing to compare or to compile
+      // as a regex, so only the default wildcard can apply.
+      return DEFAULT_PLUGIN.equals(type);
+    }
+
+    return "*".equals(supported)
+        || type.matches(escapeContentType(supported))
+        || type.equals(DEFAULT_PLUGIN);
+  }
+
+  /**
+   * Returns the first extension of <code>list</code> accepted by
+   * {@link #match(Extension, String, String)} for the given id and content
+   * type, or <code>null</code> when there is none.
+   */
+  private Extension getExtension(Extension[] list, String id, String type) {
+    for (Extension candidate : list) {
+      if (match(candidate, id, type)) {
+        return candidate;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Returns the first extension of <code>list</code> whose id equals
+   * <code>id</code>.
+   *
+   * @param list
+   *          the extensions to search
+   * @param id
+   *          the extension id to look for; may be <code>null</code>, e.g. when
+   *          it comes from an alias lookup that found nothing
+   * @return the matching extension, or <code>null</code> if <code>id</code> is
+   *         <code>null</code> or no extension has that id
+   */
+  private Extension getExtension(Extension[] list, String id) {
+    if (id == null) {
+      // A null id cannot name an extension; report "not found" instead of
+      // failing with a NullPointerException on id.equals(...).
+      return null;
+    }
+    for (Extension candidate : list) {
+      if (id.equals(candidate.getId())) {
+        return candidate;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Looks <code>id</code> up in the aliases of the parse plugin list and
+   * searches <code>list</code> for the extension whose id is the value found.
+   */
+  private Extension getExtensionFromAlias(Extension[] list, String id) {
+    String aliasedId = parsePluginList.getAliases().get(id);
+    return getExtension(list, aliasedId);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
new file mode 100644
index 0000000..2857efa
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+/**
+ * {@link ParseException} reporting that a parser could not be found. It can
+ * carry the url and the content type the parser was requested for; both are
+ * <code>null</code> when the exception was created from a message only.
+ */
+public class ParserNotFound extends ParseException {
+
+  private static final long serialVersionUID = 23993993939L;
+
+  /** Url the parser was requested for, or <code>null</code>. */
+  private String url;
+
+  /** Content type the parser was requested for, or <code>null</code>. */
+  private String contentType;
+
+  /**
+   * Creates the exception from a message only; url and content type stay
+   * <code>null</code>.
+   *
+   * @param message
+   *          the detail message
+   */
+  public ParserNotFound(String message) {
+    this(null, null, message);
+  }
+
+  /**
+   * Creates the exception with a standard message built from the content type
+   * and the url.
+   *
+   * @param url
+   *          the url for which no parser was found
+   * @param contentType
+   *          the content type for which no parser was found
+   */
+  public ParserNotFound(String url, String contentType) {
+    this(url, contentType, "parser not found for contentType=" + contentType
+        + " url=" + url);
+  }
+
+  /**
+   * Creates the exception with an explicit message.
+   *
+   * @param url
+   *          the url for which no parser was found
+   * @param contentType
+   *          the content type for which no parser was found
+   * @param message
+   *          the detail message
+   */
+  public ParserNotFound(String url, String contentType, String message) {
+    super(message);
+    this.contentType = contentType;
+    this.url = url;
+  }
+
+  /**
+   * @return the url given at construction, or <code>null</code>
+   */
+  public String getUrl() {
+    return this.url;
+  }
+
+  /**
+   * @return the content type given at construction, or <code>null</code>
+   */
+  public String getContentType() {
+    return this.contentType;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java b/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
new file mode 100644
index 0000000..40bd3e2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The {@link org.apache.nutch.parse.Parse Parse} interface and related classes.
+ */
+package org.apache.nutch.parse;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java b/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
new file mode 100644
index 0000000..f50c11a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+/**
+ * Checked exception used to report that a circular dependency has been
+ * detected.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class CircularDependencyException extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Creates the exception with a detail message and no cause.
+   *
+   * @param message
+   *          the detail message
+   */
+  public CircularDependencyException(String message) {
+    super(message);
+  }
+
+  /**
+   * Creates the exception as a wrapper around another throwable.
+   *
+   * @param cause
+   *          the throwable that triggered this exception
+   */
+  public CircularDependencyException(Throwable cause) {
+    super(cause);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java b/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
new file mode 100644
index 0000000..b0ee0af
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * An <code>Extension</code> is a kind of listener descriptor that is installed
+ * on a concrete <code>ExtensionPoint</code>, which in turn acts as a kind of
+ * publisher. It records the id of the extension, the id of the extension point
+ * it targets, the name of the implementing class and a set of string
+ * attributes.
+ */
+public class Extension {
+  private PluginDescriptor fDescriptor;
+  private String fId;
+  private String fTargetPoint;
+  private String fClazz;
+  private HashMap<String, String> fAttributes;
+  private Configuration conf;
+
+  /**
+   * Creates an extension with an empty attribute set.
+   *
+   * @param pDescriptor
+   *          a plugin descriptor
+   * @param pExtensionPoint
+   *          the id of the extension point this extension targets
+   * @param pId
+   *          the unique id of the extension
+   * @param pExtensionClass
+   *          the name of the class implementing the extension
+   * @param conf
+   *          the configuration used to obtain the plugin repository and passed
+   *          on to {@link Configurable} extension instances
+   * @param pluginRepository
+   *          not used by this constructor
+   */
+  public Extension(PluginDescriptor pDescriptor, String pExtensionPoint,
+      String pId, String pExtensionClass, Configuration conf,
+      PluginRepository pluginRepository) {
+    this.fAttributes = new HashMap<String, String>();
+    this.setDescriptor(pDescriptor);
+    this.setExtensionPoint(pExtensionPoint);
+    this.setId(pId);
+    this.setClazz(pExtensionClass);
+    this.conf = conf;
+  }
+
+  /**
+   * Records the id of the targeted extension point.
+   *
+   * @param point
+   *          the extension point id
+   */
+  private void setExtensionPoint(String point) {
+    this.fTargetPoint = point;
+  }
+
+  /**
+   * Returns the value of an attribute previously added with
+   * {@link #addAttribute(String, String)}.
+   *
+   * @param pKey
+   *          a key
+   * @return String the value stored for the key, or <code>null</code> if there
+   *         is none
+   */
+  public String getAttribute(String pKey) {
+    return this.fAttributes.get(pKey);
+  }
+
+  /**
+   * Returns the full class name of the extension point implementation.
+   *
+   * @return String
+   */
+  public String getClazz() {
+    return this.fClazz;
+  }
+
+  /**
+   * Returns the unique id of the extension.
+   *
+   * @return String
+   */
+  public String getId() {
+    return this.fId;
+  }
+
+  /**
+   * Adds an attribute, replacing any value already stored under the same key.
+   * Meant to be used only while the model is created at plugin system start
+   * up.
+   *
+   * @param pKey
+   *          a key
+   * @param pValue
+   *          a value
+   */
+  public void addAttribute(String pKey, String pValue) {
+    this.fAttributes.put(pKey, pValue);
+  }
+
+  /**
+   * Sets the name of the class that implements the concrete extension. Meant
+   * to be used only while the model is created at system start up.
+   *
+   * @param extensionClazz
+   *          the extension class name to set
+   */
+  public void setClazz(String extensionClazz) {
+    this.fClazz = extensionClazz;
+  }
+
+  /**
+   * Sets the unique extension id. Meant to be used only while the model is
+   * created at system start up.
+   *
+   * @param extensionID
+   *          the extension id to set
+   */
+  public void setId(String extensionID) {
+    this.fId = extensionID;
+  }
+
+  /**
+   * Returns the id of the extension point that is implemented by this
+   * extension.
+   *
+   * @return String
+   */
+  public String getTargetPoint() {
+    return this.fTargetPoint;
+  }
+
+  /**
+   * Returns a new instance of the extension implementation.
+   * <p>
+   * The implementing class is obtained from the {@link PluginRepository} of
+   * the configuration, the owning plugin is started through
+   * {@link PluginRepository#getPluginInstance} if that has not happened yet,
+   * and the class is instantiated through its no-argument constructor. An
+   * instance that is {@link Configurable} receives the configuration of this
+   * extension before it is returned.
+   * <p>
+   * NOTE(review): earlier documentation stated that the plugin instance and
+   * the extension instance share one <code>PluginClassLoader</code>, that each
+   * plugin has its own classloader, and that this loader only knows the
+   * plugin's runtime libraries plus the exported libraries of the plugins it
+   * depends on. None of this is visible in this class; confirm against
+   * <code>PluginRepository</code>.
+   *
+   * @return Object An instance of the extension implementation
+   * @throws PluginRuntimeException
+   *           if the class cannot be found, instantiated or accessed
+   */
+  public Object getExtensionInstance() throws PluginRuntimeException {
+    // Creation and initialization of a plugin instance and of its extension
+    // instance must be done by one and only one thread, hence the lock on the
+    // extension id.
+    // NOTE(review): the original comment says PluginRepository
+    // .getPluginInstance() does the same; confirm there.
+    // Suggested by Stefan Groschupf <sg...@media-style.com>
+    synchronized (getId()) {
+      try {
+        PluginRepository repository = PluginRepository.get(conf);
+        Class<?> implementation = repository.getCachedClass(fDescriptor,
+            getClazz());
+        // Lazily start the plugin in case no instance of it exists yet.
+        repository.getPluginInstance(getDescriptor());
+        Object instance = implementation.newInstance();
+        if (instance instanceof Configurable) {
+          Configurable configurable = (Configurable) instance;
+          configurable.setConf(this.conf);
+        }
+        return instance;
+      } catch (ClassNotFoundException e) {
+        throw new PluginRuntimeException(e);
+      } catch (InstantiationException e) {
+        throw new PluginRuntimeException(e);
+      } catch (IllegalAccessException e) {
+        throw new PluginRuntimeException(e);
+      }
+    }
+  }
+
+  /**
+   * Returns the plugin descriptor.
+   *
+   * @return PluginDescriptor
+   */
+  public PluginDescriptor getDescriptor() {
+    return this.fDescriptor;
+  }
+
+  /**
+   * Sets the plugin descriptor. Meant to be used only while the model is
+   * created at system start up.
+   *
+   * @param pDescriptor
+   *          the plugin descriptor to set
+   */
+  public void setDescriptor(PluginDescriptor pDescriptor) {
+    this.fDescriptor = pDescriptor;
+  }
+}