You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:58 UTC
[42/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
new file mode 100644
index 0000000..daf96e0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
@@ -0,0 +1,278 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * A reader to load the information stored in the
+ * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
+ *
+ * @author mattmann
+ * @version 1.0
+ */
+class ParsePluginsReader {
+
+ /* our log stream */
+ public static final Logger LOG = LoggerFactory
+ .getLogger(ParsePluginsReader.class);
+
+ /** The property name of the parse-plugins location */
+ private static final String PP_FILE_PROP = "parse.plugin.file";
+
+ /** the parse-plugins file */
+ private String fParsePluginsFile = null;
+
+ /**
+ * Constructs a new ParsePluginsReader
+ */
+ public ParsePluginsReader() {
+ }
+
+ /**
+  * Reads the <code>parse-plugins.xml</code> file and returns the
+  * {@link ParsePluginList} defined by it.
+  * <p>
+  * When a location has been set through
+  * {@link #setFParsePluginsFile(String)} it is opened as a URL; otherwise the
+  * configuration resource named by the <code>parse.plugin.file</code>
+  * property is used.
+  * </p>
+  *
+  * @param conf
+  *          configuration used to locate the file when no explicit location
+  *          has been set.
+  * @return the {@link ParsePluginList} described by the file; an empty list
+  *         if the explicitly configured URL cannot be opened; or
+  *         <code>null</code> if the XML cannot be parsed.
+  */
+ public ParsePluginList parse(Configuration conf) {
+
+   ParsePluginList pList = new ParsePluginList();
+
+   // open up the XML file
+   InputStream ppInputStream = null;
+   if (fParsePluginsFile != null) {
+     try {
+       ppInputStream = new URL(fParsePluginsFile).openStream();
+     } catch (Exception e) {
+       if (LOG.isWarnEnabled()) {
+         LOG.warn("Unable to load parse plugins file from URL " + "["
+             + fParsePluginsFile + "]. Reason is [" + e + "]");
+       }
+       return pList;
+     }
+   } else {
+     ppInputStream = conf.getConfResourceAsInputStream(conf.get(PP_FILE_PROP));
+   }
+
+   Document document = null;
+   try {
+     DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+     DocumentBuilder parser = factory.newDocumentBuilder();
+     document = parser.parse(new InputSource(ppInputStream));
+   } catch (Exception e) {
+     if (LOG.isWarnEnabled()) {
+       // report the location that was actually used, not a null field
+       String source = (fParsePluginsFile != null) ? fParsePluginsFile : conf
+           .get(PP_FILE_PROP);
+       LOG.warn("Unable to parse [" + source + "]." + "Reason is [" + e + "]");
+     }
+     // callers distinguish "could not parse" through the null return
+     return null;
+   } finally {
+     // always release the stream, whether or not parsing succeeded
+     if (ppInputStream != null) {
+       try {
+         ppInputStream.close();
+       } catch (java.io.IOException ignored) {
+         // nothing useful can be done if closing a read-only stream fails
+       }
+     }
+   }
+
+   Element parsePlugins = document.getDocumentElement();
+
+   // build up the alias hash map
+   Map<String, String> aliases = getAliases(parsePlugins);
+   // And store it on the parse plugin list
+   pList.setAliases(aliases);
+
+   // get all the mime type nodes
+   NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
+
+   // iterate through the mime types
+   for (int i = 0; i < mimeTypes.getLength(); i++) {
+     Element mimeType = (Element) mimeTypes.item(i);
+     String mimeTypeStr = mimeType.getAttribute("name");
+
+     // for each mimeType, get the plugin list
+     NodeList pluginList = mimeType.getElementsByTagName("plugin");
+
+     // iterate through the plugins, add them in the order read, unless they
+     // carry an order="" attribute, in which case they are inserted at that
+     // (1-based) position
+     if (pluginList != null && pluginList.getLength() > 0) {
+       List<String> plugList = new ArrayList<String>(pluginList.getLength());
+
+       for (int j = 0; j < pluginList.getLength(); j++) {
+         Element plugin = (Element) pluginList.item(j);
+         String pluginId = plugin.getAttribute("id");
+         String extId = aliases.get(pluginId);
+         if (extId == null) {
+           // Assume an extension id is directly specified
+           extId = pluginId;
+         }
+         String orderStr = plugin.getAttribute("order");
+         int order = -1;
+         try {
+           order = Integer.parseInt(orderStr);
+         } catch (NumberFormatException ignored) {
+           // missing or non-numeric order attribute: keep document order
+         }
+         int index = order - 1;
+         if (order == -1) {
+           plugList.add(extId);
+         } else if (index >= 0 && index <= plugList.size()) {
+           plugList.add(index, extId);
+         } else {
+           // an out-of-range position would make List.add(int, E) throw;
+           // fall back to document order instead of aborting the whole parse
+           if (LOG.isWarnEnabled()) {
+             LOG.warn("Ignoring out-of-range order [" + orderStr
+                 + "] of plugin [" + pluginId + "] for mime type: "
+                 + mimeTypeStr);
+           }
+           plugList.add(extId);
+         }
+       }
+
+       // now add the plugin list and map it to this mimeType
+       pList.setPluginList(mimeTypeStr, plugList);
+
+     } else if (LOG.isWarnEnabled()) {
+       LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: "
+           + mimeTypeStr + ", continuing parse");
+     }
+   }
+   return pList;
+ }
+
+ /**
+  * Tests parsing of the parse-plugins.xml file. An alternative location for
+  * the file can be given with the <code>--file</code> option; the value is
+  * handed to {@link #setFParsePluginsFile(String)} and is therefore opened as
+  * a URL by {@link #parse(Configuration)}.
+  *
+  * @param args
+  *          Currently only the --file argument to specify an alternative
+  *          location for the parse-plugins.xml file is supported.
+  */
+ public static void main(String[] args) throws Exception {
+   String usage = "ParsePluginsReader [--file <parse plugin file location>]";
+
+   if ((args.length != 0 && args.length != 2)
+       || (args.length == 2 && !"--file".equals(args[0]))) {
+     System.err.println(usage);
+     System.exit(1);
+   }
+
+   // after the check above the only accepted forms are: no arguments, or
+   // "--file <location>"
+   String parsePluginFile = (args.length == 2) ? args[1] : null;
+
+   ParsePluginsReader reader = new ParsePluginsReader();
+
+   if (parsePluginFile != null) {
+     reader.setFParsePluginsFile(parsePluginFile);
+   }
+
+   ParsePluginList prefs = reader.parse(NutchConfiguration.create());
+
+   // parse() returns null when the XML cannot be parsed; fail with a message
+   // instead of a NullPointerException
+   if (prefs == null) {
+     System.err.println("Unable to parse the parse plugins file.");
+     System.exit(1);
+     return;
+   }
+
+   for (String mimeType : prefs.getSupportedMimeTypes()) {
+
+     System.out.println("MIMETYPE: " + mimeType);
+     List<String> plugList = prefs.getPluginList(mimeType);
+
+     System.out.println("EXTENSION IDs:");
+
+     for (String j : plugList) {
+       System.out.println(j);
+     }
+   }
+
+ }
+
+ /**
+ * @return Returns the fParsePluginsFile.
+ */
+ public String getFParsePluginsFile() {
+ return fParsePluginsFile;
+ }
+
+ /**
+ * @param parsePluginsFile
+ * The fParsePluginsFile to set.
+ */
+ public void setFParsePluginsFile(String parsePluginsFile) {
+ fParsePluginsFile = parsePluginsFile;
+ }
+
+ /**
+  * Collects the plugin-id to extension-id aliases declared under the first
+  * <code>aliases</code> element below the given root.
+  *
+  * @param parsePluginsRoot
+  *          root element of the parse-plugins document.
+  * @return map from the <code>name</code> attribute to the
+  *         <code>extension-id</code> attribute of every <code>alias</code>
+  *         element; empty when no <code>aliases</code> element exists.
+  */
+ private Map<String, String> getAliases(Element parsePluginsRoot) {
+
+   Map<String, String> result = new HashMap<String, String>();
+   NodeList aliasesNodes = parsePluginsRoot.getElementsByTagName("aliases");
+   int aliasesCount = (aliasesNodes == null) ? 0 : aliasesNodes.getLength();
+
+   if (aliasesCount == 0) {
+     if (LOG.isWarnEnabled()) {
+       LOG.warn("No aliases defined in parse-plugins.xml!");
+     }
+     return result;
+   }
+
+   // more than one "aliases" tag: warn, then carry on with the first one
+   if (aliasesCount > 1 && LOG.isWarnEnabled()) {
+     LOG.warn("There should only be one \"aliases\" tag in parse-plugins.xml");
+   }
+
+   Element firstAliases = (Element) aliasesNodes.item(0);
+   NodeList entries = firstAliases.getElementsByTagName("alias");
+   if (entries == null) {
+     return result;
+   }
+
+   int entryCount = entries.getLength();
+   for (int idx = 0; idx < entryCount; idx++) {
+     Element entry = (Element) entries.item(idx);
+     String pluginName = entry.getAttribute("name");
+     String extension = entry.getAttribute("extension-id");
+     if (LOG.isTraceEnabled()) {
+       LOG.trace("Found alias: plugin-id: " + pluginName
+           + ", extension-id: " + extension);
+     }
+     if (pluginName == null || extension == null) {
+       continue;
+     }
+     result.put(pluginName, extension);
+   }
+   return result;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
new file mode 100644
index 0000000..92d8871
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.Text;
+
+/**
+ * A utility class that stores result of a parse. Internally a ParseResult
+ * stores <{@link Text}, {@link Parse}> pairs.
+ * <p>
+ * Parsers may return multiple results, which correspond to parts or other
+ * associated documents related to the original URL.
+ * </p>
+ * <p>
+ * There will be usually one parse result that corresponds directly to the
+ * original URL, and possibly many (or none) results that correspond to derived
+ * URLs (or sub-URLs).
+ */
+public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {
+ private Map<Text, Parse> parseMap;
+ private String originalUrl;
+
+ public static final Logger LOG = LoggerFactory.getLogger(ParseResult.class);
+
+ /**
+ * Create a container for parse results.
+ *
+ * @param originalUrl
+ * the original url from which all parse results have been obtained.
+ */
+ public ParseResult(String originalUrl) {
+ parseMap = new HashMap<Text, Parse>();
+ this.originalUrl = originalUrl;
+ }
+
+ /**
+ * Convenience method for obtaining {@link ParseResult} from a single
+ * <code>Parse</code> output.
+ *
+ * @param url
+ * canonical url.
+ * @param parse
+ * single parse output.
+ * @return result containing the single parse output.
+ */
+ public static ParseResult createParseResult(String url, Parse parse) {
+ ParseResult parseResult = new ParseResult(url);
+ parseResult.put(new Text(url), new ParseText(parse.getText()),
+ parse.getData());
+ return parseResult;
+ }
+
+ /**
+ * Checks whether the result is empty.
+ *
+ * @return <code>true</code> if no parse output is stored in this result,
+ * <code>false</code> otherwise.
+ */
+ public boolean isEmpty() {
+ return parseMap.isEmpty();
+ }
+
+ /**
+ * Return the number of parse outputs (both successful and failed)
+ */
+ public int size() {
+ return parseMap.size();
+ }
+
+ /**
+ * Retrieve a single parse output.
+ *
+ * @param key
+ * sub-url under which the parse output is stored.
+ * @return parse output corresponding to this sub-url, or null.
+ */
+ public Parse get(String key) {
+ return get(new Text(key));
+ }
+
+ /**
+ * Retrieve a single parse output.
+ *
+ * @param key
+ * sub-url under which the parse output is stored.
+ * @return parse output corresponding to this sub-url, or null.
+ */
+ public Parse get(Text key) {
+ return parseMap.get(key);
+ }
+
+ /**
+ * Store a result of parsing.
+ *
+ * @param key
+ * URL or sub-url of this parse result
+ * @param text
+ * plain text result
+ * @param data
+ * corresponding parse metadata of this result
+ */
+ public void put(Text key, ParseText text, ParseData data) {
+ put(key.toString(), text, data);
+ }
+
+ /**
+ * Store a result of parsing.
+ *
+ * @param key
+ * URL or sub-url of this parse result
+ * @param text
+ * plain text result
+ * @param data
+ * corresponding parse metadata of this result
+ */
+ public void put(String key, ParseText text, ParseData data) {
+ parseMap.put(new Text(key),
+ new ParseImpl(text, data, key.equals(originalUrl)));
+ }
+
+ /**
+ * Iterate over all entries in the <url, Parse> map.
+ */
+ public Iterator<Entry<Text, Parse>> iterator() {
+ return parseMap.entrySet().iterator();
+ }
+
+ /**
+ * Remove all results where status is not successful (as determined by
+ * </code>ParseStatus#isSuccess()</code>). Note that effects of this operation
+ * cannot be reversed.
+ */
+ public void filter() {
+ for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
+ Entry<Text, Parse> entry = i.next();
+ if (!entry.getValue().getData().getStatus().isSuccess()) {
+ LOG.warn(entry.getKey() + " is not parsed successfully, filtering");
+ i.remove();
+ }
+ }
+
+ }
+
+ /**
+  * Tells whether every stored parse succeeded, where success is decided by
+  * <code>ParseStatus#isSuccess()</code>. An empty result counts as
+  * successful.
+  *
+  * @return <code>false</code> as soon as one unsuccessful parse is found,
+  *         <code>true</code> otherwise.
+  */
+ public boolean isSuccess() {
+   for (Entry<Text, Parse> stored : this) {
+     ParseStatus status = stored.getValue().getData().getStatus();
+     if (!status.isSuccess()) {
+       return false;
+     }
+   }
+   return true;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
new file mode 100644
index 0000000..b008bed
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
@@ -0,0 +1,309 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.segment.SegmentChecker;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.*;
+import org.apache.hadoop.fs.Path;
+
+import java.io.*;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.Map.Entry;
+
+/* Parse content in a segment. */
+public class ParseSegment extends NutchTool implements Tool,
+ Mapper<WritableComparable<?>, Content, Text, ParseImpl>,
+ Reducer<Text, Writable, Text, Writable> {
+
+ public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);
+
+ public static final String SKIP_TRUNCATED = "parser.skip.truncated";
+
+ private ScoringFilters scfilters;
+
+ private ParseUtil parseUtil;
+
+ private boolean skipTruncated;
+
+ public ParseSegment() {
+ this(null);
+ }
+
+ public ParseSegment(Configuration conf) {
+ super(conf);
+ }
+
+ public void configure(JobConf job) {
+ setConf(job);
+ this.scfilters = new ScoringFilters(job);
+ skipTruncated = job.getBoolean(SKIP_TRUNCATED, true);
+ }
+
+ public void close() {
+ }
+
+ private Text newKey = new Text();
+
+ /**
+ * Parses one fetched {@link Content} record and emits one {@link ParseImpl}
+ * per entry of the resulting {@link ParseResult}.
+ * <p>
+ * Nothing is emitted when the fetch status stored in the content metadata is
+ * not <code>CrawlDatum.STATUS_FETCH_SUCCESS</code>, when truncated content is
+ * to be skipped and {@link #isTruncated(Content)} reports truncation, or when
+ * the parser throws an exception.
+ * </p>
+ *
+ * @param key
+ * key of the content record; only used in log messages.
+ * @param content
+ * the fetched content to parse.
+ * @param output
+ * collector receiving the (url, parse) pairs.
+ * @param reporter
+ * used to count parse results per major status code.
+ */
+ public void map(WritableComparable<?> key, Content content,
+ OutputCollector<Text, ParseImpl> output, Reporter reporter)
+ throws IOException {
+ // convert on the fly from old UTF8 keys
+ if (key instanceof Text) {
+ newKey.set(key.toString());
+ key = newKey;
+ }
+
+ // NOTE(review): Integer.parseInt throws NumberFormatException if the fetch
+ // status metadata entry is absent - presumably the fetcher always sets it
+ int status = Integer.parseInt(content.getMetadata().get(
+ Nutch.FETCH_STATUS_KEY));
+ if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
+ // content not fetched successfully, skip document
+ LOG.debug("Skipping " + key + " as content is not fetched successfully");
+ return;
+ }
+
+ if (skipTruncated && isTruncated(content)) {
+ return;
+ }
+
+ // taken once, before parsing: the time logged for every entry below is
+ // measured from this point
+ long start = System.currentTimeMillis();
+ ParseResult parseResult = null;
+ try {
+ // the ParseUtil is created lazily on the first record
+ if (parseUtil == null)
+ parseUtil = new ParseUtil(getConf());
+ parseResult = parseUtil.parse(content);
+ } catch (Exception e) {
+ LOG.warn("Error parsing: " + key + ": "
+ + StringUtils.stringifyException(e));
+ return;
+ }
+
+ for (Entry<Text, Parse> entry : parseResult) {
+ Text url = entry.getKey();
+ Parse parse = entry.getValue();
+ ParseStatus parseStatus = parse.getData().getStatus();
+
+ reporter.incrCounter("ParserStatus",
+ ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
+
+ // an unsuccessful parse is replaced by an empty parse carrying the same
+ // status, so an entry is still emitted for this url
+ if (!parseStatus.isSuccess()) {
+ LOG.warn("Error parsing: " + key + ": " + parseStatus);
+ parse = parseStatus.getEmptyParse(getConf());
+ }
+
+ // pass segment name to parse data
+ parse.getData().getContentMeta()
+ .set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY));
+
+ // compute the new signature
+ byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+ content, parse);
+ parse.getData().getContentMeta()
+ .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
+
+ // a scoring failure is logged only; the parse is still emitted
+ try {
+ scfilters.passScoreAfterParsing(url, content, parse);
+ } catch (ScoringFilterException e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Error passing score: " + url + ": " + e.getMessage());
+ }
+ }
+
+ long end = System.currentTimeMillis();
+ LOG.info("Parsed (" + Long.toString(end - start) + "ms):" + url);
+
+ output.collect(
+ url,
+ new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse
+ .isCanonical()));
+ }
+ }
+
+ /**
+  * Decides whether the stored content is shorter than the length announced
+  * in its <code>Content-Length</code> metadata.
+  *
+  * @param content
+  *          the fetched content to inspect.
+  * @return <code>true</code> if the announced length is larger than the
+  *         number of stored bytes; <code>false</code> if it is not, or if it
+  *         could not be determined (no content bytes, no metadata, or a
+  *         missing, empty or non-numeric length value).
+  */
+ public static boolean isTruncated(Content content) {
+   byte[] body = content.getContent();
+   if (body == null) {
+     return false;
+   }
+   Metadata meta = content.getMetadata();
+   if (meta == null) {
+     return false;
+   }
+
+   String declared = meta.get(Response.CONTENT_LENGTH);
+   if (declared != null) {
+     declared = declared.trim();
+   }
+   if (StringUtil.isEmpty(declared)) {
+     return false;
+   }
+
+   String url = content.getUrl();
+   int inHeaderSize;
+   try {
+     inHeaderSize = Integer.parseInt(declared);
+   } catch (NumberFormatException e) {
+     LOG.warn("Wrong contentlength format for " + url, e);
+     return false;
+   }
+
+   int actualSize = body.length;
+   boolean truncated = inHeaderSize > actualSize;
+   if (truncated) {
+     LOG.info(url + " skipped. Content of size " + inHeaderSize
+         + " was truncated to " + actualSize);
+   } else if (LOG.isDebugEnabled()) {
+     LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
+         + inHeaderSize);
+   }
+   return truncated;
+ }
+
+ public void reduce(Text key, Iterator<Writable> values,
+ OutputCollector<Text, Writable> output, Reporter reporter)
+ throws IOException {
+ output.collect(key, values.next()); // collect first value
+ }
+
+ /**
+ * Configures and runs the job that parses the content of one segment. This
+ * class is registered as both the mapper and the reducer, the segment's
+ * content directory is the input and the segment itself is the output path.
+ * A segment that {@link SegmentChecker} reports as already parsed is skipped
+ * with a warning.
+ *
+ * @param segment
+ * path of the segment to parse.
+ * @throws IOException
+ * declared for the file system lookup and the job execution.
+ */
+ public void parse(Path segment) throws IOException {
+ if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
+ LOG.warn("Segment: " + segment
+ + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854
+ return;
+ }
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ if (LOG.isInfoEnabled()) {
+ LOG.info("ParseSegment: starting at " + sdf.format(start));
+ LOG.info("ParseSegment: segment: " + segment);
+ }
+
+ JobConf job = new NutchJob(getConf());
+ job.setJobName("parse " + segment);
+
+ FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+ // the segment name is read back in map() and stored in the parse metadata
+ job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setMapperClass(ParseSegment.class);
+ job.setReducerClass(ParseSegment.class);
+
+ FileOutputFormat.setOutputPath(job, segment);
+ job.setOutputFormat(ParseOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(ParseImpl.class);
+
+ JobClient.runJob(job);
+ long end = System.currentTimeMillis();
+ LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
+ }
+
+ public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(),
+ args);
+ System.exit(res);
+ }
+
+ /**
+  * Command line entry point: parses the segment named by the first argument.
+  *
+  * @param args
+  *          <code>segment [-noFilter] [-noNormalize]</code>. The options are
+  *          matched case-insensitively and switch off
+  *          <code>parse.filter.urls</code> and
+  *          <code>parse.normalize.urls</code> respectively; unknown options
+  *          are ignored.
+  * @return 0 once {@link #parse(Path)} has returned, or -1 if no segment was
+  *         given.
+  */
+ public int run(String[] args) throws Exception {
+   String usage = "Usage: ParseSegment segment [-noFilter] [-noNormalize]";
+
+   if (args.length == 0) {
+     System.err.println(usage);
+     // report the failure through the return value rather than calling
+     // System.exit() here, so that a programmatic caller is not terminated;
+     // main() still exits with this code
+     return -1;
+   }
+
+   for (int i = 1; i < args.length; i++) {
+     String param = args[i];
+
+     if ("-nofilter".equalsIgnoreCase(param)) {
+       getConf().setBoolean("parse.filter.urls", false);
+     } else if ("-nonormalize".equalsIgnoreCase(param)) {
+       getConf().setBoolean("parse.normalize.urls", false);
+     }
+   }
+
+   parse(new Path(args[0]));
+   return 0;
+ }
+
+ /*
+ * Used for Nutch REST service
+ */
+ public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+
+ Map<String, Object> results = new HashMap<String, Object>();
+ Path segment;
+ if(args.containsKey(Nutch.ARG_SEGMENT)) {
+ Object seg = args.get(Nutch.ARG_SEGMENT);
+ if(seg instanceof Path) {
+ segment = (Path) seg;
+ }
+ else {
+ segment = new Path(seg.toString());
+ }
+ }
+ else {
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+ segment = new Path(segmentsList[0].getPath());
+ }
+
+ if (args.containsKey("nofilter")) {
+ getConf().setBoolean("parse.filter.urls", false);
+ }
+ if (args.containsKey("nonormalize")) {
+ getConf().setBoolean("parse.normalize.urls", false);
+ }
+ parse(segment);
+ results.put(Nutch.VAL_RESULT, Integer.toString(0));
+ return results;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
new file mode 100644
index 0000000..b9d5959
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Created on Apr 28, 2005
+ * Author: Andrzej Bialecki <ab@getopt.org>
+ *
+ */
+package org.apache.nutch.parse;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * @author Andrzej Bialecki <ab@getopt.org>
+ */
+public class ParseStatus implements Writable {
+
+ private final static byte VERSION = 2;
+
+ // Primary status codes:
+
+ /** Parsing was not performed. */
+ public static final byte NOTPARSED = 0;
+ /** Parsing succeeded. */
+ public static final byte SUCCESS = 1;
+ /** General failure. There may be a more specific error message in arguments. */
+ public static final byte FAILED = 2;
+
+ public static final String[] majorCodes = { "notparsed", "success", "failed" };
+
+ // Secondary success codes go here:
+
+ /**
+ * Parsed content contains a directive to redirect to another URL. The target
+ * URL can be retrieved from the arguments.
+ */
+ public static final short SUCCESS_REDIRECT = 100;
+
+ // Secondary failure codes go here:
+
+ /**
+ * Parsing failed. An Exception occured (which may be retrieved from the
+ * arguments).
+ */
+ public static final short FAILED_EXCEPTION = 200;
+ /**
+ * Parsing failed. Content was truncated, but the parser cannot handle
+ * incomplete content.
+ */
+ public static final short FAILED_TRUNCATED = 202;
+ /**
+ * Parsing failed. Invalid format - the content may be corrupted or of wrong
+ * type.
+ */
+ public static final short FAILED_INVALID_FORMAT = 203;
+ /**
+ * Parsing failed. Other related parts of the content are needed to complete
+ * parsing. The list of URLs to missing parts may be provided in arguments.
+ * The Fetcher may decide to fetch these parts at once, then put them into
+ * Content.metadata, and supply them for re-parsing.
+ */
+ public static final short FAILED_MISSING_PARTS = 204;
+ /**
+ * Parsing failed. There was no content to be parsed - probably caused by
+ * errors at protocol stage.
+ */
+ public static final short FAILED_MISSING_CONTENT = 205;
+
+ public static final ParseStatus STATUS_NOTPARSED = new ParseStatus(NOTPARSED);
+ public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
+ public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
+
+ private byte majorCode = 0;
+ private short minorCode = 0;
+ private String[] args = null;
+
+ public byte getVersion() {
+ return VERSION;
+ }
+
+ public ParseStatus() {
+
+ }
+
+ public ParseStatus(int majorCode, int minorCode, String[] args) {
+ this.args = args;
+ this.majorCode = (byte) majorCode;
+ this.minorCode = (short) minorCode;
+ }
+
+ public ParseStatus(int majorCode) {
+ this(majorCode, 0, (String[]) null);
+ }
+
+ public ParseStatus(int majorCode, String[] args) {
+ this(majorCode, 0, args);
+ }
+
+ public ParseStatus(int majorCode, int minorCode) {
+ this(majorCode, minorCode, (String[]) null);
+ }
+
+ /** Simplified constructor for passing just a text message. */
+ public ParseStatus(int majorCode, int minorCode, String message) {
+ this(majorCode, minorCode, new String[] { message });
+ }
+
+ /** Simplified constructor for passing just a text message. */
+ public ParseStatus(int majorCode, String message) {
+ this(majorCode, 0, new String[] { message });
+ }
+
+ public ParseStatus(Throwable t) {
+ this(FAILED, FAILED_EXCEPTION, new String[] { t.toString() });
+ }
+
+ public static ParseStatus read(DataInput in) throws IOException {
+ ParseStatus res = new ParseStatus();
+ res.readFields(in);
+ return res;
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ byte version = in.readByte();
+ switch (version) {
+ case 1:
+ majorCode = in.readByte();
+ minorCode = in.readShort();
+ args = WritableUtils.readCompressedStringArray(in);
+ break;
+ case 2:
+ majorCode = in.readByte();
+ minorCode = in.readShort();
+ args = WritableUtils.readStringArray(in);
+ break;
+ default:
+ throw new VersionMismatchException(VERSION, version);
+ }
+ }
+
+ /**
+ * Serializes this status as: the current version byte, the major code as a
+ * byte, the minor code as a short, then the arguments.
+ *
+ * @param out
+ * destination of the serialized form.
+ * @throws IOException
+ * if writing to <code>out</code> fails.
+ */
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(VERSION);
+ out.writeByte(majorCode);
+ out.writeShort(minorCode);
+ if (args == null) {
+ // NOTE(review): a length of -1 is written for missing arguments -
+ // presumably the value WritableUtils.readStringArray (used by readFields
+ // for version 2) turns back into null; confirm against Hadoop
+ out.writeInt(-1);
+ } else {
+ WritableUtils.writeStringArray(out, args);
+ }
+ }
+
+ /**
+ * A convenience method. Returns true if majorCode is SUCCESS, false
+ * otherwise.
+ */
+
+ public boolean isSuccess() {
+ return majorCode == SUCCESS;
+ }
+
+ /**
+ * A convenience method. Return a String representation of the first argument,
+ * or null.
+ */
+ public String getMessage() {
+ if (args != null && args.length > 0 && args[0] != null)
+ return args[0];
+ return null;
+ }
+
+ public String[] getArgs() {
+ return args;
+ }
+
+ public int getMajorCode() {
+ return majorCode;
+ }
+
+ public int getMinorCode() {
+ return minorCode;
+ }
+
+ /**
+ * A convenience method. Creates an empty Parse instance, which returns this
+ * status.
+ */
+ public Parse getEmptyParse(Configuration conf) {
+ return new EmptyParseImpl(this, conf);
+ }
+
+ /**
+ * A convenience method. Creates an empty ParseResult, which contains this
+ * status.
+ */
+ public ParseResult getEmptyParseResult(String url, Configuration conf) {
+ return ParseResult.createParseResult(url, getEmptyParse(conf));
+ }
+
+ public String toString() {
+ StringBuffer res = new StringBuffer();
+ String name = null;
+ if (majorCode >= 0 && majorCode < majorCodes.length)
+ name = majorCodes[majorCode];
+ else
+ name = "UNKNOWN!";
+ res.append(name + "(" + majorCode + "," + minorCode + ")");
+ if (args != null) {
+ if (args.length == 1) {
+ res.append(": " + String.valueOf(args[0]));
+ } else {
+ for (int i = 0; i < args.length; i++) {
+ if (args[i] != null)
+ res.append(", args[" + i + "]=" + String.valueOf(args[i]));
+ }
+ }
+ }
+ return res.toString();
+ }
+
+ public void setArgs(String[] args) {
+ this.args = args;
+ }
+
+ public void setMessage(String msg) {
+ if (args == null || args.length == 0) {
+ args = new String[1];
+ }
+ args[0] = msg;
+ }
+
+ public void setMajorCode(byte majorCode) {
+ this.majorCode = majorCode;
+ }
+
+ public void setMinorCode(short minorCode) {
+ this.minorCode = minorCode;
+ }
+
+ public boolean equals(Object o) {
+ if (o == null)
+ return false;
+ if (!(o instanceof ParseStatus))
+ return false;
+ boolean res = true;
+ ParseStatus other = (ParseStatus) o;
+ res = res && (this.majorCode == other.majorCode)
+ && (this.minorCode == other.minorCode);
+ if (!res)
+ return res;
+ if (this.args == null) {
+ if (other.args == null)
+ return true;
+ else
+ return false;
+ } else {
+ if (other.args == null)
+ return false;
+ if (other.args.length != this.args.length)
+ return false;
+ for (int i = 0; i < this.args.length; i++) {
+ if (!this.args[i].equals(other.args[i]))
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private static class EmptyParseImpl implements Parse {
+
+ private ParseData data = null;
+
+ public EmptyParseImpl(ParseStatus status, Configuration conf) {
+ data = new ParseData(status, "", new Outlink[0], new Metadata(),
+ new Metadata());
+ }
+
+ public ParseData getData() {
+ return data;
+ }
+
+ public String getText() {
+ return "";
+ }
+
+ public boolean isCanonical() {
+ return true;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
new file mode 100644
index 0000000..13416cf
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.commons.cli.Options;
+import org.apache.nutch.util.NutchConfiguration;
+
+ /**
+  * The text conversion of a page's content.
+  * <p>
+  * Records are written in format version 2 using
+  * <code>Text.writeString</code>. Version 1 records, which were written as
+  * compressed strings, are still readable through
+  * <code>WritableUtils.readCompressedString</code>.
+  *
+  * @see Parse#getText()
+  */
+ public final class ParseText implements Writable {
+   public static final String DIR_NAME = "parse_text";
+
+   /** Format version written by {@link #write(DataOutput)}. */
+   private final static byte VERSION = 2;
+
+   /** The extracted text; <code>null</code> until set or read. */
+   private String text;
+
+   /** Creates an empty instance, to be filled by {@link #readFields}. */
+   public ParseText() {
+   }
+
+   /**
+    * @param text
+    *          the parsed text to wrap
+    */
+   public ParseText(String text) {
+     this.text = text;
+   }
+
+   /**
+    * Reads a record in either the legacy (1) or the current (2) format.
+    *
+    * @throws VersionMismatchException
+    *           if the version byte is neither 1 nor the current version
+    */
+   public void readFields(DataInput in) throws IOException {
+     byte version = in.readByte();
+     switch (version) {
+     case 1:
+       text = WritableUtils.readCompressedString(in);
+       break;
+     case VERSION:
+       text = Text.readString(in);
+       break;
+     default:
+       throw new VersionMismatchException(VERSION, version);
+     }
+   }
+
+   /** Writes the version byte followed by the text. */
+   public final void write(DataOutput out) throws IOException {
+     out.write(VERSION);
+     Text.writeString(out, text);
+   }
+
+   /**
+    * Convenience factory: creates a new instance and fills it from
+    * <code>in</code>.
+    */
+   public final static ParseText read(DataInput in) throws IOException {
+     ParseText parseText = new ParseText();
+     parseText.readFields(in);
+     return parseText;
+   }
+
+   //
+   // Accessor methods
+   //
+   public String getText() {
+     return text;
+   }
+
+   /**
+    * Two instances are equal when their texts are equal. An instance whose
+    * text is still <code>null</code> only equals another instance with a
+    * <code>null</code> text.
+    */
+   @Override
+   public boolean equals(Object o) {
+     if (this == o) {
+       return true;
+     }
+     if (!(o instanceof ParseText)) {
+       return false;
+     }
+     ParseText other = (ParseText) o;
+     // Null-safe: the no-arg constructor leaves text unset.
+     return this.text == null ? other.text == null : this.text
+         .equals(other.text);
+   }
+
+   /** Hash code consistent with {@link #equals(Object)}. */
+   @Override
+   public int hashCode() {
+     return text == null ? 0 : text.hashCode();
+   }
+
+   public String toString() {
+     return text;
+   }
+
+   /**
+    * Command-line entry point: prints record <code>recno</code> of the
+    * <code>parse_text</code> directory of a segment.
+    */
+   public static void main(String argv[]) throws Exception {
+     String usage = "ParseText (-local | -dfs <namenode:port>) recno segment";
+
+     if (argv.length < 3) {
+       System.out.println("usage:" + usage);
+       return;
+     }
+     Options opts = new Options();
+     Configuration conf = NutchConfiguration.create();
+
+     GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
+     String[] remainingArgs = parser.getRemainingArgs();
+     // Generic options may have consumed arguments; make sure both the
+     // record number and the segment are still present before indexing.
+     if (remainingArgs.length < 2) {
+       System.out.println("usage:" + usage);
+       return;
+     }
+
+     FileSystem fs = FileSystem.get(conf);
+     try {
+       int recno = Integer.parseInt(remainingArgs[0]);
+       String segment = remainingArgs[1];
+       String filename = new Path(segment, ParseText.DIR_NAME).toString();
+
+       ParseText parseText = new ParseText();
+       ArrayFile.Reader parseTexts = new ArrayFile.Reader(fs, filename, conf);
+       try {
+         parseTexts.get(recno, parseText);
+         System.out.println("Retrieved " + recno + " from file " + filename);
+         System.out.println(parseText);
+       } finally {
+         // Close the reader even when the lookup fails.
+         parseTexts.close();
+       }
+     } finally {
+       fs.close();
+     }
+   }
+ }
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
new file mode 100644
index 0000000..39024dc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// Commons Logging imports
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.protocol.Content;
+
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
+/**
+ * A Utility class containing methods to simply perform parsing utilities such
+ * as iterating through a preferred list of {@link Parser}s to obtain
+ * {@link Parse} objects.
+ *
+ * @author mattmann
+ * @author Jérôme Charron
+ * @author Sébastien Le Callonnec
+ */
+public class ParseUtil {
+
+ /* our log stream */
+ public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
+ private ParserFactory parserFactory;
+ /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
+ private int maxParseTime = 30;
+ private ExecutorService executorService;
+
+ /**
+ *
+ * @param conf
+ */
+ public ParseUtil(Configuration conf) {
+ this.parserFactory = new ParserFactory(conf);
+ maxParseTime = conf.getInt("parser.timeout", 30);
+ executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
+ .setNameFormat("parse-%d").setDaemon(true).build());
+ }
+
+ /**
+ * Performs a parse by iterating through a List of preferred {@link Parser}s
+ * until a successful parse is performed and a {@link Parse} object is
+ * returned. If the parse is unsuccessful, a message is logged to the
+ * <code>WARNING</code> level, and an empty parse is returned.
+ *
+ * @param content
+ * The content to try and parse.
+ * @return <key, {@link Parse}> pairs.
+ * @throws ParseException
+ * If no suitable parser is found to perform the parse.
+ */
+ public ParseResult parse(Content content) throws ParseException {
+ Parser[] parsers = null;
+
+ try {
+ parsers = this.parserFactory.getParsers(content.getContentType(),
+ content.getUrl() != null ? content.getUrl() : "");
+ } catch (ParserNotFound e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("No suitable parser found when trying to parse content "
+ + content.getUrl() + " of type " + content.getContentType());
+ }
+ throw new ParseException(e.getMessage());
+ }
+
+ ParseResult parseResult = null;
+ for (int i = 0; i < parsers.length; i++) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i]
+ + "]");
+ }
+ if (maxParseTime != -1)
+ parseResult = runParser(parsers[i], content);
+ else
+ parseResult = parsers[i].getParse(content);
+
+ if (parseResult != null && !parseResult.isEmpty())
+ return parseResult;
+ }
+
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Unable to successfully parse content " + content.getUrl()
+ + " of type " + content.getContentType());
+ }
+ return new ParseStatus(new ParseException(
+ "Unable to successfully parse content")).getEmptyParseResult(
+ content.getUrl(), null);
+ }
+
+ /**
+ * Method parses a {@link Content} object using the {@link Parser} specified
+ * by the parameter <code>extId</code>, i.e., the Parser's extension ID. If a
+ * suitable {@link Parser} is not found, then a <code>WARNING</code> level
+ * message is logged, and a ParseException is thrown. If the parse is
+ * uncessful for any other reason, then a <code>WARNING</code> level message
+ * is logged, and a <code>ParseStatus.getEmptyParse()</code> is returned.
+ *
+ * @param extId
+ * The extension implementation ID of the {@link Parser} to use to
+ * parse the specified content.
+ * @param content
+ * The content to parse.
+ *
+ * @return <key, {@link Parse}> pairs if the parse is successful,
+ * otherwise, a single <key,
+ * <code>ParseStatus.getEmptyParse()</code>> pair.
+ *
+ * @throws ParseException
+ * If there is no suitable {@link Parser} found to perform the
+ * parse.
+ */
+ public ParseResult parseByExtensionId(String extId, Content content)
+ throws ParseException {
+ Parser p = null;
+
+ try {
+ p = this.parserFactory.getParserById(extId);
+ } catch (ParserNotFound e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("No suitable parser found when trying to parse content "
+ + content.getUrl() + " of type " + content.getContentType());
+ }
+ throw new ParseException(e.getMessage());
+ }
+
+ ParseResult parseResult = null;
+ if (maxParseTime != -1)
+ parseResult = runParser(p, content);
+ else
+ parseResult = p.getParse(content);
+ if (parseResult != null && !parseResult.isEmpty()) {
+ return parseResult;
+ } else {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Unable to successfully parse content " + content.getUrl()
+ + " of type " + content.getContentType());
+ }
+ return new ParseStatus(new ParseException(
+ "Unable to successfully parse content")).getEmptyParseResult(
+ content.getUrl(), null);
+ }
+ }
+
+ private ParseResult runParser(Parser p, Content content) {
+ ParseCallable pc = new ParseCallable(p, content);
+ Future<ParseResult> task = executorService.submit(pc);
+ ParseResult res = null;
+ try {
+ res = task.get(maxParseTime, TimeUnit.SECONDS);
+ } catch (Exception e) {
+ LOG.warn("Error parsing " + content.getUrl() + " with " + p, e);
+ task.cancel(true);
+ } finally {
+ pc = null;
+ }
+ return res;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java b/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
new file mode 100644
index 0000000..d101453
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * A parser for content generated by a
+ * {@link org.apache.nutch.protocol.Protocol} implementation. This interface is
+ * implemented by extensions. Nutch's core contains no page parsing code.
+ */
+public interface Parser extends Pluggable, Configurable {
+ /** The name of the extension point. */
+ public final static String X_POINT_ID = Parser.class.getName();
+
+ /**
+ * <p>
+ * This method parses the given content and returns a map of <key,
+ * parse> pairs. {@link Parse} instances will be persisted under the given
+ * key.
+ * </p>
+ * <p>
+ * Note: Meta-redirects should be followed only when they are coming from the
+ * original URL. That is: <br>
+ * Assume fetcher is in parsing mode and is currently processing
+ * foo.bar.com/redirect.html. If this url contains a meta redirect to another
+ * url, fetcher should only follow the redirect if the map contains an entry
+ * of the form <"foo.bar.com/redirect.html", {@link Parse} with a
+ * {@link ParseStatus} indicating the redirect>.
+ * </p>
+ *
+ * @param c
+ * Content to be parsed
+ * @return a map containing <key, parse> pairs
+ * @since NUTCH-443
+ */
+ ParseResult getParse(Content c);
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
new file mode 100644
index 0000000..7e5b146
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
@@ -0,0 +1,270 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.StringUtil;
+
+/**
+ * Parser checker, useful for testing parser. It also accurately reports
+ * possible fetching and parsing failures and presents protocol status signals
+ * to aid debugging. The tool enables us to retrieve the following data from any
+ * url:
+ * <ol>
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
+ * type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and
+ * is used to remove duplicates during the dedup procedure. It is calculated
+ * using {@link org.apache.nutch.crawl.MD5Signature} or
+ * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+ * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Title</tt>: of the URL</li>
+ * <li><tt>Outlinks</tt>: associated with the URL</li>
+ * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
+ * <i>Cache-Control</>, etc.</li>
+ * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
+ * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing
+ * on <code>content.length</code> configuration.</li>
+ * </ol>
+ *
+ * @author John Xing
+ */
+
+public class ParserChecker implements Tool {
+
+ public static final Logger LOG = LoggerFactory.getLogger(ParserChecker.class);
+ private Configuration conf;
+
+ public ParserChecker() {
+ }
+
+ public int run(String[] args) throws Exception {
+ boolean dumpText = false;
+ boolean force = false;
+ String contentType = null;
+ String url = null;
+
+ String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
+
+ if (args.length == 0) {
+ LOG.error(usage);
+ return (-1);
+ }
+
+ // used to simulate the metadata propagated from injection
+ HashMap<String, String> metadata = new HashMap<String, String>();
+
+ for (int i = 0; i < args.length; i++) {
+ if (args[i].equals("-forceAs")) {
+ force = true;
+ contentType = args[++i];
+ } else if (args[i].equals("-dumpText")) {
+ dumpText = true;
+ } else if (args[i].equals("-md")) {
+ String k = null, v = null;
+ String nextOne = args[++i];
+ int firstEquals = nextOne.indexOf("=");
+ if (firstEquals != -1) {
+ k = nextOne.substring(0, firstEquals);
+ v = nextOne.substring(firstEquals + 1);
+ } else
+ k = nextOne;
+ metadata.put(k, v);
+ } else if (i != args.length - 1) {
+ LOG.error(usage);
+ System.exit(-1);
+ } else {
+ url = URLUtil.toASCII(args[i]);
+ }
+ }
+
+ if (LOG.isInfoEnabled()) {
+ LOG.info("fetching: " + url);
+ }
+
+ CrawlDatum cd = new CrawlDatum();
+
+ Iterator<String> iter = metadata.keySet().iterator();
+ while (iter.hasNext()) {
+ String key = iter.next();
+ String value = metadata.get(key);
+ if (value == null)
+ value = "";
+ cd.getMetaData().put(new Text(key), new Text(value));
+ }
+
+ ProtocolFactory factory = new ProtocolFactory(conf);
+ Protocol protocol = factory.getProtocol(url);
+ Text turl = new Text(url);
+ ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
+
+ // If the configuration permits, handle redirects until we either run
+ // out of allowed redirects or we stop getting redirect statuses.
+ int maxRedirects = conf.getInt("http.redirect.max", 0);
+ int numRedirects = 0;
+ while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
+ String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
+ LOG.info("Handling redirect to " + newURL);
+
+ protocol = factory.getProtocol(newURL);
+ turl = new Text(newURL);
+ output = protocol.getProtocolOutput(turl, cd);
+
+ numRedirects++;
+ }
+
+ if (!output.getStatus().isSuccess()) {
+ System.err.println("Fetch failed with protocol status: "
+ + output.getStatus());
+
+ if (output.getStatus().isRedirect()) {
+ System.err.println("Redirect(s) not handled due to configuration.");
+ System.err.println("Max Redirects to handle per config: " + maxRedirects);
+ System.err.println("Number of Redirects handled: " + numRedirects);
+ }
+ return (-1);
+ }
+
+ Content content = output.getContent();
+
+ if (content == null) {
+ LOG.error("No content for " + url);
+ return (-1);
+ }
+
+ if (force) {
+ content.setContentType(contentType);
+ } else {
+ contentType = content.getContentType();
+ }
+
+ if (contentType == null) {
+ LOG.error("Failed to determine content type!");
+ return (-1);
+ }
+
+ if (ParseSegment.isTruncated(content)) {
+ LOG.warn("Content is truncated, parse may fail!");
+ }
+
+ ScoringFilters scfilters = new ScoringFilters(conf);
+ // call the scoring filters
+ try {
+ scfilters.passScoreBeforeParsing(turl, cd, content);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e
+ + ")");
+ LOG.warn(StringUtils.stringifyException(e));
+ }
+ }
+
+ ParseResult parseResult = new ParseUtil(conf).parse(content);
+
+ if (parseResult == null) {
+ LOG.error("Parsing content failed!");
+ return (-1);
+ }
+
+ // Calculate the signature
+ byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+ content, parseResult.get(new Text(url)));
+
+ if (LOG.isInfoEnabled()) {
+ LOG.info("parsing: " + url);
+ LOG.info("contentType: " + contentType);
+ LOG.info("signature: " + StringUtil.toHexString(signature));
+ }
+
+ Parse parse = parseResult.get(turl);
+ if (parse == null) {
+ LOG.error("Failed to get parse from parse result");
+ LOG.error("Available parses in parse result (by URL key):");
+ for (Map.Entry<Text, Parse> entry : parseResult) {
+ LOG.error(" " + entry.getKey());
+ }
+ LOG.error("Parse result does not contain a parse for URL to be checked:");
+ LOG.error(" " + turl);
+ return -1;
+ }
+
+ // call the scoring filters
+ try {
+ scfilters.passScoreAfterParsing(turl, content, parse);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e
+ + ")");
+ LOG.warn(StringUtils.stringifyException(e));
+ }
+ }
+
+ for (Map.Entry<Text, Parse> entry : parseResult) {
+ parse = entry.getValue();
+ LOG.info("---------\nUrl\n---------------\n");
+ System.out.print(entry.getKey());
+ LOG.info("\n---------\nParseData\n---------\n");
+ System.out.print(parse.getData().toString());
+ if (dumpText) {
+ LOG.info("---------\nParseText\n---------\n");
+ System.out.print(parse.getText());
+ }
+ }
+
+ return 0;
+ }
+
+ @Override
+ public Configuration getConf() {
+ return conf;
+ }
+
+ @Override
+ public void setConf(Configuration c) {
+ conf = c;
+ }
+
+ public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(NutchConfiguration.create(), new ParserChecker(),
+ args);
+ System.exit(res);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
new file mode 100644
index 0000000..0982de4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
@@ -0,0 +1,428 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Vector;
+
+ // SLF4J logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.ObjectCache;
+
+/** Creates and caches {@link Parser} plugins. */
+public final class ParserFactory {
+
+ public static final Logger LOG = LoggerFactory.getLogger(ParserFactory.class);
+
+ /** Wildcard for default plugins. */
+ public static final String DEFAULT_PLUGIN = "*";
+
+ /** Empty extension list for caching purposes. */
+ private final List<Extension> EMPTY_EXTENSION_LIST = Collections
+ .<Extension> emptyList();
+
+ private Configuration conf;
+ private ExtensionPoint extensionPoint;
+ private ParsePluginList parsePluginList;
+
+ /**
+  * Creates a factory for the given configuration.
+  * <p>
+  * The parser extension point is looked up in the plugin repository, and the
+  * parse plugin preferences are taken from the object cache, or read via
+  * {@link ParsePluginsReader} and cached when not yet present.
+  *
+  * @param conf
+  *          the configuration to look up plugins and cached objects for
+  * @throws RuntimeException
+  *           if the parser extension point is not found or the parse plugin
+  *           preferences could not be loaded
+  */
+ public ParserFactory(Configuration conf) {
+   this.conf = conf;
+   final ObjectCache cache = ObjectCache.get(conf);
+   this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
+       Parser.X_POINT_ID);
+
+   // The plugin preferences are cached under the ParsePluginList class name.
+   final String pluginListKey = ParsePluginList.class.getName();
+   ParsePluginList pluginList = (ParsePluginList) cache
+       .getObject(pluginListKey);
+   if (pluginList == null) {
+     pluginList = new ParsePluginsReader().parse(conf);
+     cache.setObject(pluginListKey, pluginList);
+   }
+   this.parsePluginList = pluginList;
+
+   if (this.extensionPoint == null) {
+     throw new RuntimeException("x point " + Parser.X_POINT_ID + " not found.");
+   }
+   if (this.parsePluginList == null) {
+     throw new RuntimeException(
+         "Parse Plugins preferences could not be loaded.");
+   }
+ }
+
+ /**
+  * Function returns an array of {@link Parser}s for a given content type.
+  *
+  * The function consults the internal list of parse plugins for the
+  * ParserFactory to determine the list of pluginIds, then gets the appropriate
+  * extension points to instantiate as {@link Parser}s. Parser instances are
+  * kept in the {@link ObjectCache} under their extension id and reused.
+  *
+  * @param contentType
+  *          The contentType to return the <code>Array</code> of {@link Parser}
+  *          s for.
+  * @param url
+  *          The url of the content; only used to build the
+  *          {@link ParserNotFound} exception.
+  * @return An <code>Array</code> of {@link Parser}s for the given contentType.
+  *         If there were plugins mapped to a contentType via the
+  *         <code>parse-plugins.xml</code> file, but never enabled via the
+  *         <code>plugin.includes</code> Nutch conf, then those plugins won't
+  *         be part of this array, i.e., they will be skipped. So, if the
+  *         ordered list of parsing plugins for <code>text/plain</code> was
+  *         <code>[parse-text, parse-html, parse-rtf]</code>, and only
+  *         <code>parse-html</code> and <code>parse-rtf</code> were enabled via
+  *         <code>plugin.includes</code>, then this ordered Array would consist
+  *         of two {@link Parser} interfaces,
+  *         <code>[parse-html, parse-rtf]</code>. Parsers that fail to
+  *         instantiate are logged and skipped as well.
+  * @throws ParserNotFound
+  *           if no extension is known for the content type
+  */
+ public Parser[] getParsers(String contentType, String url)
+     throws ParserNotFound {
+
+   // TODO once the MimeTypes is available: try the mapped content type
+   // first and, as a last chance, guess the content type from the url.
+   List<Extension> parserExts = getExtensions(contentType);
+   if (parserExts == null) {
+     throw new ParserNotFound(url, contentType);
+   }
+
+   ObjectCache objectCache = ObjectCache.get(conf);
+   List<Parser> parsers = new ArrayList<Parser>(parserExts.size());
+   for (Extension ext : parserExts) {
+     try {
+       // check to see if we've cached this parser instance yet
+       Parser p = (Parser) objectCache.getObject(ext.getId());
+       if (p == null) {
+         // go ahead and instantiate it and then cache it
+         p = (Parser) ext.getExtensionInstance();
+         objectCache.setObject(ext.getId(), p);
+       }
+       parsers.add(p);
+     } catch (PluginRuntimeException e) {
+       if (LOG.isWarnEnabled()) {
+         // Pass the exception along so the reason for the failure is logged.
+         LOG.warn("ParserFactory:PluginRuntimeException when "
+             + "initializing parser plugin "
+             + ext.getDescriptor().getPluginId() + " instance in getParsers "
+             + "function: attempting to continue instantiating parsers", e);
+       }
+     }
+   }
+   return parsers.toArray(new Parser[parsers.size()]);
+ }
+
+ /**
+  * Function returns a {@link Parser} instance with the specified
+  * <code>id</code>, representing its extension ID. The id is first matched
+  * against the loaded extensions directly and, failing that, resolved as an
+  * alias. If the Parser instance isn't found, then the function throws a
+  * <code>ParserNotFound</code> exception. If the {@link Parser} is already
+  * present in the {@link ObjectCache}, the cached instance is returned;
+  * otherwise the Parser is instantiated and stored in the cache.
+  *
+  * @param id
+  *          The string extension ID (e.g.,
+  *          "org.apache.nutch.parse.rss.RSSParser",
+  *          "org.apache.nutch.parse.rtf.RTFParseFactory") of the
+  *          {@link Parser} implementation to return.
+  * @return A {@link Parser} implementation specified by the parameter
+  *         <code>id</code>.
+  * @throws ParserNotFound
+  *           If the Parser is not found (i.e., registered with the extension
+  *           point), or if there is a {@link PluginRuntimeException}
+  *           instantiating the {@link Parser}.
+  */
+ public Parser getParserById(String id) throws ParserNotFound {
+
+   Extension[] extensions = this.extensionPoint.getExtensions();
+   Extension parserExt = null;
+
+   if (id != null) {
+     parserExt = getExtension(extensions, id);
+   }
+   if (parserExt == null) {
+     parserExt = getExtensionFromAlias(extensions, id);
+   }
+
+   if (parserExt == null) {
+     throw new ParserNotFound("No Parser Found for id [" + id + "]");
+   }
+
+   // first check the cache (single lookup)
+   ObjectCache objectCache = ObjectCache.get(conf);
+   Parser cached = (Parser) objectCache.getObject(parserExt.getId());
+   if (cached != null) {
+     return cached;
+   }
+
+   // if not found in cache, instantiate the Parser
+   try {
+     Parser p = (Parser) parserExt.getExtensionInstance();
+     objectCache.setObject(parserExt.getId(), p);
+     return p;
+   } catch (PluginRuntimeException e) {
+     if (LOG.isWarnEnabled()) {
+       LOG.warn("Cannot initialize parser "
+           + parserExt.getDescriptor().getPluginId() + " (cause: "
+           + e.toString() + ")", e);
+     }
+     throw new ParserNotFound("Cannot init parser for id [" + id + "]");
+   }
+ }
+
+ /**
+  * Finds the best-suited parse plugins for a given contentType, consulting
+  * the object cache before searching.
+  *
+  * @param contentType
+  *          Content-Type for which we seek a parse plugin.
+  * @return a list of extensions to be used for this contentType. If none,
+  *         returns <code>null</code>.
+  */
+ @SuppressWarnings("unchecked")
+ protected List<Extension> getExtensions(String contentType) {
+
+   final ObjectCache cache = ObjectCache.get(conf);
+   // The cleaned content-type is used both for the search and as cache key.
+   final String type = MimeUtil.cleanMimeType(contentType);
+
+   final List<Extension> cached = (List<Extension>) cache.getObject(type);
+
+   // Reference comparison on purpose: the shared empty list is the marker
+   // for "already searched, nothing found".
+   if (cached == EMPTY_EXTENSION_LIST) {
+     return null;
+   }
+   if (cached != null) {
+     return cached;
+   }
+
+   // Nothing cached yet: search, then remember either the result or the
+   // empty marker.
+   final List<Extension> found = findExtensions(type);
+   cache.setObject(type, found != null ? found : EMPTY_EXTENSION_LIST);
+   return found;
+ }
+
+ /**
+  * Searches a list of suitable parse plugins for the given contentType.
+  * <p>
+  * The plugins preferred for the contentType in the parse-plugin file are
+  * tried first. If that yields nothing, the plugins registered under the
+  * {@link #DEFAULT_PLUGIN} wildcard are used.
+  *
+  * @param contentType
+  *          Content-Type for which we seek a parse plugin.
+  * @return List - List of extensions to be used for this contentType. If none,
+  *         returns null.
+  */
+ private List<Extension> findExtensions(String contentType) {
+
+   final Extension[] loaded = this.extensionPoint.getExtensions();
+
+   // Preferred plugins for this exact content type.
+   final List<String> preferredIds = this.parsePluginList
+       .getPluginList(contentType);
+   final List<Extension> preferred = matchExtensions(preferredIds, loaded,
+       contentType);
+   if (preferred != null) {
+     return preferred;
+   }
+
+   // Fall back to the default plugins.
+   final List<String> defaultIds = this.parsePluginList
+       .getPluginList(DEFAULT_PLUGIN);
+   return matchExtensions(defaultIds, loaded, DEFAULT_PLUGIN);
+ }
+
+ /**
+  * Builds the list of parser extensions to use for the given contentType.
+  * <ul>
+  * <li>When <code>plugins</code> is not <code>null</code>, each plugin id in
+  * it is looked up among <code>extensions</code>: first by id and
+  * contentType, then by id alone. An extension found only by id is still
+  * used, but a warning is logged; an id with no loaded extension is skipped
+  * with a warning.</li>
+  * <li>When <code>plugins</code> is <code>null</code>, every loaded extension
+  * whose <code>contentType</code> attribute is <code>*</code> or matches the
+  * contentType is collected (wildcard extensions are placed first) and the
+  * result is reported in the log. No exception is thrown.</li>
+  * </ul>
+  * Note that an empty, non-null <code>plugins</code> list takes the first path
+  * and therefore yields <code>null</code>.
+  *
+  * @param plugins
+  *          List of candidate plugin ids, or <code>null</code> if none are
+  *          mapped.
+  * @param extensions
+  *          Array of loaded extensions.
+  * @param contentType
+  *          Content-Type for which we seek a parse plugin.
+  * @return List - List of extensions to be used for this contentType. If none,
+  *         returns null.
+  */
+ private List<Extension> matchExtensions(List<String> plugins,
+     Extension[] extensions, String contentType) {
+
+   List<Extension> extList = new ArrayList<Extension>();
+   if (plugins != null) {
+
+     for (String parsePluginId : plugins) {
+
+       Extension ext = getExtension(extensions, parsePluginId, contentType);
+       // A null result means one of two things:
+       // - the plugin is mapped in parse-plugins.xml but was not enabled
+       //   via the plugin.includes nutch conf property, or
+       // - it is enabled, but its plugin.xml does not claim to support
+       //   this mimeType.
+       // Either way the situation is reported at WARN level.
+       if (ext == null) {
+         // try to get it just by its pluginId
+         ext = getExtension(extensions, parsePluginId);
+
+         if (LOG.isWarnEnabled()) {
+           if (ext != null) {
+             // plugin was enabled via plugin.includes
+             // its plugin.xml just doesn't claim to support that
+             // particular mimeType
+             LOG.warn("ParserFactory:Plugin: " + parsePluginId
+                 + " mapped to contentType " + contentType
+                 + " via parse-plugins.xml, but " + "its plugin.xml "
+                 + "file does not claim to support contentType: "
+                 + contentType);
+           } else {
+             // plugin wasn't enabled via plugin.includes
+             LOG.warn("ParserFactory: Plugin: " + parsePluginId
+                 + " mapped to contentType " + contentType
+                 + " via parse-plugins.xml, but not enabled via "
+                 + "plugin.includes in nutch-default.xml");
+           }
+         }
+       }
+
+       if (ext != null) {
+         // add it to the list
+         extList.add(ext);
+       }
+     }
+
+   } else {
+     // No plugin list is defined for this mimeType. There may still be
+     // plugins registered via the plugin.includes nutch conf property whose
+     // plugin.xml claims to support this contentType, so collect those and
+     // log the fact that they are not mapped.
+
+     for (Extension candidate : extensions) {
+       // Look the attribute up once per extension.
+       String supportedType = candidate.getAttribute("contentType");
+       if ("*".equals(supportedType)) {
+         // wildcard extensions go to the front of the list
+         extList.add(0, candidate);
+       } else if (supportedType != null
+           && contentType.matches(escapeContentType(supportedType))) {
+         extList.add(candidate);
+       }
+     }
+
+     if (!extList.isEmpty()) {
+       if (LOG.isInfoEnabled()) {
+         // Local, single-threaded buffer: StringBuilder is sufficient.
+         StringBuilder extensionsIDs = new StringBuilder("[");
+         boolean isFirst = true;
+         for (Extension ext : extList) {
+           if (!isFirst) {
+             extensionsIDs.append(" - ");
+           } else {
+             isFirst = false;
+           }
+           extensionsIDs.append(ext.getId());
+         }
+         extensionsIDs.append("]");
+         LOG.info("The parsing plugins: " + extensionsIDs.toString()
+             + " are enabled via the plugin.includes system "
+             + "property, and all claim to support the content type "
+             + contentType + ", but they are not mapped to it in the "
+             + "parse-plugins.xml file");
+       }
+     } else if (LOG.isDebugEnabled()) {
+       LOG.debug("ParserFactory:No parse plugins mapped or enabled for "
+           + "contentType " + contentType);
+     }
+   }
+
+   return extList.isEmpty() ? null : extList;
+ }
+
+ /**
+  * Turns a <code>contentType</code> attribute value into a regular
+  * expression: the characters <code>+</code> and <code>.</code> are escaped
+  * so that they match literally, everything else is left untouched. Keeping
+  * the remaining characters intact is what lets a single parser declare
+  * several types, while plain type names keep working as before.
+  */
+ private String escapeContentType(String contentType) {
+   String plusEscaped = contentType.replace("+", "\\+");
+   return plusEscaped.replace(".", "\\.");
+ }
+
+ /**
+  * Tells whether an extension has the given id and is usable for the given
+  * content type.
+  * <p>
+  * The extension is usable when its <code>contentType</code> attribute is
+  * <code>*</code>, when <code>type</code> matches that attribute (treated as
+  * a regular expression, see <code>escapeContentType</code>), or when
+  * <code>type</code> is <code>DEFAULT_PLUGIN</code>. An extension that
+  * declares no <code>contentType</code> attribute is handled without
+  * dereferencing the missing value.
+  *
+  * @param extension
+  *          the extension to test
+  * @param id
+  *          the extension id being looked for
+  * @param type
+  *          the content type being looked for
+  * @return <code>true</code> if the extension matches both id and type
+  */
+ private boolean match(Extension extension, String id, String type) {
+   if (!id.equals(extension.getId())) {
+     return false;
+   }
+   // May be null when the plugin.xml does not declare the attribute; the
+   // unmapped-extension scan in matchExtensions guards the same case.
+   String supportedType = extension.getAttribute("contentType");
+   return "*".equals(supportedType)
+       || (supportedType != null && type
+           .matches(escapeContentType(supportedType)))
+       || type.equals(DEFAULT_PLUGIN);
+ }
+
+ /**
+  * Returns the first extension in <code>list</code> that matches both the
+  * given id and the given content type, or <code>null</code> if there is
+  * none.
+  */
+ private Extension getExtension(Extension[] list, String id, String type) {
+   for (Extension candidate : list) {
+     if (match(candidate, id, type)) {
+       return candidate;
+     }
+   }
+   return null;
+ }
+
+ /**
+  * Returns the first extension in <code>list</code> whose id equals
+  * <code>id</code>, or <code>null</code> if there is none.
+  */
+ private Extension getExtension(Extension[] list, String id) {
+   for (Extension candidate : list) {
+     if (id.equals(candidate.getId())) {
+       return candidate;
+     }
+   }
+   return null;
+ }
+
+ /**
+  * Resolves <code>id</code> through the alias table of the parse-plugin list
+  * and returns the extension whose id equals the resolved value.
+  *
+  * @param list
+  *          the loaded extensions to search
+  * @param id
+  *          the key to look up in the alias table
+  * @return the matching extension, or <code>null</code> when no alias is
+  *         registered for <code>id</code> or no extension carries the
+  *         aliased id
+  */
+ private Extension getExtensionFromAlias(Extension[] list, String id) {
+   String aliasedId = parsePluginList.getAliases().get(id);
+   if (aliasedId == null) {
+     // Unknown alias: the id-based lookup would dereference the null id,
+     // so report "not found" instead.
+     return null;
+   }
+   return getExtension(list, aliasedId);
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
new file mode 100644
index 0000000..2857efa
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+/**
+ * A {@link ParseException} reporting that no parser was found. It can
+ * optionally carry the url and the content type the parser was sought for.
+ */
+public class ParserNotFound extends ParseException {
+
+  private static final long serialVersionUID = 23993993939L;
+
+  /** Url the parser was sought for; <code>null</code> if not supplied. */
+  private String url;
+
+  /** Content type the parser was sought for; <code>null</code> if not supplied. */
+  private String contentType;
+
+  /**
+   * Creates the exception with a message only; url and content type stay
+   * <code>null</code>.
+   */
+  public ParserNotFound(String message) {
+    this(null, null, message);
+  }
+
+  /**
+   * Creates the exception for the given url and content type, with a message
+   * generated from both.
+   */
+  public ParserNotFound(String url, String contentType) {
+    this(url, contentType, defaultMessage(url, contentType));
+  }
+
+  /**
+   * Creates the exception for the given url and content type with an explicit
+   * message.
+   */
+  public ParserNotFound(String url, String contentType, String message) {
+    super(message);
+    this.contentType = contentType;
+    this.url = url;
+  }
+
+  /** Builds the message used when the caller does not provide one. */
+  private static String defaultMessage(String url, String contentType) {
+    return "parser not found for contentType=" + contentType + " url=" + url;
+  }
+
+  /** @return the url given at construction, or <code>null</code> */
+  public String getUrl() {
+    return this.url;
+  }
+
+  /** @return the content type given at construction, or <code>null</code> */
+  public String getContentType() {
+    return this.contentType;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java b/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
new file mode 100644
index 0000000..40bd3e2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The {@link org.apache.nutch.parse.Parse Parse} interface and related classes.
+ */
+package org.apache.nutch.parse;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java b/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
new file mode 100644
index 0000000..f50c11a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+/**
+ * Checked exception used to report that a circular dependency has been
+ * detected.
+ *
+ * @author Jérôme Charron
+ */
+public class CircularDependencyException extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Creates the exception with a descriptive message and no cause.
+   *
+   * @param message
+   *          the detail message
+   */
+  public CircularDependencyException(String message) {
+    super(message);
+  }
+
+  /**
+   * Creates the exception as a wrapper around another throwable.
+   *
+   * @param cause
+   *          the underlying cause
+   */
+  public CircularDependencyException(Throwable cause) {
+    super(cause);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java b/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
new file mode 100644
index 0000000..b0ee0af
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * An <code>Extension</code> is a kind of listener descriptor that is
+ * installed on a concrete <code>ExtensionPoint</code>, which in turn acts as
+ * a kind of publisher. It records the owning plugin descriptor, the extension
+ * id, the id of the targeted extension point, the name of the implementing
+ * class and a set of string attributes.
+ */
+public class Extension {
+  private PluginDescriptor fDescriptor;
+  private String fId;
+  private String fTargetPoint;
+  private String fClazz;
+  private HashMap<String, String> fAttributes;
+  private Configuration conf;
+
+  /**
+   * Creates an extension with an empty attribute set.
+   *
+   * @param pDescriptor
+   *          a plugin descriptor
+   * @param pExtensionPoint
+   *          the id of the extension point this extension targets
+   * @param pId
+   *          a unique id of the extension
+   * @param pExtensionClass
+   *          the name of the class implementing the extension
+   * @param conf
+   *          the configuration later used to look up the plugin repository
+   *          and handed to {@link Configurable} extension instances
+   * @param pluginRepository
+   *          not used by this constructor
+   */
+  public Extension(PluginDescriptor pDescriptor, String pExtensionPoint,
+      String pId, String pExtensionClass, Configuration conf,
+      PluginRepository pluginRepository) {
+    this.fAttributes = new HashMap<String, String>();
+    setDescriptor(pDescriptor);
+    this.fTargetPoint = pExtensionPoint;
+    setId(pId);
+    setClazz(pExtensionClass);
+    this.conf = conf;
+  }
+
+  /**
+   * Returns the value of an attribute previously registered with
+   * {@link #addAttribute(String, String)}.
+   *
+   * @param pKey
+   *          the attribute key
+   * @return the attribute value, or <code>null</code> if the key is unknown
+   */
+  public String getAttribute(String pKey) {
+    return this.fAttributes.get(pKey);
+  }
+
+  /**
+   * Returns the full class name of the extension point implementation.
+   *
+   * @return the class name
+   */
+  public String getClazz() {
+    return this.fClazz;
+  }
+
+  /**
+   * Returns the unique id of the extension.
+   *
+   * @return the extension id
+   */
+  public String getId() {
+    return this.fId;
+  }
+
+  /**
+   * Adds an attribute. Intended to be used only while the model is created at
+   * plugin system start-up.
+   *
+   * @param pKey
+   *          the attribute key
+   * @param pValue
+   *          the attribute value
+   */
+  public void addAttribute(String pKey, String pValue) {
+    this.fAttributes.put(pKey, pValue);
+  }
+
+  /**
+   * Sets the name of the class implementing the concrete extension. Intended
+   * to be used only while the model is created at system start-up.
+   *
+   * @param extensionClazz
+   *          the extension class name to set
+   */
+  public void setClazz(String extensionClazz) {
+    this.fClazz = extensionClazz;
+  }
+
+  /**
+   * Sets the unique extension id. Intended to be used only while the model is
+   * created at system start-up.
+   *
+   * @param extensionID
+   *          the extension id to set
+   */
+  public void setId(String extensionID) {
+    this.fId = extensionID;
+  }
+
+  /**
+   * Returns the id of the extension point that is implemented by this
+   * extension.
+   *
+   * @return the extension point id given at construction
+   */
+  public String getTargetPoint() {
+    return this.fTargetPoint;
+  }
+
+  /**
+   * Returns a new instance of the extension implementation.
+   * <p>
+   * The implementing class is obtained from the {@link PluginRepository}
+   * belonging to this extension's configuration, the repository is asked for
+   * the plugin instance (so that the plugin is started if that has not
+   * happened yet), and the class is then instantiated through its no-argument
+   * constructor. If the new object is {@link Configurable}, it receives this
+   * extension's configuration.
+   * <p>
+   * As noted by the original authors, the plugin instance and the extension
+   * instance use the same <code>PluginClassLoader</code>; each plugin has its
+   * own class loader, which only knows the runtime libraries declared in the
+   * plugin manifest and the libraries exported by the plugins it depends on.
+   *
+   * @return an instance of the extension implementation
+   * @throws PluginRuntimeException
+   *           if the class cannot be found, instantiated or accessed
+   */
+  public Object getExtensionInstance() throws PluginRuntimeException {
+    // Creation and initialization of a plugin instance and its extension
+    // instance must be done by one and only one thread, hence the lock.
+    // PluginRepository.getPluginInstance() does the same.
+    // Suggested by Stefan Groschupf <sg...@media-style.com>
+    synchronized (getId()) {
+      try {
+        PluginRepository repository = PluginRepository.get(conf);
+        Class<?> implementation = repository.getCachedClass(fDescriptor,
+            getClazz());
+        // Lazily load the plugin in case no instance of it exists yet.
+        repository.getPluginInstance(getDescriptor());
+        Object instance = implementation.newInstance();
+        if (instance instanceof Configurable) {
+          Configurable configurable = (Configurable) instance;
+          configurable.setConf(this.conf);
+        }
+        return instance;
+      } catch (ClassNotFoundException notFound) {
+        throw new PluginRuntimeException(notFound);
+      } catch (InstantiationException notInstantiable) {
+        throw new PluginRuntimeException(notInstantiable);
+      } catch (IllegalAccessException notAccessible) {
+        throw new PluginRuntimeException(notAccessible);
+      }
+    }
+  }
+
+  /**
+   * Returns the plugin descriptor.
+   *
+   * @return the descriptor of the owning plugin
+   */
+  public PluginDescriptor getDescriptor() {
+    return this.fDescriptor;
+  }
+
+  /**
+   * Sets the plugin descriptor. Intended to be used only while the model is
+   * created at system start-up.
+   *
+   * @param pDescriptor
+   *          the plugin descriptor to set
+   */
+  public void setDescriptor(PluginDescriptor pDescriptor) {
+    this.fDescriptor = pDescriptor;
+  }
+}