You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC
svn commit: r1650447 [5/25] - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/
src/java/org/apache/nutch/api/impl/db/
src/java/org/apache/nutch/api/model/response/
src/java/org/apache/nutch/api/resources/ sr...
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java Fri Jan 9 06:34:33 2015
@@ -22,44 +22,52 @@ public interface ParseStatusCodes {
// Primary status codes:
/** Parsing was not performed. */
- public static final byte NOTPARSED = 0;
+ public static final byte NOTPARSED = 0;
/** Parsing succeeded. */
- public static final byte SUCCESS = 1;
+ public static final byte SUCCESS = 1;
/** General failure. There may be a more specific error message in arguments. */
- public static final byte FAILED = 2;
+ public static final byte FAILED = 2;
- public static final String[] majorCodes = {
- "notparsed",
- "success",
- "failed"
- };
+ public static final String[] majorCodes = { "notparsed", "success", "failed" };
// Secondary success codes go here:
- public static final short SUCCESS_OK = 0;
+ public static final short SUCCESS_OK = 0;
- /** Parsed content contains a directive to redirect to another URL.
- * The target URL can be retrieved from the arguments.
+ /**
+ * Parsed content contains a directive to redirect to another URL. The target
+ * URL can be retrieved from the arguments.
*/
- public static final short SUCCESS_REDIRECT = 100;
+ public static final short SUCCESS_REDIRECT = 100;
// Secondary failure codes go here:
- /** Parsing failed. An Exception occured (which may be retrieved from the arguments). */
- public static final short FAILED_EXCEPTION = 200;
- /** Parsing failed. Content was truncated, but the parser cannot handle incomplete content. */
- public static final short FAILED_TRUNCATED = 202;
- /** Parsing failed. Invalid format - the content may be corrupted or of wrong type. */
- public static final short FAILED_INVALID_FORMAT = 203;
- /** Parsing failed. Other related parts of the content are needed to complete
+ /**
+ * Parsing failed. An Exception occurred (which may be retrieved from the
+ * arguments).
+ */
+ public static final short FAILED_EXCEPTION = 200;
+ /**
+ * Parsing failed. Content was truncated, but the parser cannot handle
+ * incomplete content.
+ */
+ public static final short FAILED_TRUNCATED = 202;
+ /**
+ * Parsing failed. Invalid format - the content may be corrupted or of wrong
+ * type.
+ */
+ public static final short FAILED_INVALID_FORMAT = 203;
+ /**
+ * Parsing failed. Other related parts of the content are needed to complete
* parsing. The list of URLs to missing parts may be provided in arguments.
* The Fetcher may decide to fetch these parts at once, then put them into
* Content.metadata, and supply them for re-parsing.
*/
- public static final short FAILED_MISSING_PARTS = 204;
- /** Parsing failed. There was no content to be parsed - probably caused
- * by errors at protocol stage.
+ public static final short FAILED_MISSING_PARTS = 204;
+ /**
+ * Parsing failed. There was no content to be parsed - probably caused by
+ * errors at protocol stage.
*/
- public static final short FAILED_MISSING_CONTENT = 205;
-
+ public static final short FAILED_MISSING_CONTENT = 205;
+
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java Fri Jan 9 06:34:33 2015
@@ -29,10 +29,10 @@ import java.util.List;
public class ParseStatusUtils {
public static ParseStatus STATUS_SUCCESS = ParseStatus.newBuilder().build();
- public static final HashMap<Short,String> minorCodes = new HashMap<Short,String>();
+ public static final HashMap<Short, String> minorCodes = new HashMap<Short, String>();
static {
- STATUS_SUCCESS.setMajorCode((int)ParseStatusCodes.SUCCESS);
+ STATUS_SUCCESS.setMajorCode((int) ParseStatusCodes.SUCCESS);
minorCodes.put(ParseStatusCodes.SUCCESS_OK, "ok");
minorCodes.put(ParseStatusCodes.SUCCESS_REDIRECT, "redirect");
minorCodes.put(ParseStatusCodes.FAILED_EXCEPTION, "exception");
@@ -49,8 +49,9 @@ public class ParseStatusUtils {
return status.getMajorCode() == ParseStatusCodes.SUCCESS;
}
- /** A convenience method. Return a String representation of the first
- * argument, or null.
+ /**
+ * A convenience method. Return a String representation of the first argument,
+ * or null.
*/
public static String getMessage(ParseStatus status) {
List<CharSequence> args = status.getArgs();
@@ -77,29 +78,30 @@ public class ParseStatusUtils {
public static Parse getEmptyParse(Exception e, Configuration conf) {
ParseStatus status = ParseStatus.newBuilder().build();
- status.setMajorCode((int)ParseStatusCodes.FAILED);
- status.setMinorCode((int)ParseStatusCodes.FAILED_EXCEPTION);
+ status.setMajorCode((int) ParseStatusCodes.FAILED);
+ status.setMinorCode((int) ParseStatusCodes.FAILED_EXCEPTION);
status.getArgs().add(new Utf8(e.toString()));
return new Parse("", "", new Outlink[0], status);
}
- public static Parse getEmptyParse(int minorCode, String message, Configuration conf) {
+ public static Parse getEmptyParse(int minorCode, String message,
+ Configuration conf) {
ParseStatus status = ParseStatus.newBuilder().build();
- status.setMajorCode((int)ParseStatusCodes.FAILED);
+ status.setMajorCode((int) ParseStatusCodes.FAILED);
status.setMinorCode(minorCode);
status.getArgs().add(new Utf8(message));
return new Parse("", "", new Outlink[0], status);
}
-
+
public static String toString(ParseStatus status) {
if (status == null) {
return "(null)";
}
StringBuilder sb = new StringBuilder();
- sb.append(ParseStatusCodes.majorCodes[status.getMajorCode()] +
- "/" + minorCodes.get(status.getMinorCode().shortValue()));
+ sb.append(ParseStatusCodes.majorCodes[status.getMajorCode()] + "/"
+ + minorCodes.get(status.getMinorCode().shortValue()));
sb.append(" (" + status.getMajorCode() + "/" + status.getMinorCode() + ")");
sb.append(", args=[");
List<CharSequence> args = status.getArgs();
@@ -107,7 +109,8 @@ public class ParseStatusUtils {
int i = 0;
Iterator<CharSequence> it = args.iterator();
while (it.hasNext()) {
- if (i > 0) sb.append(',');
+ if (i > 0)
+ sb.append(',');
sb.append(it.next());
i++;
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Jan 9 06:34:33 2015
@@ -49,7 +49,7 @@ import java.util.concurrent.TimeUnit;
* A Utility class containing methods to simply perform parsing utilities such
* as iterating through a preferred list of {@link Parser}s to obtain
* {@link Parse} objects.
- *
+ *
* @author mattmann
* @author Jérôme Charron
* @author Sébastien Le Callonnec
@@ -60,7 +60,7 @@ public class ParseUtil extends Configure
public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
private static final int DEFAULT_MAX_PARSE_TIME = 30;
-
+
private Configuration conf;
private Signature sig;
private URLFilters filters;
@@ -71,9 +71,9 @@ public class ParseUtil extends Configure
/** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
private int maxParseTime;
private ExecutorService executorService;
-
+
/**
- *
+ *
* @param conf
*/
public ParseUtil(Configuration conf) {
@@ -90,15 +90,16 @@ public class ParseUtil extends Configure
public void setConf(Configuration conf) {
this.conf = conf;
parserFactory = new ParserFactory(conf);
- maxParseTime=conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
+ maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
sig = SignatureFactory.getSignature(conf);
filters = new URLFilters(conf);
normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
- maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
+ maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
+ : maxOutlinksPerPage;
ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
- .setNameFormat("parse-%d").setDaemon(true).build());
+ .setNameFormat("parse-%d").setDaemon(true).build());
}
/**
@@ -106,11 +107,13 @@ public class ParseUtil extends Configure
* until a successful parse is performed and a {@link Parse} object is
* returned. If the parse is unsuccessful, a message is logged to the
* <code>WARNING</code> level, and an empty parse is returned.
- *
- * @throws ParserNotFound If there is no suitable parser found.
- * @throws ParseException If there is an error parsing.
+ *
+ * @throws ParserNotFound
+ * If there is no suitable parser found.
+ * @throws ParseException
+ * If there is an error parsing.
*/
- public Parse parse(String url, WebPage page) throws ParserNotFound,
+ public Parse parse(String url, WebPage page) throws ParserNotFound,
ParseException {
Parser[] parsers = null;
@@ -118,28 +121,29 @@ public class ParseUtil extends Configure
parsers = this.parserFactory.getParsers(contentType, url);
- for (int i=0; i<parsers.length; i++) {
+ for (int i = 0; i < parsers.length; i++) {
if (LOG.isDebugEnabled()) {
LOG.debug("Parsing [" + url + "] with [" + parsers[i] + "]");
}
Parse parse = null;
-
- if (maxParseTime!=-1)
- parse = runParser(parsers[i], url, page);
- else
- parse = parsers[i].getParse(url, page);
-
- if (parse!=null && ParseStatusUtils.isSuccess(parse.getParseStatus())) {
+
+ if (maxParseTime != -1)
+ parse = runParser(parsers[i], url, page);
+ else
+ parse = parsers[i].getParse(url, page);
+
+ if (parse != null && ParseStatusUtils.isSuccess(parse.getParseStatus())) {
return parse;
}
}
- LOG.warn("Unable to successfully parse content " + url +
- " of type " + contentType);
- return ParseStatusUtils.getEmptyParse(new ParseException("Unable to successfully parse content"), null);
+ LOG.warn("Unable to successfully parse content " + url + " of type "
+ + contentType);
+ return ParseStatusUtils.getEmptyParse(new ParseException(
+ "Unable to successfully parse content"), null);
}
-
- private Parse runParser(Parser p, String url, WebPage page) {
+
+ private Parse runParser(Parser p, String url, WebPage page) {
ParseCallable pc = new ParseCallable(p, page, url);
Future<Parse> task = executorService.submit(pc);
Parse res = null;
@@ -155,8 +159,9 @@ public class ParseUtil extends Configure
}
/**
- * Parses given web page and stores parsed content within page. Puts
- * a meta-redirect to outlinks.
+ * Parses given web page and stores parsed content within page. Puts a
+ * meta-redirect to outlinks.
+ *
* @param key
* @param page
*/
@@ -165,7 +170,8 @@ public class ParseUtil extends Configure
byte status = page.getStatus().byteValue();
if (status != CrawlStatus.STATUS_FETCHED) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Skipping " + url + " as status is: " + CrawlStatus.getName(status));
+ LOG.debug("Skipping " + url + " as status is: "
+ + CrawlStatus.getName(status));
}
return;
}
@@ -213,7 +219,8 @@ public class ParseUtil extends Configure
return;
}
page.getOutlinks().put(new Utf8(newUrl), new Utf8());
- page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
+ page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED,
+ TableUtil.YES_VAL);
if (newUrl == null || newUrl.equals(url)) {
String reprUrl = URLUtil.chooseRepr(url, newUrl,
refreshTime < FetcherJob.PERM_REFRESH_TIME);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java Fri Jan 9 06:34:33 2015
@@ -22,9 +22,10 @@ import org.apache.hadoop.conf.Configurab
import org.apache.nutch.plugin.FieldPluggable;
import org.apache.nutch.storage.WebPage;
-/** A parser for content generated by a {@link org.apache.nutch.protocol.Protocol}
- * implementation. This interface is implemented by extensions. Nutch's core
- * contains no page parsing code.
+/**
+ * A parser for content generated by a
+ * {@link org.apache.nutch.protocol.Protocol} implementation. This interface is
+ * implemented by extensions. Nutch's core contains no page parsing code.
*/
public interface Parser extends FieldPluggable, Configurable {
/** The name of the extension point. */
@@ -34,8 +35,9 @@ public interface Parser extends FieldPlu
* <p>
* This method parses content in WebPage instance
* </p>
- *
- * @param url Page's URL
+ *
+ * @param url
+ * Page's URL
* @param page
*/
Parse getParse(String url, WebPage page);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Fri Jan 9 06:34:33 2015
@@ -37,28 +37,30 @@ import java.util.Map;
import java.util.Map.Entry;
/**
- * Parser checker, useful for testing parser.
- * It also accurately reports possible fetching and
- * parsing failures and presents protocol status signals to aid
- * debugging. The tool enables us to retrieve the following data from
- * any url:
+ * Parser checker, useful for testing parser. It also accurately reports
+ * possible fetching and parsing failures and presents protocol status signals
+ * to aid debugging. The tool enables us to retrieve the following data from any
+ * url:
* <ol>
- * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} type.</li>
- * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and is used to remove
- * duplicates during the dedup procedure.
- * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
+ * type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and
+ * is used to remove duplicates during the dedup procedure. It is calculated
+ * using {@link org.apache.nutch.crawl.MD5Signature} or
* {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
* <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
* <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
* <li><tt>Title</tt>: of the URL</li>
* <li><tt>Outlinks</tt>: associated with the URL</li>
* <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
- * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, <i>Cache-Control</>, etc.</li>
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
+ * <i>Cache-Control</i>, etc.</li>
* <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
* <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
- * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing on
- * <code>content.length</code> configuration.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length depending
+ * on <code>content.length</code> configuration.</li>
* </ol>
+ *
* @author John Xing
*/
@@ -107,7 +109,7 @@ public class ParserChecker implements To
ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
- if(!protocolOutput.getStatus().isSuccess()) {
+ if (!protocolOutput.getStatus().isSuccess()) {
LOG.error("Fetch failed with protocol status: "
+ ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
+ ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
@@ -155,7 +157,6 @@ public class ParserChecker implements To
LOG.info("signature: " + StringUtil.toHexString(signature));
}
-
LOG.info("---------\nUrl\n---------------\n");
System.out.print(url + "\n");
LOG.info("---------\nMetadata\n---------\n");
@@ -167,7 +168,7 @@ public class ParserChecker implements To
while (iterator.hasNext()) {
Entry<CharSequence, ByteBuffer> entry = iterator.next();
sb.append(entry.getKey().toString()).append(" : \t")
- .append(Bytes.toString(entry.getValue())).append("\n");
+ .append(Bytes.toString(entry.getValue())).append("\n");
}
System.out.print(sb.toString());
}
@@ -182,12 +183,12 @@ public class ParserChecker implements To
Map<CharSequence, CharSequence> headers = page.getHeaders();
StringBuffer headersb = new StringBuffer();
if (metadata != null) {
- Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet()
- .iterator();
+ Iterator<Entry<CharSequence, CharSequence>> iterator = headers
+ .entrySet().iterator();
while (iterator.hasNext()) {
Entry<CharSequence, CharSequence> entry = iterator.next();
headersb.append(entry.getKey().toString()).append(" : \t")
- .append(entry.getValue()).append("\n");
+ .append(entry.getValue()).append("\n");
}
System.out.print(headersb.toString());
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java Fri Jan 9 06:34:33 2015
@@ -34,8 +34,7 @@ import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.ObjectCache;
-
-/** Creates and caches {@link Parser} plugins.*/
+/** Creates and caches {@link Parser} plugins. */
public final class ParserFactory {
public static final Logger LOG = LoggerFactory.getLogger(ParserFactory.class);
@@ -44,8 +43,7 @@ public final class ParserFactory {
public static final String DEFAULT_PLUGIN = "*";
/** Empty extension list for caching purposes. */
- private final List<Extension> EMPTY_EXTENSION_LIST =
- new ArrayList<Extension>();
+ private final List<Extension> EMPTY_EXTENSION_LIST = new ArrayList<Extension>();
private final Configuration conf;
private final ExtensionPoint extensionPoint;
@@ -56,10 +54,12 @@ public final class ParserFactory {
ObjectCache objectCache = ObjectCache.get(conf);
this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
Parser.X_POINT_ID);
- this.parsePluginList = (ParsePluginList)objectCache.getObject(ParsePluginList.class.getName());
+ this.parsePluginList = (ParsePluginList) objectCache
+ .getObject(ParsePluginList.class.getName());
if (this.parsePluginList == null) {
this.parsePluginList = new ParsePluginsReader().parse(conf);
- objectCache.setObject(ParsePluginList.class.getName(), this.parsePluginList);
+ objectCache.setObject(ParsePluginList.class.getName(),
+ this.parsePluginList);
}
if (this.extensionPoint == null) {
@@ -71,33 +71,34 @@ public final class ParserFactory {
}
}
-
/**
* Function returns an array of {@link Parser}s for a given content type.
- *
+ *
* The function consults the internal list of parse plugins for the
- * ParserFactory to determine the list of pluginIds, then gets the
- * appropriate extension points to instantiate as {@link Parser}s.
- *
- * @param contentType The contentType to return the <code>Array</code>
- * of {@link Parser}s for.
- * @param url The url for the content that may allow us to get the type from
- * the file suffix.
+ * ParserFactory to determine the list of pluginIds, then gets the appropriate
+ * extension points to instantiate as {@link Parser}s.
+ *
+ * @param contentType
+ * The contentType to return the <code>Array</code> of
+ * {@link Parser}s for.
+ * @param url
+ * The url for the content that may allow us to get the type from the
+ * file suffix.
* @return An <code>Array</code> of {@link Parser}s for the given contentType.
* If there were plugins mapped to a contentType via the
- * <code>parse-plugins.xml</code> file, but never enabled via
- * the <code>plugin.includes</code> Nutch conf, then those plugins
- * won't be part of this array, i.e., they will be skipped.
- * So, if the ordered list of parsing plugins for
- * <code>text/plain</code> was <code>[parse-text,parse-html,
+ * <code>parse-plugins.xml</code> file, but never enabled via the
+ * <code>plugin.includes</code> Nutch conf, then those plugins won't
+ * be part of this array, i.e., they will be skipped. So, if the
+ * ordered list of parsing plugins for <code>text/plain</code> was
+ * <code>[parse-text,parse-html,
* parse-rtf]</code>, and only <code>parse-html</code> and
* <code>parse-rtf</code> were enabled via
- * <code>plugin.includes</code>, then this ordered Array would
- * consist of two {@link Parser} interfaces,
+ * <code>plugin.includes</code>, then this ordered Array would consist
+ * of two {@link Parser} interfaces,
* <code>[parse-html, parse-rtf]</code>.
*/
public Parser[] getParsers(String contentType, String url)
- throws ParserNotFound {
+ throws ParserNotFound {
List<Parser> parsers = null;
List<Extension> parserExts = null;
@@ -107,7 +108,7 @@ public final class ParserFactory {
// TODO once the MimeTypes is available
// parsers = getExtensions(MimeUtils.map(contentType));
// if (parsers != null) {
- // return parsers;
+ // return parsers;
// }
// Last Chance: Guess content-type from file url...
// parsers = getExtensions(MimeUtils.getMimeType(url));
@@ -118,49 +119,50 @@ public final class ParserFactory {
}
parsers = new ArrayList<Parser>(parserExts.size());
- for (Extension ext : parserExts){
+ for (Extension ext : parserExts) {
Parser p = null;
try {
- //check to see if we've cached this parser instance yet
+ // check to see if we've cached this parser instance yet
p = (Parser) objectCache.getObject(ext.getId());
if (p == null) {
// go ahead and instantiate it and then cache it
p = (Parser) ext.getExtensionInstance();
- objectCache.setObject(ext.getId(),p);
+ objectCache.setObject(ext.getId(), p);
}
parsers.add(p);
} catch (PluginRuntimeException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("ParserFactory:PluginRuntimeException when "
- + "initializing parser plugin "
- + ext.getDescriptor().getPluginId()
- + " instance in getParsers "
- + "function: attempting to continue instantiating parsers: ", e);
+ + "initializing parser plugin "
+ + ext.getDescriptor().getPluginId() + " instance in getParsers "
+ + "function: attempting to continue instantiating parsers: ", e);
}
}
}
- return parsers.toArray(new Parser[]{});
+ return parsers.toArray(new Parser[] {});
}
/**
* Function returns a {@link Parser} instance with the specified
- * <code>extId</code>, representing its extension ID. If the Parser
- * instance isn't found, then the function throws a
- * <code>ParserNotFound</code> exception. If the function is able to find
- * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it
- * will return the already instantiated Parser. Otherwise, if it has to
- * instantiate the Parser itself , then this function will cache that Parser
- * in the internal <code>PARSER_CACHE</code>.
- *
- * @param id The string extension ID (e.g.,
- * "org.apache.nutch.parse.rss.RSSParser",
- * "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser}
- * implementation to return.
+ * <code>extId</code>, representing its extension ID. If the Parser instance
+ * isn't found, then the function throws a <code>ParserNotFound</code>
+ * exception. If the function is able to find the {@link Parser} in the
+ * internal <code>PARSER_CACHE</code> then it will return the already
+ * instantiated Parser. Otherwise, if it has to instantiate the Parser itself,
+ * then this function will cache that Parser in the internal
+ * <code>PARSER_CACHE</code>.
+ *
+ * @param id
+ * The string extension ID (e.g.,
+ * "org.apache.nutch.parse.rss.RSSParser",
+ * "org.apache.nutch.parse.rtf.RTFParseFactory") of the
+ * {@link Parser} implementation to return.
* @return A {@link Parser} implementation specified by the parameter
* <code>id</code>.
- * @throws ParserNotFound If the Parser is not found (i.e., registered with
- * the extension point), or if the there a
- * {@link PluginRuntimeException} instantiating the {@link Parser}.
+ * @throws ParserNotFound
+ * If the Parser is not found (i.e., registered with the extension
+ * point), or if there is a {@link PluginRuntimeException}
+ * instantiating the {@link Parser}.
*/
public Parser getParserById(String id) throws ParserNotFound {
@@ -184,7 +186,7 @@ public final class ParserFactory {
if (objectCache.getObject(parserExt.getId()) != null) {
return (Parser) objectCache.getObject(parserExt.getId());
- // if not found in cache, instantiate the Parser
+ // if not found in cache, instantiate the Parser
} else {
try {
Parser p = (Parser) parserExt.getExtensionInstance();
@@ -192,9 +194,9 @@ public final class ParserFactory {
return p;
} catch (PluginRuntimeException e) {
if (LOG.isWarnEnabled()) {
- LOG.warn("Canno initialize parser " +
- parserExt.getDescriptor().getPluginId() +
- " (cause: " + e.toString());
+ LOG.warn("Canno initialize parser "
+ + parserExt.getDescriptor().getPluginId() + " (cause: "
+ + e.toString());
}
throw new ParserNotFound("Cannot init parser for id [" + id + "]");
}
@@ -212,7 +214,7 @@ public final class ParserFactory {
columns.addAll(pluginFields);
}
} catch (PluginRuntimeException e) {
- LOG.error("PluginRuntimeException",e);
+ LOG.error("PluginRuntimeException", e);
}
}
return columns;
@@ -220,10 +222,11 @@ public final class ParserFactory {
/**
* Finds the best-suited parse plugin for a given contentType.
- *
- * @param contentType Content-Type for which we seek a parse plugin.
- * @return a list of extensions to be used for this contentType.
- * If none, returns <code>null</code>.
+ *
+ * @param contentType
+ * Content-Type for which we seek a parse plugin.
+ * @return a list of extensions to be used for this contentType. If none,
+ * returns <code>null</code>.
*/
@SuppressWarnings("unchecked")
protected List<Extension> getExtensions(String contentType) {
@@ -246,8 +249,8 @@ public final class ParserFactory {
if (extensions != null) {
objectCache.setObject(type, extensions);
} else {
- // Put the empty extension list into cache
- // to remember we don't know any related extension.
+ // Put the empty extension list into cache
+ // to remember we don't know any related extension.
objectCache.setObject(type, EMPTY_EXTENSION_LIST);
}
}
@@ -256,22 +259,24 @@ public final class ParserFactory {
/**
* searches a list of suitable parse plugins for the given contentType.
- * <p>It first looks for a preferred plugin defined in the parse-plugin
- * file. If none is found, it returns a list of default plugins.
- *
- * @param contentType Content-Type for which we seek a parse plugin.
- * @return List - List of extensions to be used for this contentType.
- * If none, returns null.
+ * <p>
+ * It first looks for a preferred plugin defined in the parse-plugin file. If
+ * none is found, it returns a list of default plugins.
+ *
+ * @param contentType
+ * Content-Type for which we seek a parse plugin.
+ * @return List - List of extensions to be used for this contentType. If none,
+ * returns null.
*/
private List<Extension> findExtensions(String contentType) {
Extension[] extensions = this.extensionPoint.getExtensions();
// Look for a preferred plugin.
- List<String> parsePluginList =
- this.parsePluginList.getPluginList(contentType);
- List<Extension> extensionList =
- matchExtensions(parsePluginList, extensions, contentType);
+ List<String> parsePluginList = this.parsePluginList
+ .getPluginList(contentType);
+ List<Extension> extensionList = matchExtensions(parsePluginList,
+ extensions, contentType);
if (extensionList != null) {
return extensionList;
}
@@ -284,20 +289,23 @@ public final class ParserFactory {
/**
* Tries to find a suitable parser for the given contentType.
* <ol>
- * <li>It checks if a parser which accepts the contentType
- * can be found in the <code>plugins</code> list;</li>
- * <li>If this list is empty, it tries to find amongst the loaded
- * extensions whether some of them might suit and warns the user.</li>
+ * <li>It checks if a parser which accepts the contentType can be found in the
+ * <code>plugins</code> list;</li>
+ * <li>If this list is empty, it tries to find amongst the loaded extensions
+ * whether some of them might suit and warns the user.</li>
* </ol>
- * @param plugins List of candidate plugins.
- * @param extensions Array of loaded extensions.
- * @param contentType Content-Type for which we seek a parse plugin.
- * @return List - List of extensions to be used for this contentType.
- * If none, returns null.
+ *
+ * @param plugins
+ * List of candidate plugins.
+ * @param extensions
+ * Array of loaded extensions.
+ * @param contentType
+ * Content-Type for which we seek a parse plugin.
+ * @return List - List of extensions to be used for this contentType. If none,
+ * returns null.
*/
private List<Extension> matchExtensions(List<String> plugins,
- Extension[] extensions,
- String contentType) {
+ Extension[] extensions, String contentType) {
List<Extension> extList = new ArrayList<Extension>();
if (plugins != null) {
@@ -315,7 +323,7 @@ public final class ParserFactory {
// in either case, LOG the appropriate error message to WARN level
if (ext == null) {
- //try to get it just by its pluginId
+ // try to get it just by its pluginId
ext = getExtension(extensions, parsePluginId);
if (LOG.isWarnEnabled()) {
@@ -323,17 +331,17 @@ public final class ParserFactory {
// plugin was enabled via plugin.includes
// its plugin.xml just doesn't claim to support that
// particular mimeType
- LOG.warn("ParserFactory:Plugin: " + parsePluginId +
- " mapped to contentType " + contentType +
- " via parse-plugins.xml, but " + "its plugin.xml " +
- "file does not claim to support contentType: " +
- contentType);
+ LOG.warn("ParserFactory:Plugin: " + parsePluginId
+ + " mapped to contentType " + contentType
+ + " via parse-plugins.xml, but " + "its plugin.xml "
+ + "file does not claim to support contentType: "
+ + contentType);
} else {
// plugin wasn't enabled via plugin.includes
- LOG.warn("ParserFactory: Plugin: " + parsePluginId +
- " mapped to contentType " + contentType +
- " via parse-plugins.xml, but not enabled via " +
- "plugin.includes in nutch-default.xml");
+ LOG.warn("ParserFactory: Plugin: " + parsePluginId
+ + " mapped to contentType " + contentType
+ + " via parse-plugins.xml, but not enabled via "
+ + "plugin.includes in nutch-default.xml");
}
}
}
@@ -353,12 +361,12 @@ public final class ParserFactory {
// any extensions where this is the case, throw a
// NotMappedParserException
- for (int i=0; i<extensions.length; i++) {
- if ("*".equals(extensions[i].getAttribute("contentType"))){
+ for (int i = 0; i < extensions.length; i++) {
+ if ("*".equals(extensions[i].getAttribute("contentType"))) {
extList.add(0, extensions[i]);
- }
- else if (extensions[i].getAttribute("contentType") != null
- && contentType.matches(escapeContentType(extensions[i].getAttribute("contentType")))) {
+ } else if (extensions[i].getAttribute("contentType") != null
+ && contentType.matches(escapeContentType(extensions[i]
+ .getAttribute("contentType")))) {
extList.add(extensions[i]);
}
}
@@ -367,21 +375,23 @@ public final class ParserFactory {
if (LOG.isInfoEnabled()) {
StringBuffer extensionsIDs = new StringBuffer("[");
boolean isFirst = true;
- for (Extension ext : extList){
- if (!isFirst) extensionsIDs.append(" - ");
- else isFirst=false;
- extensionsIDs.append(ext.getId());
+ for (Extension ext : extList) {
+ if (!isFirst)
+ extensionsIDs.append(" - ");
+ else
+ isFirst = false;
+ extensionsIDs.append(ext.getId());
}
- extensionsIDs.append("]");
- LOG.info("The parsing plugins: " + extensionsIDs.toString() +
- " are enabled via the plugin.includes system " +
- "property, and all claim to support the content type " +
- contentType + ", but they are not mapped to it in the " +
- "parse-plugins.xml file");
+ extensionsIDs.append("]");
+ LOG.info("The parsing plugins: " + extensionsIDs.toString()
+ + " are enabled via the plugin.includes system "
+ + "property, and all claim to support the content type "
+ + contentType + ", but they are not mapped to it in the "
+ + "parse-plugins.xml file");
}
} else if (LOG.isDebugEnabled()) {
- LOG.debug("ParserFactory:No parse plugins mapped or enabled for " +
- "contentType " + contentType);
+ LOG.debug("ParserFactory:No parse plugins mapped or enabled for "
+ + "contentType " + contentType);
}
}
@@ -389,23 +399,22 @@ public final class ParserFactory {
}
private String escapeContentType(String contentType) {
- // Escapes contentType in order to use as a regex
- // (and keep backwards compatibility).
- // This enables to accept multiple types for a single parser.
- return contentType.replace("+", "\\+").replace(".", "\\.");
- }
-
+ // Escapes contentType in order to use as a regex
+ // (and keep backwards compatibility).
+ // This enables to accept multiple types for a single parser.
+ return contentType.replace("+", "\\+").replace(".", "\\.");
+ }
- private boolean match(Extension extension, String id, String type) {
- return (id.equals(extension.getId())) &&
- (extension.getAttribute("contentType").equals("*") ||
- type.matches(escapeContentType(extension.getAttribute("contentType"))) ||
- type.equals(DEFAULT_PLUGIN));
+ private boolean match(Extension extension, String id, String type) {
+ return (id.equals(extension.getId()))
+ && (extension.getAttribute("contentType").equals("*")
+ || type.matches(escapeContentType(extension
+ .getAttribute("contentType"))) || type.equals(DEFAULT_PLUGIN));
}
/** Get an extension from its id and supported content-type. */
private Extension getExtension(Extension[] list, String id, String type) {
- for (int i=0; i<list.length; i++) {
+ for (int i = 0; i < list.length; i++) {
if (match(list[i], id, type)) {
return list[i];
}
@@ -414,7 +423,7 @@ public final class ParserFactory {
}
private Extension getExtension(Extension[] list, String id) {
- for (int i=0; i<list.length; i++) {
+ for (int i = 0; i < list.length; i++) {
if (id.equals(list[i].getId())) {
return list[i];
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Fri Jan 9 06:34:33 2015
@@ -58,9 +58,9 @@ public class ParserJob extends NutchTool
private static final String RESUME_KEY = "parse.job.resume";
private static final String FORCE_KEY = "parse.job.force";
-
+
public static final String SKIP_TRUNCATED = "parser.skip.truncated";
-
+
private static final Utf8 REPARSE = new Utf8("-reparse");
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -79,9 +79,8 @@ public class ParserJob extends NutchTool
FIELDS.add(WebPage.Field.HEADERS);
}
-
- public static class ParserMapper
- extends GoraMapper<String, WebPage, String, WebPage> {
+ public static class ParserMapper extends
+ GoraMapper<String, WebPage, String, WebPage> {
private ParseUtil parseUtil;
private boolean shouldResume;
@@ -91,15 +90,16 @@ public class ParserJob extends NutchTool
private Utf8 batchId;
private boolean skipTruncated;
-
+
@Override
public void setup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
parseUtil = new ParseUtil(conf);
shouldResume = conf.getBoolean(RESUME_KEY, false);
force = conf.getBoolean(FORCE_KEY, false);
- batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
- skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true);
+ batchId = new Utf8(
+ conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
+ skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true);
}
@Override
@@ -131,7 +131,6 @@ public class ParserJob extends NutchTool
if (skipTruncated && isTruncated(unreverseKey, page)) {
return;
}
-
parseUtil.process(key, page);
ParseStatus pstatus = page.getParseStatus();
@@ -141,9 +140,9 @@ public class ParserJob extends NutchTool
}
context.write(key, page);
- }
+ }
}
-
+
public ParserJob() {
}
@@ -151,20 +150,22 @@ public class ParserJob extends NutchTool
public ParserJob(Configuration conf) {
setConf(conf);
}
-
+
/**
* Checks if the page's content is truncated.
- * @param url
+ *
+ * @param url
* @param page
- * @return If the page is truncated <code>true</code>. When it is not,
- * or when it could be determined, <code>false</code>.
+ * @return If the page is truncated <code>true</code>. When it is not, or when
+ * it could be determined, <code>false</code>.
*/
public static boolean isTruncated(String url, WebPage page) {
ByteBuffer content = page.getContent();
if (content == null) {
return false;
}
- CharSequence lengthUtf8 = page.getHeaders().get(new Utf8(HttpHeaders.CONTENT_LENGTH));
+ CharSequence lengthUtf8 = page.getHeaders().get(
+ new Utf8(HttpHeaders.CONTENT_LENGTH));
if (lengthUtf8 == null) {
return false;
}
@@ -186,7 +187,8 @@ public class ParserJob extends NutchTool
return true;
}
if (LOG.isDebugEnabled()) {
- LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
+ LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
+ + inHeaderSize);
}
return false;
}
@@ -198,8 +200,8 @@ public class ParserJob extends NutchTool
ParseFilters parseFilters = new ParseFilters(conf);
Collection<WebPage.Field> parsePluginFields = parserFactory.getFields();
- Collection<WebPage.Field> signaturePluginFields =
- SignatureFactory.getFields(conf);
+ Collection<WebPage.Field> signaturePluginFields = SignatureFactory
+ .getFields(conf);
Collection<WebPage.Field> htmlParsePluginFields = parseFilters.getFields();
if (parsePluginFields != null) {
@@ -226,11 +228,11 @@ public class ParserJob extends NutchTool
}
@Override
- public Map<String,Object> run(Map<String,Object> args) throws Exception {
- String batchId = (String)args.get(Nutch.ARG_BATCH);
- Boolean shouldResume = (Boolean)args.get(Nutch.ARG_RESUME);
- Boolean force = (Boolean)args.get(Nutch.ARG_FORCE);
-
+ public Map<String, Object> run(Map<String, Object> args) throws Exception {
+ String batchId = (String) args.get(Nutch.ARG_BATCH);
+ Boolean shouldResume = (Boolean) args.get(Nutch.ARG_RESUME);
+ Boolean force = (Boolean) args.get(Nutch.ARG_FORCE);
+
if (batchId != null) {
getConf().set(GeneratorJob.BATCH_ID, batchId);
}
@@ -241,17 +243,18 @@ public class ParserJob extends NutchTool
getConf().setBoolean(FORCE_KEY, force);
}
LOG.info("ParserJob: resuming:\t" + getConf().getBoolean(RESUME_KEY, false));
- LOG.info("ParserJob: forced reparse:\t" + getConf().getBoolean(FORCE_KEY, false));
+ LOG.info("ParserJob: forced reparse:\t"
+ + getConf().getBoolean(FORCE_KEY, false));
if (batchId == null || batchId.equals(Nutch.ALL_BATCH_ID_STR)) {
LOG.info("ParserJob: parsing all");
} else {
LOG.info("ParserJob: batchId:\t" + batchId);
}
currentJob = new NutchJob(getConf(), "parse");
-
+
Collection<WebPage.Field> fields = getFields(currentJob);
MapFieldValueFilter<String, WebPage> batchIdFilter = getBatchIdFilter(batchId);
- StorageUtils.initMapperJob(currentJob, fields, String.class, WebPage.class,
+ StorageUtils.initMapperJob(currentJob, fields, String.class, WebPage.class,
ParserMapper.class, batchIdFilter);
StorageUtils.initReducerJob(currentJob, IdentityPageReducer.class);
currentJob.setNumReduceTasks(0);
@@ -275,20 +278,20 @@ public class ParserJob extends NutchTool
return filter;
}
- public int parse(String batchId, boolean shouldResume, boolean force) throws Exception {
-
+ public int parse(String batchId, boolean shouldResume, boolean force)
+ throws Exception {
+
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("ParserJob: starting at " + sdf.format(start));
- run(ToolUtil.toArgMap(
- Nutch.ARG_BATCH, batchId,
- Nutch.ARG_RESUME, shouldResume,
- Nutch.ARG_FORCE, force));
+ run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId, Nutch.ARG_RESUME,
+ shouldResume, Nutch.ARG_FORCE, force));
LOG.info("ParserJob: success");
-
+
long finish = System.currentTimeMillis();
- LOG.info("ParserJob: finished at " + sdf.format(finish) + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
+ LOG.info("ParserJob: finished at " + sdf.format(finish)
+ + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
return 0;
}
@@ -298,12 +301,18 @@ public class ParserJob extends NutchTool
String batchId = null;
if (args.length < 1) {
- System.err.println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force]");
- System.err.println(" <batchId> - symbolic batch ID created by Generator");
- System.err.println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)");
- System.err.println(" -all - consider pages from all crawl jobs");
- System.err.println(" -resume - resume a previous incomplete job");
- System.err.println(" -force - force re-parsing even if a page is already parsed");
+ System.err
+ .println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force]");
+ System.err
+ .println(" <batchId> - symbolic batch ID created by Generator");
+ System.err
+ .println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)");
+ System.err
+ .println(" -all - consider pages from all crawl jobs");
+ System.err
+ .println(" -resume - resume a previous incomplete job");
+ System.err
+ .println(" -force - force re-parsing even if a page is already parsed");
return -1;
}
for (int i = 0; i < args.length; i++) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java Fri Jan 9 06:34:33 2015
@@ -18,17 +18,17 @@ package org.apache.nutch.parse;
public class ParserNotFound extends ParseException {
- private static final long serialVersionUID=23993993939L;
+ private static final long serialVersionUID = 23993993939L;
private String url;
private String contentType;
- public ParserNotFound(String message){
- super(message);
+ public ParserNotFound(String message) {
+ super(message);
}
-
+
public ParserNotFound(String url, String contentType) {
- this(url, contentType,
- "parser not found for contentType="+contentType+" url="+url);
+ this(url, contentType, "parser not found for contentType=" + contentType
+ + " url=" + url);
}
public ParserNotFound(String url, String contentType, String message) {
@@ -37,6 +37,11 @@ public class ParserNotFound extends Pars
this.contentType = contentType;
}
- public String getUrl() { return url; }
- public String getContentType() { return contentType; }
+ public String getUrl() {
+ return url;
+ }
+
+ public String getContentType() {
+ return contentType;
+ }
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java Fri Jan 9 06:34:33 2015
@@ -19,3 +19,4 @@
* The {@link org.apache.nutch.parse.Parse Parse} interface and related classes.
*/
package org.apache.nutch.parse;
+
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java Fri Jan 9 06:34:33 2015
@@ -16,7 +16,6 @@
*/
package org.apache.nutch.plugin;
-
/**
* <code>CircularDependencyException</code> will be thrown if a circular
* dependency is detected.
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java Fri Jan 9 06:34:33 2015
@@ -94,8 +94,10 @@ public class Extension {
* Adds a attribute and is only used until model creation at plugin system
* start up.
*
- * @param pKey a key
- * @param pValue a value
+ * @param pKey
+ * a key
+ * @param pValue
+ * a value
*/
public void addAttribute(String pKey, String pValue) {
fAttributes.put(pKey, pValue);
@@ -105,7 +107,8 @@ public class Extension {
* Sets the Class that implement the concret extension and is only used until
* model creation at system start up.
*
- * @param extensionClazz The extensionClasname to set
+ * @param extensionClazz
+ * The extensionClasname to set
*/
public void setClazz(String extensionClazz) {
fClazz = extensionClazz;
@@ -115,7 +118,8 @@ public class Extension {
* Sets the unique extension Id and is only used until model creation at
* system start up.
*
- * @param extensionID The extensionID to set
+ * @param extensionID
+ * The extensionID to set
*/
public void setId(String extensionID) {
fId = extensionID;
@@ -147,10 +151,10 @@ public class Extension {
// The same is in PluginRepository.getPluginInstance().
// Suggested by Stefan Groschupf <sg...@media-style.com>
synchronized (getId()) {
- try {
+ try {
PluginRepository pluginRepository = PluginRepository.get(conf);
- Class extensionClazz =
- pluginRepository.getCachedClass(fDescriptor, getClazz());
+ Class extensionClazz = pluginRepository.getCachedClass(fDescriptor,
+ getClazz());
// lazy loading of Plugin in case there is no instance of the plugin
// already.
pluginRepository.getPluginInstance(getDescriptor());
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java Fri Jan 9 06:34:33 2015
@@ -15,6 +15,7 @@
* limitations under the License.
*/
package org.apache.nutch.plugin;
+
import java.util.ArrayList;
/**
@@ -76,7 +77,8 @@ public class ExtensionPoint {
/**
* Sets the extensionPointId.
*
- * @param pId extension point id
+ * @param pId
+ * extension point id
*/
private void setId(String pId) {
ftId = pId;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java Fri Jan 9 06:34:33 2015
@@ -17,8 +17,8 @@
package org.apache.nutch.plugin;
/**
- * <code>MissingDependencyException</code> will be thrown if a plugin
- * dependency cannot be found.
+ * <code>MissingDependencyException</code> will be thrown if a plugin dependency
+ * cannot be found.
*
* @author Jérôme Charron
*/
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java Fri Jan 9 06:34:33 2015
@@ -17,15 +17,14 @@
package org.apache.nutch.plugin;
/**
- * Defines the capability of a class to be plugged into Nutch.
- * This is a common interface that must be implemented by all
- * Nutch Extension Points.
- *
+ * Defines the capability of a class to be plugged into Nutch. This is a common
+ * interface that must be implemented by all Nutch Extension Points.
+ *
* @author Jérôme Charron
- *
+ *
* @see <a href="http://wiki.apache.org/nutch/AboutPlugins">About Plugins</a>
- * @see <a href="package-summary.html#package_description">
- * plugin package description</a>
+ * @see <a href="package-summary.html#package_description"> plugin package
+ * description</a>
*/
public interface Pluggable {
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java Fri Jan 9 06:34:33 2015
@@ -33,8 +33,8 @@ import org.apache.hadoop.conf.Configurat
* The <code>Plugin</code> will be startuped and shutdown by the nutch plugin
* management system.
*
- * A possible usecase of the <code>Plugin</code> implementation is to create
- * or close a database connection.
+ * A possible usecase of the <code>Plugin</code> implementation is to create or
+ * close a database connection.
*
* @author joa23
*/
@@ -81,7 +81,8 @@ public class Plugin {
}
/**
- * @param descriptor The descriptor to set
+ * @param descriptor
+ * The descriptor to set
*/
private void setDescriptor(PluginDescriptor descriptor) {
fDescriptor = descriptor;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java Fri Jan 9 06:34:33 2015
@@ -45,11 +45,11 @@ public class PluginClassLoader extends U
*/
public PluginClassLoader(URL[] urls, ClassLoader parent) {
super(urls, parent);
-
+
this.urls = urls;
this.parent = parent;
}
-
+
@Override
public int hashCode() {
final int PRIME = 31;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java Fri Jan 9 06:34:33 2015
@@ -30,12 +30,11 @@ import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
/**
- * The <code>PluginDescriptor</code> provide access to all meta information of
- * a nutch-plugin, as well to the internationalizable resources and the plugin
- * own classloader. There are meta information about <code>Plugin</code>,
- * <code>ExtensionPoint</code> and <code>Extension</code>. To provide
- * access to the meta data of a plugin via a descriptor allow a lazy loading
- * mechanism.
+ * The <code>PluginDescriptor</code> provide access to all meta information of a
+ * nutch-plugin, as well to the internationalizable resources and the plugin own
+ * classloader. There are meta information about <code>Plugin</code>,
+ * <code>ExtensionPoint</code> and <code>Extension</code>. To provide access to
+ * the meta data of a plugin via a descriptor allow a lazy loading mechanism.
*/
public class PluginDescriptor {
private String fPluginPath;
@@ -51,7 +50,8 @@ public class PluginDescriptor {
private ArrayList<URL> fNotExportedLibs = new ArrayList<URL>();
private ArrayList<Extension> fExtensions = new ArrayList<Extension>();
private PluginClassLoader fClassLoader;
- public static final Logger LOG = LoggerFactory.getLogger(PluginDescriptor.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(PluginDescriptor.class);
private Configuration fConf;
/**
@@ -204,7 +204,8 @@ public class PluginDescriptor {
/**
* Adds a dependency
*
- * @param pId id of the dependent plugin
+ * @param pId
+ * id of the dependent plugin
*/
public void addDependency(String pId) {
fDependencies.add(pId);
@@ -217,7 +218,8 @@ public class PluginDescriptor {
*/
public void addExportedLibRelative(String pLibPath)
throws MalformedURLException {
- URL url = new File(getPluginPath() + File.separator + pLibPath).toURI().toURL();
+ URL url = new File(getPluginPath() + File.separator + pLibPath).toURI()
+ .toURL();
fExportedLibs.add(url);
}
@@ -246,7 +248,8 @@ public class PluginDescriptor {
*/
public void addNotExportedLibRelative(String pLibPath)
throws MalformedURLException {
- URL url = new File(getPluginPath() + File.separator + pLibPath).toURI().toURL();
+ URL url = new File(getPluginPath() + File.separator + pLibPath).toURI()
+ .toURL();
fNotExportedLibs.add(url);
}
@@ -283,8 +286,8 @@ public class PluginDescriptor {
LOG.debug(getPluginId() + " " + e.toString());
}
URL[] urls = arrayList.toArray(new URL[arrayList.size()]);
- fClassLoader = new PluginClassLoader(urls, PluginDescriptor.class
- .getClassLoader());
+ fClassLoader = new PluginClassLoader(urls,
+ PluginDescriptor.class.getClassLoader());
return fClassLoader;
}
@@ -306,7 +309,7 @@ public class PluginDescriptor {
for (String id : pDescriptor.getDependencies()) {
PluginDescriptor descriptor = PluginRepository.get(fConf)
.getPluginDescriptor(id);
- for (URL url: descriptor.getExportedLibUrls()) {
+ for (URL url : descriptor.getExportedLibUrls()) {
pLibs.add(url);
}
collectLibs(pLibs, descriptor);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java Fri Jan 9 06:34:33 2015
@@ -39,8 +39,8 @@ import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
- * The <code>PluginManifestParser</code> parser just parse the manifest file
- * in all plugin directories.
+ * The <code>PluginManifestParser</code> parser just parse the manifest file in
+ * all plugin directories.
*
* @author joa23
*/
@@ -93,7 +93,8 @@ public class PluginManifestParser {
PluginDescriptor p = parseManifestFile(manifestPath);
map.put(p.getPluginId(), p);
} catch (Exception e) {
- LOG.warn("Error while loading plugin `" + manifestPath + "` " + e.toString());
+ LOG.warn("Error while loading plugin `" + manifestPath + "` "
+ + e.toString());
}
}
}
@@ -182,7 +183,7 @@ public class PluginManifestParser {
PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name,
providerName, pluginClazz, pPath, this.conf);
LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version
- + " provider=" + providerName + "class=" + pluginClazz);
+ + " provider=" + providerName + "class=" + pluginClazz);
parseExtension(rootElement, pluginDescriptor);
parseExtensionPoints(rootElement, pluginDescriptor);
parseLibraries(rootElement, pluginDescriptor);
@@ -289,8 +290,8 @@ public class PluginManifestParser {
if (parameters != null) {
for (int k = 0; k < parameters.getLength(); k++) {
Element param = (Element) parameters.item(k);
- extension.addAttribute(param.getAttribute(ATTR_NAME), param
- .getAttribute("value"));
+ extension.addAttribute(param.getAttribute(ATTR_NAME),
+ param.getAttribute("value"));
}
}
pPluginDescriptor.addExtension(extension);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java Fri Jan 9 06:34:33 2015
@@ -50,13 +50,13 @@ public class PluginRepository {
private HashMap<String, ExtensionPoint> fExtensionPoints;
private HashMap<String, Plugin> fActivatedPlugins;
-
- private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE =
- new HashMap<String, Map<PluginClassLoader,Class>>();
+
+ private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE = new HashMap<String, Map<PluginClassLoader, Class>>();
private Configuration conf;
- public static final Logger LOG = LoggerFactory.getLogger(PluginRepository.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(PluginRepository.class);
/**
* @throws PluginRuntimeException
@@ -68,7 +68,8 @@ public class PluginRepository {
this.conf = new Configuration(conf);
this.auto = conf.getBoolean("plugin.auto-activation", true);
String[] pluginFolders = conf.getStrings("plugin.folders");
- PluginManifestParser manifestParser = new PluginManifestParser(this.conf, this);
+ PluginManifestParser manifestParser = new PluginManifestParser(this.conf,
+ this);
Map<String, PluginDescriptor> allPlugins = manifestParser
.parsePluginFolder(pluginFolders);
if (allPlugins.isEmpty()) {
@@ -85,7 +86,7 @@ public class PluginRepository {
try {
installExtensions(fRegisteredPlugins);
} catch (PluginRuntimeException e) {
- LOG.error(e.toString());
+ LOG.error(e.toString());
throw new RuntimeException(e.getMessage());
}
displayStatus();
@@ -112,8 +113,8 @@ public class PluginRepository {
return;
}
- for (PluginDescriptor plugin: plugins) {
- for(ExtensionPoint point:plugin.getExtenstionPoints()) {
+ for (PluginDescriptor plugin : plugins) {
+ for (ExtensionPoint point : plugin.getExtenstionPoints()) {
String xpId = point.getId();
LOG.debug("Adding extension point " + xpId);
fExtensionPoints.put(xpId, point);
@@ -128,7 +129,7 @@ public class PluginRepository {
throws PluginRuntimeException {
for (PluginDescriptor descriptor : pRegisteredPlugins) {
- for(Extension extension:descriptor.getExtensions()) {
+ for (Extension extension : descriptor.getExtensions()) {
String xpId = extension.getTargetPoint();
ExtensionPoint point = getExtensionPoint(xpId);
if (point == null) {
@@ -156,7 +157,7 @@ public class PluginRepository {
branch.put(plugin.getPluginId(), plugin);
// Otherwise, checks each dependency
- for(String id:plugin.getDependencies()) {
+ for (String id : plugin.getDependencies()) {
PluginDescriptor dependency = plugins.get(id);
if (dependency == null) {
throw new MissingDependencyException("Missing dependency " + id
@@ -271,7 +272,8 @@ public class PluginRepository {
// The same is in Extension.getExtensionInstance().
// Suggested by Stefan Groschupf <sg...@media-style.com>
synchronized (pDescriptor) {
- Class<?> pluginClass = getCachedClass(pDescriptor, pDescriptor.getPluginClass());
+ Class<?> pluginClass = getCachedClass(pDescriptor,
+ pDescriptor.getPluginClass());
Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] {
PluginDescriptor.class, Configuration.class });
Plugin plugin = (Plugin) constructor.newInstance(new Object[] {
@@ -312,9 +314,9 @@ public class PluginRepository {
plugin.shutDown();
}
}
-
+
public Class getCachedClass(PluginDescriptor pDescriptor, String className)
- throws ClassNotFoundException {
+ throws ClassNotFoundException {
Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className);
if (descMap == null) {
descMap = new HashMap<PluginClassLoader, Class>();
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java Fri Jan 9 06:34:33 2015
@@ -16,6 +16,7 @@
* limitations under the License.
*/
package org.apache.nutch.plugin;
+
/**
* <code>PluginRuntimeException</code> will be thrown until a exception in the
* plugin managemnt occurs.
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java Fri Jan 9 06:34:33 2015
@@ -41,7 +41,7 @@ import org.apache.nutch.metadata.Metadat
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
-public final class Content implements Writable{
+public final class Content implements Writable {
public static final String DIR_NAME = "content";
@@ -85,7 +85,7 @@ public final class Content implements Wr
this.mimeTypes = new MimeUtil(conf);
this.contentType = getContentType(contentType, url, content);
}
-
+
public Content(String url, String base, byte[] content, String contentType,
Metadata metadata, MimeUtil mimeTypes) {
@@ -141,11 +141,11 @@ public final class Content implements Wr
metadata.readFields(in); // read meta data
break;
default:
- throw new VersionMismatchException((byte)2, oldVersion);
+ throw new VersionMismatchException((byte) 2, oldVersion);
}
}
-
+
public final void readFields(DataInput in) throws IOException {
metadata.clear();
int sizeOrVersion = in.readInt();
@@ -163,14 +163,14 @@ public final class Content implements Wr
metadata.readFields(in);
break;
default:
- throw new VersionMismatchException((byte)VERSION, (byte)version);
+ throw new VersionMismatchException((byte) VERSION, (byte) version);
}
} else { // size
byte[] compressed = new byte[sizeOrVersion];
in.readFully(compressed, 0, compressed.length);
ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
- DataInput inflater =
- new DataInputStream(new InflaterInputStream(deflated));
+ DataInput inflater = new DataInputStream(
+ new InflaterInputStream(deflated));
readFieldsCompressed(inflater);
}
}
@@ -204,8 +204,9 @@ public final class Content implements Wr
return url;
}
- /** The base url for relative links contained in the content.
- * Maybe be different from url if the request redirected.
+ /**
+ * The base url for relative links contained in the content. May be
+ * different from url if the request redirected.
*/
public String getBaseUrl() {
return base;
@@ -220,7 +221,9 @@ public final class Content implements Wr
this.content = content;
}
- /** The media type of the retrieved content.
+ /**
+ * The media type of the retrieved content.
+ *
* @see <a href="http://www.iana.org/assignments/media-types/">
* http://www.iana.org/assignments/media-types/</a>
*/
@@ -276,9 +279,9 @@ public final class Content implements Wr
System.out.println("usage:" + usage);
return;
}
-
- GenericOptionsParser optParser =
- new GenericOptionsParser(NutchConfiguration.create(), args);
+
+ GenericOptionsParser optParser = new GenericOptionsParser(
+ NutchConfiguration.create(), args);
String[] argv = optParser.getRemainingArgs();
Configuration conf = optParser.getConfiguration();
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java Fri Jan 9 06:34:33 2015
@@ -25,7 +25,7 @@ import org.apache.nutch.storage.WebPage;
import crawlercommons.robots.BaseRobotRules;
-/** A retriever of url content. Implemented by protocol extensions. */
+/** A retriever of url content. Implemented by protocol extensions. */
public interface Protocol extends FieldPluggable, Configurable {
/** The name of the extension point. */
public final static String X_POINT_ID = Protocol.class.getName();
@@ -55,7 +55,9 @@ public interface Protocol extends FieldP
/**
* Retrieve robot rules applicable for this url.
- * @param url url to check
+ *
+ * @param url
+ * url to check
* @param page
* @return robot rules (specific for this url or default), never null
*/
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java Fri Jan 9 06:34:33 2015
@@ -22,7 +22,7 @@ public class ProtocolNotFound extends Pr
private String url;
public ProtocolNotFound(String url) {
- this(url, "protocol not found for url="+url);
+ this(url, "protocol not found for url=" + url);
}
public ProtocolNotFound(String url, String message) {
@@ -30,5 +30,7 @@ public class ProtocolNotFound extends Pr
this.url = url;
}
- public String getUrl() { return url; }
+ public String getUrl() {
+ return url;
+ }
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java Fri Jan 9 06:34:33 2015
@@ -17,10 +17,10 @@
package org.apache.nutch.protocol;
-
/**
- * Simple aggregate to pass from protocol plugins both content and
- * protocol status.
+ * Simple aggregate to pass from protocol plugins both content and protocol
+ * status.
+ *
* @author Andrzej Bialecki <ab@getopt.org>
*/
public class ProtocolOutput {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java Fri Jan 9 06:34:33 2015
@@ -19,38 +19,42 @@ package org.apache.nutch.protocol;
public interface ProtocolStatusCodes {
/** Content was retrieved without errors. */
- public static final int SUCCESS = 1;
+ public static final int SUCCESS = 1;
/** Content was not retrieved. Any further errors may be indicated in args. */
- public static final int FAILED = 2;
+ public static final int FAILED = 2;
- /** This protocol was not found. Application may attempt to retry later. */
- public static final int PROTO_NOT_FOUND = 10;
+ /** This protocol was not found. Application may attempt to retry later. */
+ public static final int PROTO_NOT_FOUND = 10;
/** Resource is gone. */
- public static final int GONE = 11;
+ public static final int GONE = 11;
/** Resource has moved permanently. New url should be found in args. */
- public static final int MOVED = 12;
+ public static final int MOVED = 12;
/** Resource has moved temporarily. New url should be found in args. */
- public static final int TEMP_MOVED = 13;
+ public static final int TEMP_MOVED = 13;
/** Resource was not found. */
- public static final int NOTFOUND = 14;
+ public static final int NOTFOUND = 14;
/** Temporary failure. Application may retry immediately. */
- public static final int RETRY = 15;
- /** Unspecified exception occured. Further information may be provided in args. */
- public static final int EXCEPTION = 16;
+ public static final int RETRY = 15;
+ /**
+ * Unspecified exception occurred. Further information may be provided in args.
+ */
+ public static final int EXCEPTION = 16;
/** Access denied - authorization required, but missing/incorrect. */
- public static final int ACCESS_DENIED = 17;
+ public static final int ACCESS_DENIED = 17;
/** Access denied by robots.txt rules. */
- public static final int ROBOTS_DENIED = 18;
+ public static final int ROBOTS_DENIED = 18;
/** Too many redirects. */
- public static final int REDIR_EXCEEDED = 19;
+ public static final int REDIR_EXCEEDED = 19;
/** Not fetching. */
- public static final int NOTFETCHING = 20;
+ public static final int NOTFETCHING = 20;
/** Unchanged since the last fetch. */
- public static final int NOTMODIFIED = 21;
- /** Request was refused by protocol plugins, because it would block.
- * The expected number of milliseconds to wait before retry may be provided
- * in args. */
- public static final int WOULDBLOCK = 22;
+ public static final int NOTMODIFIED = 21;
+ /**
+ * Request was refused by protocol plugins, because it would block. The
+ * expected number of milliseconds to wait before retry may be provided in
+ * args.
+ */
+ public static final int WOULDBLOCK = 22;
/** Thread was blocked http.max.delays times during fetching. */
- public static final int BLOCKED = 23;
+ public static final int BLOCKED = 23;
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java Fri Jan 9 06:34:33 2015
@@ -100,7 +100,7 @@ public class ProtocolStatusUtils impleme
}
return TableUtil.toString(args.iterator().next());
}
-
+
public static String toString(ProtocolStatus status) {
if (status == null) {
return "(null)";
@@ -113,7 +113,8 @@ public class ProtocolStatusUtils impleme
int i = 0;
Iterator<CharSequence> it = args.iterator();
while (it.hasNext()) {
- if (i > 0) sb.append(',');
+ if (i > 0)
+ sb.append(',');
sb.append(it.next());
i++;
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java Fri Jan 9 06:34:33 2015
@@ -35,9 +35,8 @@ public interface RobotRules {
public long getCrawlDelay();
/**
- * Returns <code>false</code> if the <code>robots.txt</code> file
- * prohibits us from accessing the given <code>url</code>, or
- * <code>true</code> otherwise.
+ * Returns <code>false</code> if the <code>robots.txt</code> file prohibits us
+ * from accessing the given <code>url</code>, or <code>true</code> otherwise.
*/
public boolean isAllowed(URL url);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Jan 9 06:34:33 2015
@@ -43,35 +43,38 @@ import crawlercommons.robots.SimpleRobot
import crawlercommons.robots.SimpleRobotRulesParser;
/**
- * This class uses crawler-commons for handling the parsing of {@code robots.txt} files.
- * It emits SimpleRobotRules objects, which describe the download permissions
- * as described in SimpleRobotRulesParser.
+ * This class uses crawler-commons for handling the parsing of
+ * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe
+ * the download permissions as described in SimpleRobotRulesParser.
*/
public abstract class RobotRulesParser implements Configurable {
- public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(RobotRulesParser.class);
- protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules> ();
+ protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules>();
/**
- * A {@link BaseRobotRules} object appropriate for use
- * when the {@code robots.txt} file is empty or missing;
- * all requests are allowed.
+ * A {@link BaseRobotRules} object appropriate for use when the
+ * {@code robots.txt} file is empty or missing; all requests are allowed.
*/
- public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
+ public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(
+ RobotRulesMode.ALLOW_ALL);
/**
- * A {@link BaseRobotRules} object appropriate for use when the
- * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
- * response; all requests are disallowed.
+ * A {@link BaseRobotRules} object appropriate for use when the
+ * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
+ * response; all requests are disallowed.
*/
- public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
+ public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(
+ RobotRulesMode.ALLOW_NONE);
private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
private Configuration conf;
protected String agentNames;
- public RobotRulesParser() { }
+ public RobotRulesParser() {
+ }
public RobotRulesParser(Configuration conf) {
setConf(conf);
@@ -90,16 +93,18 @@ public abstract class RobotRulesParser i
}
agentNames = agentName;
- // If there are any other agents specified, append those to the list of agents
+ // If there are any other agents specified, append those to the list of
+ // agents
String otherAgents = conf.get("http.robots.agents");
- if(otherAgents != null && !otherAgents.trim().isEmpty()) {
+ if (otherAgents != null && !otherAgents.trim().isEmpty()) {
StringTokenizer tok = new StringTokenizer(otherAgents, ",");
StringBuilder sb = new StringBuilder(agentNames);
while (tok.hasMoreTokens()) {
String str = tok.nextToken().trim();
if (str.equals("*") || str.equals(agentName)) {
// skip wildcard "*" or agent name itself
- // (required for backward compatibility, cf. NUTCH-1715 and NUTCH-1718)
+ // (required for backward compatibility, cf. NUTCH-1715 and
+ // NUTCH-1718)
} else {
sb.append(",").append(str);
}
@@ -117,16 +122,23 @@ public abstract class RobotRulesParser i
}
/**
- * Parses the robots content using the {@link SimpleRobotRulesParser} from crawler commons
- *
- * @param url A string containing url
- * @param content Contents of the robots file in a byte array
- * @param contentType The content type of the robots file
- * @param robotName A string containing all the robots agent names used by parser for matching
- * @return BaseRobotRules object
+ * Parses the robots content using the {@link SimpleRobotRulesParser} from
+ * crawler commons
+ *
+ * @param url
+ * A string containing url
+ * @param content
+ * Contents of the robots file in a byte array
+ * @param contentType
+ * The content type of the robots file
+ * @param robotName
+ * A string containing all the robots agent names used by parser for
+ * matching
+ * @return BaseRobotRules object
*/
- public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) {
- return robotParser.parseContent(url, content, contentType, robotName);
+ public BaseRobotRules parseRules(String url, byte[] content,
+ String contentType, String robotName) {
+ return robotParser.parseContent(url, content, contentType, robotName);
}
public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
@@ -145,23 +157,29 @@ public abstract class RobotRulesParser i
public static void main(String[] argv) {
if (argv.length != 3) {
- System.err.println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
- System.err.println(" <robots-file> - Input robots.txt file which will be parsed.");
- System.err.println(" <url-file> - Contains input URLs (1 per line) which are tested against the rules.");
- System.err.println(" <agent-names> - Input agent names. Multiple agent names can be provided using");
- System.err.println(" comma as a delimiter without any spaces.");
+ System.err
+ .println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
+ System.err
+ .println(" <robots-file> - Input robots.txt file which will be parsed.");
+ System.err
+ .println(" <url-file> - Contains input URLs (1 per line) which are tested against the rules.");
+ System.err
+ .println(" <agent-names> - Input agent names. Multiple agent names can be provided using");
+ System.err
+ .println(" comma as a delimiter without any spaces.");
System.exit(-1);
}
try {
byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
- BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]);
+ BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
+ "text/plain", argv[2]);
LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
String testPath = testsIn.readLine().trim();
while (testPath != null) {
- System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") +
- ":\t" + testPath);
+ System.out.println((rules.isAllowed(testPath) ? "allowed"
+ : "not allowed") + ":\t" + testPath);
testPath = testsIn.readLine();
}
testsIn.close();
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java Fri Jan 9 06:34:33 2015
@@ -20,3 +20,4 @@
* see also {@link org.apache.nutch.net.protocols}.
*/
package org.apache.nutch.protocol;
+