You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC
svn commit: r1650447 [12/25] - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/
src/java/org/apache/nutch/api/impl/db/
src/java/org/apache/nutch/api/model/response/
src/java/org/apache/nutch/api/resources/ s...
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java Fri Jan 9 06:34:33 2015
@@ -35,27 +35,26 @@ import java.util.List;
/**
* A simple class for detecting character encodings.
- *
+ *
* <p>
* Broadly this encompasses two functions, which are distinctly separate:
- *
+ *
* <ol>
- * <li>Auto detecting a set of "clues" from input text.</li>
- * <li>Taking a set of clues and making a "best guess" as to the
- * "real" encoding.</li>
+ * <li>Auto detecting a set of "clues" from input text.</li>
+ * <li>Taking a set of clues and making a "best guess" as to the "real"
+ * encoding.</li>
* </ol>
* </p>
- *
+ *
* <p>
- * A caller will often have some extra information about what the
- * encoding might be (e.g. from the HTTP header or HTML meta-tags, often
- * wrong but still potentially useful clues). The types of clues may differ
- * from caller to caller. Thus a typical calling sequence is:
+ * A caller will often have some extra information about what the encoding might
+ * be (e.g. from the HTTP header or HTML meta-tags, often wrong but still
+ * potentially useful clues). The types of clues may differ from caller to
+ * caller. Thus a typical calling sequence is:
* <ul>
- * <li>Run step (1) to generate a set of auto-detected clues;</li>
- * <li>Combine these clues with the caller-dependent "extra clues"
- * available;</li>
- * <li>Run step (2) to guess what the most probable answer is.</li>
+ * <li>Run step (1) to generate a set of auto-detected clues;</li>
+ * <li>Combine these clues with the caller-dependent "extra clues" available;</li>
+ * <li>Run step (2) to guess what the most probable answer is.</li>
* </p>
*/
public class EncodingDetector {
@@ -90,34 +89,32 @@ public class EncodingDetector {
@Override
public String toString() {
- return value + " (" + source +
- ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
+ return value + " (" + source
+ + ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
}
public boolean isEmpty() {
- return (value==null || "".equals(value));
+ return (value == null || "".equals(value));
}
public boolean meetsThreshold() {
- return (confidence < 0 ||
- (minConfidence >= 0 && confidence >= minConfidence));
+ return (confidence < 0 || (minConfidence >= 0 && confidence >= minConfidence));
}
}
- public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(EncodingDetector.class);
public static final int NO_THRESHOLD = -1;
- public static final String MIN_CONFIDENCE_KEY =
- "encodingdetector.charset.min.confidence";
+ public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";
- private static final HashMap<String, String> ALIASES =
- new HashMap<String, String>();
+ private static final HashMap<String, String> ALIASES = new HashMap<String, String>();
private static final HashSet<String> DETECTABLES = new HashSet<String>();
// CharsetDetector will die without a minimum amount of data.
- private static final int MIN_LENGTH=4;
+ private static final int MIN_LENGTH = 4;
static {
DETECTABLES.add("text/html");
@@ -130,23 +127,22 @@ public class EncodingDetector {
DETECTABLES.add("application/rss+xml");
DETECTABLES.add("application/xhtml+xml");
/*
- * the following map is not an alias mapping table, but
- * maps character encodings which are often used in mislabelled
- * documents to their correct encodings. For instance,
- * there are a lot of documents labelled 'ISO-8859-1' which contain
- * characters not covered by ISO-8859-1 but covered by windows-1252.
- * Because windows-1252 is a superset of ISO-8859-1 (sharing code points
- * for the common part), it's better to treat ISO-8859-1 as
- * synonymous with windows-1252 than to reject, as invalid, documents
- * labelled as ISO-8859-1 that have characters outside ISO-8859-1.
+ * the following map is not an alias mapping table, but maps character
+ * encodings which are often used in mislabelled documents to their correct
+ * encodings. For instance, there are a lot of documents labelled
+ * 'ISO-8859-1' which contain characters not covered by ISO-8859-1 but
+ * covered by windows-1252. Because windows-1252 is a superset of ISO-8859-1
+ * (sharing code points for the common part), it's better to treat
+ * ISO-8859-1 as synonymous with windows-1252 than to reject, as invalid,
+ * documents labelled as ISO-8859-1 that have characters outside ISO-8859-1.
*/
ALIASES.put("ISO-8859-1", "windows-1252");
ALIASES.put("EUC-KR", "x-windows-949");
ALIASES.put("x-EUC-CN", "GB18030");
ALIASES.put("GBK", "GB18030");
- //ALIASES.put("Big5", "Big5HKSCS");
- //ALIASES.put("TIS620", "Cp874");
- //ALIASES.put("ISO-8859-11", "Cp874");
+ // ALIASES.put("Big5", "Big5HKSCS");
+ // ALIASES.put("TIS620", "Cp874");
+ // ALIASES.put("ISO-8859-11", "Cp874");
}
@@ -164,16 +160,16 @@ public class EncodingDetector {
public void autoDetectClues(WebPage page, boolean filter) {
autoDetectClues(page.getContent(), page.getContentType(),
- parseCharacterEncoding(page.getHeaders().get(CONTENT_TYPE_UTF8)), filter);
+ parseCharacterEncoding(page.getHeaders().get(CONTENT_TYPE_UTF8)),
+ filter);
}
private void autoDetectClues(ByteBuffer dataBuffer, CharSequence typeUtf8,
- String encoding, boolean filter) {
+ String encoding, boolean filter) {
int length = dataBuffer.remaining();
String type = TableUtil.toString(typeUtf8);
- if (minConfidence >= 0 && DETECTABLES.contains(type)
- && length > MIN_LENGTH) {
+ if (minConfidence >= 0 && DETECTABLES.contains(type) && length > MIN_LENGTH) {
CharsetMatch[] matches = null;
// do all these in a try/catch; setText and detect/detectAll
@@ -214,12 +210,14 @@ public class EncodingDetector {
/**
* Guess the encoding with the previously specified list of clues.
- *
- * @param row URL's row
- * @param defaultValue Default encoding to return if no encoding can be
- * detected with enough confidence. Note that this will <b>not</b> be
- * normalized with {@link EncodingDetector#resolveEncodingAlias}
- *
+ *
+ * @param page
+ * URL's row
+ * @param defaultValue
+ * Default encoding to return if no encoding can be detected with
+ * enough confidence. Note that this will <b>not</b> be normalized
+ * with {@link EncodingDetector#resolveEncodingAlias}
+ *
* @return Guessed encoding or defaultValue
*/
public String guessEncoding(WebPage page, String defaultValue) {
@@ -230,33 +228,33 @@ public class EncodingDetector {
/**
* Guess the encoding with the previously specified list of clues.
- *
- * @param baseUrl Base URL
- * @param defaultValue Default encoding to return if no encoding can be
- * detected with enough confidence. Note that this will <b>not</b> be
- * normalized with {@link EncodingDetector#resolveEncodingAlias}
- *
+ *
+ * @param baseUrl
+ * Base URL
+ * @param defaultValue
+ * Default encoding to return if no encoding can be detected with
+ * enough confidence. Note that this will <b>not</b> be normalized
+ * with {@link EncodingDetector#resolveEncodingAlias}
+ *
* @return Guessed encoding or defaultValue
*/
private String guessEncoding(String baseUrl, String defaultValue) {
/*
- * This algorithm could be replaced by something more sophisticated;
- * ideally we would gather a bunch of data on where various clues
- * (autodetect, HTTP headers, HTML meta tags, etc.) disagree, tag each with
- * the correct answer, and use machine learning/some statistical method
- * to generate a better heuristic.
+ * This algorithm could be replaced by something more sophisticated; ideally
+ * we would gather a bunch of data on where various clues (autodetect, HTTP
+ * headers, HTML meta tags, etc.) disagree, tag each with the correct
+ * answer, and use machine learning/some statistical method to generate a
+ * better heuristic.
*/
-
if (LOG.isTraceEnabled()) {
findDisagreements(baseUrl, clues);
}
/*
- * Go down the list of encoding "clues". Use a clue if:
- * 1. Has a confidence value which meets our confidence threshold, OR
- * 2. Doesn't meet the threshold, but is the best try,
- * since nothing else is available.
+ * Go down the list of encoding "clues". Use a clue if it (1) has a
+ * confidence value which meets our confidence threshold, OR (2) doesn't
+ * meet the threshold, but is the best try, since nothing else is available.
*/
EncodingClue defaultClue = new EncodingClue(defaultValue, "default");
EncodingClue bestClue = defaultClue;
@@ -268,8 +266,8 @@ public class EncodingDetector {
String charset = clue.value;
if (minConfidence >= 0 && clue.confidence >= minConfidence) {
if (LOG.isTraceEnabled()) {
- LOG.trace(baseUrl + ": Choosing encoding: " + charset +
- " with confidence " + clue.confidence);
+ LOG.trace(baseUrl + ": Choosing encoding: " + charset
+ + " with confidence " + clue.confidence);
}
return resolveEncodingAlias(charset).toLowerCase();
} else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
@@ -289,10 +287,10 @@ public class EncodingDetector {
}
/*
- * Strictly for analysis, look for "disagreements." The top guess from
- * each source is examined; if these meet the threshold and disagree, then
- * we log the information -- useful for testing or generating training data
- * for a better heuristic.
+ * Strictly for analysis, look for "disagreements." The top guess from each
+ * source is examined; if these meet the threshold and disagree, then we log
+ * the information -- useful for testing or generating training data for a
+ * better heuristic.
*/
private void findDisagreements(String url, List<EncodingClue> newClues) {
HashSet<String> valsSeen = new HashSet<String>();
@@ -314,9 +312,9 @@ public class EncodingDetector {
if (disagreement) {
// dump all values in case of disagreement
StringBuffer sb = new StringBuffer();
- sb.append("Disagreement: "+url+"; ");
+ sb.append("Disagreement: " + url + "; ");
for (int i = 0; i < newClues.size(); i++) {
- if (i>0) {
+ if (i > 0) {
sb.append(", ");
}
sb.append(newClues.get(i));
@@ -331,7 +329,7 @@ public class EncodingDetector {
return null;
String canonicalName = new String(Charset.forName(encoding).name());
return ALIASES.containsKey(canonicalName) ? ALIASES.get(canonicalName)
- : canonicalName;
+ : canonicalName;
} catch (Exception e) {
LOG.warn("Invalid encoding " + encoding + " detected, using default.");
return null;
@@ -339,13 +337,12 @@ public class EncodingDetector {
}
/**
- * Parse the character encoding from the specified content type header.
- * If the content type is null, or there is no explicit character encoding,
- * <code>null</code> is returned.
- * <br />
- * This method was copied from org.apache.catalina.util.RequestUtil,
- * which is licensed under the Apache License, Version 2.0 (the "License").
- *
+ * Parse the character encoding from the specified content type header. If the
+ * content type is null, or there is no explicit character encoding,
+ * <code>null</code> is returned. <br />
+ * This method was copied from org.apache.catalina.util.RequestUtil, which is
+ * licensed under the Apache License, Version 2.0 (the "License").
+ *
* @param contentTypeUtf8
*/
public static String parseCharacterEncoding(CharSequence contentTypeUtf8) {
@@ -361,51 +358,36 @@ public class EncodingDetector {
encoding = encoding.substring(0, end);
encoding = encoding.trim();
if ((encoding.length() > 2) && (encoding.startsWith("\""))
- && (encoding.endsWith("\"")))
+ && (encoding.endsWith("\"")))
encoding = encoding.substring(1, encoding.length() - 1);
return (encoding.trim());
}
- /*public static void main(String[] args) throws IOException {
- if (args.length != 1) {
- System.err.println("Usage: EncodingDetector <file>");
- System.exit(1);
- }
-
- Configuration conf = NutchConfiguration.create();
- EncodingDetector detector =
- new EncodingDetector(NutchConfiguration.create());
-
- // do everything as bytes; don't want any conversion
- BufferedInputStream istr =
- new BufferedInputStream(new FileInputStream(args[0]));
- ByteArrayOutputStream ostr = new ByteArrayOutputStream();
- byte[] bytes = new byte[1000];
- boolean more = true;
- while (more) {
- int len = istr.read(bytes);
- if (len < bytes.length) {
- more = false;
- if (len > 0) {
- ostr.write(bytes, 0, len);
- }
- } else {
- ostr.write(bytes);
- }
- }
-
- byte[] data = ostr.toByteArray();
- MimeUtil mimeTypes = new MimeUtil(conf);
-
- // make a fake Content
- Content content =
- new Content("", "", data, "text/html", new Metadata(), mimeTypes);
-
- detector.autoDetectClues(content, true);
- String encoding = detector.guessEncoding(content,
- conf.get("parser.character.encoding.default"));
- System.out.println("Guessed encoding: " + encoding);
- }*/
+ /*
+ * public static void main(String[] args) throws IOException { if (args.length
+ * != 1) { System.err.println("Usage: EncodingDetector <file>");
+ * System.exit(1); }
+ *
+ * Configuration conf = NutchConfiguration.create(); EncodingDetector detector
+ * = new EncodingDetector(NutchConfiguration.create());
+ *
+ * // do everything as bytes; don't want any conversion BufferedInputStream
+ * istr = new BufferedInputStream(new FileInputStream(args[0]));
+ * ByteArrayOutputStream ostr = new ByteArrayOutputStream(); byte[] bytes =
+ * new byte[1000]; boolean more = true; while (more) { int len =
+ * istr.read(bytes); if (len < bytes.length) { more = false; if (len > 0) {
+ * ostr.write(bytes, 0, len); } } else { ostr.write(bytes); } }
+ *
+ * byte[] data = ostr.toByteArray(); MimeUtil mimeTypes = new MimeUtil(conf);
+ *
+ * // make a fake Content Content content = new Content("", "", data,
+ * "text/html", new Metadata(), mimeTypes);
+ *
+ * detector.autoDetectClues(content, true); String encoding =
+ * detector.guessEncoding(content,
+ * conf.get("parser.character.encoding.default"));
+ * System.out.println("Guessed encoding: " + encoding); }
+ */
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/FSUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/FSUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/FSUtils.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/FSUtils.java Fri Jan 9 06:34:33 2015
@@ -33,16 +33,20 @@ public class FSUtils {
* path. If removeOld is set to false then the old path will be set to the
* name current.old.
*
- * @param fs The FileSystem.
- * @param current The end path, the one being replaced.
- * @param replacement The path to replace with.
- * @param removeOld True if we are removing the current path.
+ * @param fs
+ * The FileSystem.
+ * @param current
+ * The end path, the one being replaced.
+ * @param replacement
+ * The path to replace with.
+ * @param removeOld
+ * True if we are removing the current path.
*
- * @throws IOException If an error occurs during replacement.
+ * @throws IOException
+ * If an error occurs during replacement.
*/
public static void replace(FileSystem fs, Path current, Path replacement,
- boolean removeOld)
- throws IOException {
+ boolean removeOld) throws IOException {
// rename any current path to old
Path old = new Path(current + ".old");
@@ -60,12 +64,14 @@ public class FSUtils {
/**
* Closes a group of SequenceFile readers.
*
- * @param readers The SequenceFile readers to close.
- * @throws IOException If an error occurs while closing a reader.
+ * @param readers
+ * The SequenceFile readers to close.
+ * @throws IOException
+ * If an error occurs while closing a reader.
*/
public static void closeReaders(SequenceFile.Reader[] readers)
- throws IOException {
-
+ throws IOException {
+
// loop through the readers, closing one by one
if (readers != null) {
for (int i = 0; i < readers.length; i++) {
@@ -80,12 +86,13 @@ public class FSUtils {
/**
* Closes a group of MapFile readers.
*
- * @param readers The MapFile readers to close.
- * @throws IOException If an error occurs while closing a reader.
+ * @param readers
+ * The MapFile readers to close.
+ * @throws IOException
+ * If an error occurs while closing a reader.
*/
- public static void closeReaders(MapFile.Reader[] readers)
- throws IOException {
-
+ public static void closeReaders(MapFile.Reader[] readers) throws IOException {
+
// loop through the readers closing one by one
if (readers != null) {
for (int i = 0; i < readers.length; i++) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/GZIPUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/GZIPUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/GZIPUtils.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/GZIPUtils.java Fri Jan 9 06:34:33 2015
@@ -28,19 +28,18 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * A collection of utility methods for working on GZIPed data.
+ * A collection of utility methods for working on GZIPed data.
*/
public class GZIPUtils {
-
+
private static final Logger LOG = LoggerFactory.getLogger(GZIPUtils.class);
- private static final int EXPECTED_COMPRESSION_RATIO= 5;
- private static final int BUF_SIZE= 4096;
+ private static final int EXPECTED_COMPRESSION_RATIO = 5;
+ private static final int BUF_SIZE = 4096;
/**
- * Returns an gunzipped copy of the input array. If the gzipped
- * input has been truncated or corrupted, a best-effort attempt is
- * made to unzip as much as possible. If no data can be extracted
- * <code>null</code> is returned.
+ * Returns a gunzipped copy of the input array. If the gzipped input has been
+ * truncated or corrupted, a best-effort attempt is made to unzip as much as
+ * possible. If no data can be extracted <code>null</code> is returned.
*/
public static final byte[] unzipBestEffort(byte[] in) {
return unzipBestEffort(in, Integer.MAX_VALUE);
@@ -48,33 +47,32 @@ public class GZIPUtils {
/**
* Returns an gunzipped copy of the input array, truncated to
- * <code>sizeLimit</code> bytes, if necessary. If the gzipped input
- * has been truncated or corrupted, a best-effort attempt is made to
- * unzip as much as possible. If no data can be extracted
- * <code>null</code> is returned.
+ * <code>sizeLimit</code> bytes, if necessary. If the gzipped input has been
+ * truncated or corrupted, a best-effort attempt is made to unzip as much as
+ * possible. If no data can be extracted <code>null</code> is returned.
*/
public static final byte[] unzipBestEffort(byte[] in, int sizeLimit) {
try {
- // decompress using GZIPInputStream
- ByteArrayOutputStream outStream =
- new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+ // decompress using GZIPInputStream
+ ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+ EXPECTED_COMPRESSION_RATIO * in.length);
- GZIPInputStream inStream =
- new GZIPInputStream ( new ByteArrayInputStream(in) );
+ GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(
+ in));
byte[] buf = new byte[BUF_SIZE];
int written = 0;
while (true) {
try {
int size = inStream.read(buf);
- if (size <= 0)
+ if (size <= 0)
break;
if ((written + size) > sizeLimit) {
outStream.write(buf, 0, sizeLimit - written);
break;
}
outStream.write(buf, 0, size);
- written+= size;
+ written += size;
} catch (Exception e) {
break;
}
@@ -91,23 +89,23 @@ public class GZIPUtils {
}
}
-
/**
- * Returns an gunzipped copy of the input array.
- * @throws IOException if the input cannot be properly decompressed
+ * Returns a gunzipped copy of the input array.
+ *
+ * @throws IOException
+ * if the input cannot be properly decompressed
*/
public static final byte[] unzip(byte[] in) throws IOException {
- // decompress using GZIPInputStream
- ByteArrayOutputStream outStream =
- new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+ // decompress using GZIPInputStream
+ ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+ EXPECTED_COMPRESSION_RATIO * in.length);
- GZIPInputStream inStream =
- new GZIPInputStream ( new ByteArrayInputStream(in) );
+ GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(in));
byte[] buf = new byte[BUF_SIZE];
while (true) {
int size = inStream.read(buf);
- if (size <= 0)
+ if (size <= 0)
break;
outStream.write(buf, 0, size);
}
@@ -121,11 +119,11 @@ public class GZIPUtils {
*/
public static final byte[] zip(byte[] in) {
try {
- // compress using GZIPOutputStream
- ByteArrayOutputStream byteOut=
- new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO);
+ // compress using GZIPOutputStream
+ ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length
+ / EXPECTED_COMPRESSION_RATIO);
- GZIPOutputStream outStream= new GZIPOutputStream(byteOut);
+ GZIPOutputStream outStream = new GZIPOutputStream(byteOut);
try {
outStream.write(in);
@@ -142,9 +140,9 @@ public class GZIPUtils {
return byteOut.toByteArray();
} catch (IOException e) {
- LOG.error("Failed with IOException", e);
+ LOG.error("Failed with IOException", e);
return null;
}
}
-
+
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/GenericWritableConfigurable.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Fri Jan 9 06:34:33 2015
@@ -24,12 +24,15 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.Writable;
-/** A generic Writable wrapper that can inject Configuration to {@link Configurable}s */
-public abstract class GenericWritableConfigurable extends GenericWritable
- implements Configurable {
+/**
+ * A generic Writable wrapper that can inject Configuration to
+ * {@link Configurable}s
+ */
+public abstract class GenericWritableConfigurable extends GenericWritable
+ implements Configurable {
private Configuration conf;
-
+
public Configuration getConf() {
return conf;
}
@@ -37,7 +40,7 @@ public abstract class GenericWritableCon
public void setConf(Configuration conf) {
this.conf = conf;
}
-
+
@Override
public void readFields(DataInput in) throws IOException {
byte type = in.readByte();
@@ -50,8 +53,8 @@ public abstract class GenericWritableCon
}
Writable w = get();
if (w instanceof Configurable)
- ((Configurable)w).setConf(conf);
+ ((Configurable) w).setConf(conf);
w.readFields(in);
}
-
+
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java Fri Jan 9 06:34:33 2015
@@ -25,48 +25,48 @@ import org.apache.hadoop.fs.PathFilter;
public class HadoopFSUtil {
- /**
- * Returns PathFilter that passes all paths through.
- */
- public static PathFilter getPassAllFilter() {
- return new PathFilter() {
- public boolean accept(Path arg0) {
- return true;
- }
- };
- }
+ /**
+ * Returns PathFilter that passes all paths through.
+ */
+ public static PathFilter getPassAllFilter() {
+ return new PathFilter() {
+ public boolean accept(Path arg0) {
+ return true;
+ }
+ };
+ }
+
+ /**
+ * Returns PathFilter that passes directories through.
+ */
+ public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
+ return new PathFilter() {
+ public boolean accept(final Path path) {
+ try {
+ return fs.getFileStatus(path).isDir();
+ } catch (IOException ioe) {
+ return false;
+ }
+ }
- /**
- * Returns PathFilter that passes directories through.
- */
- public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
- return new PathFilter() {
- public boolean accept(final Path path) {
- try {
- return fs.getFileStatus(path).isDir();
- } catch (IOException ioe) {
- return false;
- }
- }
+ };
+ }
- };
+ /**
+ * Turns an array of FileStatus into an array of Paths.
+ */
+ public static Path[] getPaths(FileStatus[] stats) {
+ if (stats == null) {
+ return null;
}
-
- /**
- * Turns an array of FileStatus into an array of Paths.
- */
- public static Path[] getPaths(FileStatus[] stats) {
- if (stats == null) {
- return null;
- }
- if (stats.length == 0) {
- return new Path[0];
- }
- Path[] res = new Path[stats.length];
- for (int i = 0; i < stats.length; i++) {
- res[i] = stats[i].getPath();
- }
- return res;
+ if (stats.length == 0) {
+ return new Path[0];
+ }
+ Path[] res = new Path[stats.length];
+ for (int i = 0; i < stats.length; i++) {
+ res[i] = stats[i].getPath();
}
+ return res;
+ }
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/Histogram.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/Histogram.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/Histogram.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/Histogram.java Fri Jan 9 06:34:33 2015
@@ -72,8 +72,8 @@ public class Histogram<E> {
}
public List<E> sortInverseByValue() {
- List<Map.Entry<E, HistogramEntry>> list =
- new Vector<Map.Entry<E, HistogramEntry>>(map.entrySet());
+ List<Map.Entry<E, HistogramEntry>> list = new Vector<Map.Entry<E, HistogramEntry>>(
+ map.entrySet());
// Sort the list using an annonymous inner class implementing Comparator for
// the compare method
@@ -93,8 +93,8 @@ public class Histogram<E> {
}
public List<E> sortByValue() {
- List<Map.Entry<E, HistogramEntry>> list =
- new Vector<Map.Entry<E, HistogramEntry>>(map.entrySet());
+ List<Map.Entry<E, HistogramEntry>> list = new Vector<Map.Entry<E, HistogramEntry>>(
+ map.entrySet());
// Sort the list using an annonymous inner class implementing Comparator for
// the compare method
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/IdentityPageReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/IdentityPageReducer.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/IdentityPageReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/IdentityPageReducer.java Fri Jan 9 06:34:33 2015
@@ -21,12 +21,12 @@ import java.io.IOException;
import org.apache.nutch.storage.WebPage;
import org.apache.gora.mapreduce.GoraReducer;
-public class IdentityPageReducer
-extends GoraReducer<String, WebPage, String, WebPage> {
+public class IdentityPageReducer extends
+ GoraReducer<String, WebPage, String, WebPage> {
@Override
- protected void reduce(String key, Iterable<WebPage> values,
- Context context) throws IOException, InterruptedException {
+ protected void reduce(String key, Iterable<WebPage> values, Context context)
+ throws IOException, InterruptedException {
for (WebPage page : values) {
context.write(key, page);
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java Fri Jan 9 06:34:33 2015
@@ -28,22 +28,29 @@ import org.apache.hadoop.fs.Path;
* @author Andrzej Bialecki
*/
public class LockUtil {
-
+
/**
* Create a lock file.
- * @param fs filesystem
- * @param lockFile name of the lock file
- * @param accept if true, and the target file exists, consider it valid. If false
- * and the target file exists, throw an IOException.
- * @throws IOException if accept is false, and the target file already exists,
- * or if it's a directory.
+ *
+ * @param fs
+ * filesystem
+ * @param lockFile
+ * name of the lock file
+ * @param accept
+ * if true, and the target file exists, consider it valid. If false
+ * and the target file exists, throw an IOException.
+ * @throws IOException
+ * if accept is false, and the target file already exists, or if
+ * it's a directory.
*/
- public static void createLockFile(FileSystem fs, Path lockFile, boolean accept) throws IOException {
+ public static void createLockFile(FileSystem fs, Path lockFile, boolean accept)
+ throws IOException {
if (fs.exists(lockFile)) {
- if(!accept)
+ if (!accept)
throw new IOException("lock file " + lockFile + " already exists.");
if (fs.getFileStatus(lockFile).isDir())
- throw new IOException("lock file " + lockFile + " already exists and is a directory.");
+ throw new IOException("lock file " + lockFile
+ + " already exists and is a directory.");
// do nothing - the file already exists.
} else {
// make sure parents exist
@@ -55,16 +62,23 @@ public class LockUtil {
/**
* Remove lock file. NOTE: applications enforce the semantics of this file -
* this method simply removes any file with a given name.
- * @param fs filesystem
- * @param lockFile lock file name
+ *
+ * @param fs
+ * filesystem
+ * @param lockFile
+ * lock file name
* @return false, if the lock file doesn't exist. True, if it existed and was
- * successfully removed.
- * @throws IOException if lock file exists but it is a directory.
+ * successfully removed.
+ * @throws IOException
+ * if lock file exists but it is a directory.
*/
- public static boolean removeLockFile(FileSystem fs, Path lockFile) throws IOException {
- if (!fs.exists(lockFile)) return false;
+ public static boolean removeLockFile(FileSystem fs, Path lockFile)
+ throws IOException {
+ if (!fs.exists(lockFile))
+ return false;
if (fs.getFileStatus(lockFile).isDir())
- throw new IOException("lock file " + lockFile + " exists but is a directory!");
+ throw new IOException("lock file " + lockFile
+ + " exists but is a directory!");
return fs.delete(lockFile, false);
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java Fri Jan 9 06:34:33 2015
@@ -37,7 +37,7 @@ import org.apache.tika.mime.MimeTypesFac
// Slf4j logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
+
// imported for Javadoc
import org.apache.nutch.protocol.ProtocolOutput;
@@ -45,12 +45,12 @@ import org.apache.nutch.protocol.Protoco
* @author mattmann
* @since NUTCH-608
*
- * <p>
- * This is a facade class to insulate Nutch from its underlying Mime Type
- * substrate library, <a href="http://incubator.apache.org/tika/">Apache Tika</a>.
- * Any mime handling code should be placed in this utility class, and hidden
- * from the Nutch classes that rely on it.
- * </p>
+ * <p>
+ * This is a facade class to insulate Nutch from its underlying Mime Type
+ * substrate library, <a href="http://incubator.apache.org/tika/">Apache
+ * Tika</a>. Any mime handling code should be placed in this utility
+ * class, and hidden from the Nutch classes that rely on it.
+ * </p>
*/
public final class MimeUtil {
@@ -66,7 +66,8 @@ public final class MimeUtil {
private boolean mimeMagic;
/* our log stream */
- private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class.getName());
+ private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class
+ .getName());
public MimeUtil(Configuration conf) {
tika = new Tika();
@@ -75,25 +76,26 @@ public final class MimeUtil {
.getName());
if (mimeTypez == null) {
try {
- String customMimeTypeFile = conf.get("mime.types.file");
- if (customMimeTypeFile!=null && customMimeTypeFile.equals("")==false){
- try {
- mimeTypez = MimeTypesFactory.create(conf
- .getConfResourceAsInputStream(customMimeTypeFile));
- }
- catch (Exception e){
- LOG.error("Can't load mime.types.file : "+customMimeTypeFile+" using Tika's default");
- }
+ String customMimeTypeFile = conf.get("mime.types.file");
+ if (customMimeTypeFile != null
+ && customMimeTypeFile.equals("") == false) {
+ try {
+ mimeTypez = MimeTypesFactory.create(conf
+ .getConfResourceAsInputStream(customMimeTypeFile));
+ } catch (Exception e) {
+ LOG.error("Can't load mime.types.file : " + customMimeTypeFile
+ + " using Tika's default");
}
- if (mimeTypez==null)
- mimeTypez = MimeTypes.getDefaultMimeTypes();
+ }
+ if (mimeTypez == null)
+ mimeTypez = MimeTypes.getDefaultMimeTypes();
} catch (Exception e) {
- LOG.error("Exception in MimeUtil "+e.getMessage());
+ LOG.error("Exception in MimeUtil " + e.getMessage());
throw new RuntimeException(e);
}
objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
}
-
+
this.mimeTypes = mimeTypez;
this.mimeMagic = conf.getBoolean("mime.type.magic", true);
}
@@ -129,14 +131,13 @@ public final class MimeUtil {
/**
* A facade interface to trying all the possible mime type resolution
* strategies available within Tika. First, the mime type provided in
- * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
- * Then the cleaned mime type is looked up in the underlying Tika
- * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
- * is found, then that mime type is used, otherwise URL resolution is
- * used to try and determine the mime type. However, if
- * <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
- * then mime type magic resolution is used to try and obtain a
- * better-than-the-default approximation of the {@link MimeType}.
+ * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. Then
+ * the cleaned mime type is looked up in the underlying Tika {@link MimeTypes}
+ * registry, by its cleaned name. If the {@link MimeType} is found, then that
+ * mime type is used, otherwise URL resolution is used to try and determine
+ * the mime type. However, if <code>mime.type.magic</code> is enabled in
+ * {@link NutchConfiguration}, then mime type magic resolution is used to try
+ * and obtain a better-than-the-default approximation of the {@link MimeType}.
*
* @param typeName
* The original mime type, returned from a {@link ProtocolOutput}.
@@ -177,7 +178,7 @@ public final class MimeUtil {
throw new RuntimeException(e);
}
} else {
- retType = type.getName();
+ retType = type.getName();
}
// if magic is enabled use mime magic to guess if the mime type returned
@@ -195,14 +196,15 @@ public final class MimeUtil {
InputStream stream = TikaInputStream.get(data);
try {
magicType = tika.detect(stream, tikaMeta);
- } finally {
- stream.close();
+ } finally {
+ stream.close();
}
- } catch (IOException ignore) {}
+ } catch (IOException ignore) {
+ }
if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
- && !magicType.equals(MimeTypes.PLAIN_TEXT)
- && retType != null && !retType.equals(magicType)) {
+ && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null
+ && !retType.equals(magicType)) {
// If magic enabled and the current mime type differs from that of the
// one returned from the magic, take the magic mimeType
@@ -225,12 +227,12 @@ public final class MimeUtil {
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
* method.
- *
+ *
* @param url
* A string representation of the document {@link URL} to sense the
* {@link MimeType} for.
- * @return An appropriate {@link MimeType}, identified from the given
- * Document url in string form.
+ * @return An appropriate {@link MimeType}, identified from the given Document
+ * url in string form.
*/
public String getMimeType(String url) {
return tika.detect(url);
@@ -239,11 +241,11 @@ public final class MimeUtil {
/**
* A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
* method.
- *
+ *
* @param name
* The name of a valid {@link MimeType} in the Tika mime registry.
- * @return The object representation of the {@link MimeType}, if it exists,
- * or null otherwise.
+ * @return The object representation of the {@link MimeType}, if it exists, or
+ * null otherwise.
*/
public String forName(String name) {
try {
@@ -258,7 +260,7 @@ public final class MimeUtil {
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
* method.
- *
+ *
* @param f
* The {@link File} to sense the {@link MimeType} for.
* @return The {@link MimeType} of the given {@link File}, or null if it
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NodeWalker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NodeWalker.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/NodeWalker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/NodeWalker.java Fri Jan 9 06:34:33 2015
@@ -22,13 +22,17 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
- * <p>A utility class that allows the walking of any DOM tree using a stack
- * instead of recursion. As the node tree is walked the next node is popped
- * off of the stack and all of its children are automatically added to the
- * stack to be called in tree order.</p>
+ * <p>
+ * A utility class that allows the walking of any DOM tree using a stack instead
+ * of recursion. As the node tree is walked the next node is popped off of the
+ * stack and all of its children are automatically added to the stack to be
+ * called in tree order.
+ * </p>
*
- * <p>Currently this class is not thread safe. It is assumed that only one
- * thread will be accessing the <code>NodeWalker</code> at any given time.</p>
+ * <p>
+ * Currently this class is not thread safe. It is assumed that only one thread
+ * will be accessing the <code>NodeWalker</code> at any given time.
+ * </p>
*/
public class NodeWalker {
@@ -36,7 +40,7 @@ public class NodeWalker {
private Node currentNode;
private NodeList currentChildren;
private Stack<Node> nodes;
-
+
/**
* Starts the <code>Node</code> tree from the root node.
*
@@ -47,69 +51,77 @@ public class NodeWalker {
nodes = new Stack<Node>();
nodes.add(rootNode);
}
-
+
/**
- * <p>Returns the next <code>Node</code> on the stack and pushes all of its
- * children onto the stack, allowing us to walk the node tree without the
- * use of recursion. If there are no more nodes on the stack then null is
- * returned.</p>
+ * <p>
+ * Returns the next <code>Node</code> on the stack and pushes all of its
+ * children onto the stack, allowing us to walk the node tree without the use
+ * of recursion. If there are no more nodes on the stack then null is
+ * returned.
+ * </p>
*
- * @return Node The next <code>Node</code> on the stack or null if there
- * isn't a next node.
+ * @return Node The next <code>Node</code> on the stack or null if there isn't
+ * a next node.
*/
public Node nextNode() {
-
+
// if no next node return null
if (!hasNext()) {
return null;
}
-
+
// pop the next node off of the stack and push all of its children onto
// the stack
currentNode = nodes.pop();
currentChildren = currentNode.getChildNodes();
int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
-
+
// put the children node on the stack in first to last order
for (int i = childLen - 1; i >= 0; i--) {
nodes.add(currentChildren.item(i));
}
-
+
return currentNode;
}
-
+
/**
- * <p>Skips over and removes from the node stack the children of the last
- * node. When getting a next node from the walker, that node's children
- * are automatically added to the stack. You can call this method to remove
- * those children from the stack.</p>
- *
- * <p>This is useful when you don't want to process deeper into the
- * current path of the node tree but you want to continue processing sibling
- * nodes.</p>
- *
+ * <p>
+ * Skips over and removes from the node stack the children of the last node.
+ * When getting a next node from the walker, that node's children are
+ * automatically added to the stack. You can call this method to remove those
+ * children from the stack.
+ * </p>
+ *
+ * <p>
+ * This is useful when you don't want to process deeper into the current path
+ * of the node tree but you want to continue processing sibling nodes.
+ * </p>
+ *
*/
public void skipChildren() {
-
+
int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
-
- for (int i = 0 ; i < childLen ; i++) {
+
+ for (int i = 0; i < childLen; i++) {
Node child = nodes.peek();
if (child.equals(currentChildren.item(i))) {
nodes.pop();
}
}
}
-
+
/**
* Return the current node.
+ *
* @return Node
*/
public Node getCurrentNode() {
return currentNode;
}
-
- /** * Returns true if there are more nodes on the current stack.
+
+ /**
+ * Returns true if there are more nodes on the current stack.
+ *
* @return
*/
public boolean hasNext() {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NutchConfiguration.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/NutchConfiguration.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/NutchConfiguration.java Fri Jan 9 06:34:33 2015
@@ -23,37 +23,42 @@ import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
-
-/** Utility to create Hadoop {@link Configuration}s that include Nutch-specific
- * resources. */
+/**
+ * Utility to create Hadoop {@link Configuration}s that include Nutch-specific
+ * resources.
+ */
public class NutchConfiguration {
public static final String UUID_KEY = "nutch.conf.uuid";
-
- private NutchConfiguration() {} // singleton
-
+
+ private NutchConfiguration() {
+ } // singleton
+
/*
- * Configuration.hashCode() doesn't return values that
- * correspond to a unique set of parameters. This is a workaround
- * so that we can track instances of Configuration created by Nutch.
+ * Configuration.hashCode() doesn't return values that correspond to a unique
+ * set of parameters. This is a workaround so that we can track instances of
+ * Configuration created by Nutch.
*/
private static void setUUID(Configuration conf) {
UUID uuid = UUID.randomUUID();
conf.set(UUID_KEY, uuid.toString());
}
-
+
/**
- * Retrieve a Nutch UUID of this configuration object, or null
- * if the configuration was created elsewhere.
- * @param conf configuration instance
+ * Retrieve a Nutch UUID of this configuration object, or null if the
+ * configuration was created elsewhere.
+ *
+ * @param conf
+ * configuration instance
* @return uuid or null
*/
public static String getUUID(Configuration conf) {
return conf.get(UUID_KEY);
}
- /** Create a {@link Configuration} for Nutch. This will load the standard
- * Nutch resources, <code>nutch-default.xml</code> and
- * <code>nutch-site.xml</code> overrides.
+ /**
+ * Create a {@link Configuration} for Nutch. This will load the standard Nutch
+ * resources, <code>nutch-default.xml</code> and <code>nutch-site.xml</code>
+ * overrides.
*/
public static Configuration create() {
Configuration conf = new Configuration();
@@ -61,14 +66,19 @@ public class NutchConfiguration {
addNutchResources(conf);
return conf;
}
-
- /** Create a {@link Configuration} from supplied properties.
- * @param addNutchResources if true, then first <code>nutch-default.xml</code>,
- * and then <code>nutch-site.xml</code> will be loaded prior to applying the
- * properties. Otherwise these resources won't be used.
- * @param nutchProperties a set of properties to define (or override)
+
+ /**
+ * Create a {@link Configuration} from supplied properties.
+ *
+ * @param addNutchResources
+ * if true, then first <code>nutch-default.xml</code>, and then
+ * <code>nutch-site.xml</code> will be loaded prior to applying the
+ * properties. Otherwise these resources won't be used.
+ * @param nutchProperties
+ * a set of properties to define (or override)
*/
- public static Configuration create(boolean addNutchResources, Properties nutchProperties) {
+ public static Configuration create(boolean addNutchResources,
+ Properties nutchProperties) {
Configuration conf = new Configuration();
setUUID(conf);
if (addNutchResources) {
@@ -83,8 +93,8 @@ public class NutchConfiguration {
/**
* Add the standard Nutch resources to {@link Configuration}.
*
- * @param conf Configuration object to which
- * configuration is to be added.
+ * @param conf
+ * Configuration object to which configuration is to be added.
*/
private static Configuration addNutchResources(Configuration conf) {
conf.addResource("nutch-default.xml");
@@ -92,4 +102,3 @@ public class NutchConfiguration {
return conf;
}
}
-
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java Fri Jan 9 06:34:33 2015
@@ -35,10 +35,10 @@ public class NutchJob extends Job {
public NutchJob(Configuration conf, String jobName) throws IOException {
super(conf, jobName);
- //prefix jobName with crawlId if not empty
+ // prefix jobName with crawlId if not empty
String crawlId = conf.get("storage.crawl.id");
if (!StringUtils.isEmpty(crawlId)) {
- jobName = "["+crawlId+"]"+jobName;
+ jobName = "[" + crawlId + "]" + jobName;
setJobName(jobName);
}
setJarByClass(this.getClass());
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJobConf.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJobConf.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJobConf.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJobConf.java Fri Jan 9 06:34:33 2015
@@ -20,7 +20,7 @@ package org.apache.nutch.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
-/** A {@link JobConf} for Nutch jobs. */
+/** A {@link JobConf} for Nutch jobs. */
public class NutchJobConf extends JobConf {
public NutchJobConf(Configuration conf) {
@@ -28,4 +28,3 @@ public class NutchJobConf extends JobCon
}
}
-
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NutchTool.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NutchTool.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/NutchTool.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/NutchTool.java Fri Jan 9 06:34:33 2015
@@ -26,19 +26,20 @@ import org.apache.hadoop.mapreduce.Job;
import org.apache.nutch.metadata.Nutch;
public abstract class NutchTool extends Configured {
-
- protected HashMap<String,Object> results = new HashMap<String,Object>();
- protected Map<String,Object> status =
- Collections.synchronizedMap(new HashMap<String,Object>());
+
+ protected HashMap<String, Object> results = new HashMap<String, Object>();
+ protected Map<String, Object> status = Collections
+ .synchronizedMap(new HashMap<String, Object>());
protected Job currentJob;
protected int numJobs;
protected int currentJobNum;
-
- /** Runs the tool, using a map of arguments.
- * May return results, or null.
+
+ /**
+ * Runs the tool, using a map of arguments. May return results, or null.
*/
- public abstract Map<String,Object> run(Map<String,Object> args) throws Exception;
-
+ public abstract Map<String, Object> run(Map<String, Object> args)
+ throws Exception;
+
/** Returns relative progress of the tool, a float in range [0,1]. */
public float getProgress() {
float res = 0;
@@ -55,29 +56,31 @@ public abstract class NutchTool extends
}
// take into account multiple jobs
if (numJobs > 1) {
- res = (currentJobNum + res) / (float)numJobs;
+ res = (currentJobNum + res) / (float) numJobs;
}
status.put(Nutch.STAT_PROGRESS, res);
return res;
}
-
-
+
/** Returns current status of the running tool. */
- public Map<String,Object> getStatus() {
+ public Map<String, Object> getStatus() {
return status;
}
-
- /** Stop the job with the possibility to resume. Subclasses should
- * override this, since by default it calls {@link #killJob()}.
+
+ /**
+ * Stop the job with the possibility to resume. Subclasses should override
+ * this, since by default it calls {@link #killJob()}.
+ *
* @return true if succeeded, false otherwise
*/
public boolean stopJob() throws Exception {
return killJob();
}
-
+
/**
- * Kill the job immediately. Clients should assume that any results
- * that the job produced so far are in inconsistent state or missing.
+ * Kill the job immediately. Clients should assume that any results that the
+ * job produced so far are in inconsistent state or missing.
+ *
* @return true if succeeded, false otherwise.
* @throws Exception
*/
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/ObjectCache.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/ObjectCache.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/ObjectCache.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/ObjectCache.java Fri Jan 9 06:34:33 2015
@@ -24,35 +24,33 @@ import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
public class ObjectCache {
-
+
private static final Logger LOG = LoggerFactory.getLogger(ObjectCache.class);
-
- private static final WeakHashMap<Configuration, ObjectCache> CACHE =
- new WeakHashMap<Configuration, ObjectCache>();
+
+ private static final WeakHashMap<Configuration, ObjectCache> CACHE = new WeakHashMap<Configuration, ObjectCache>();
private final HashMap<String, Object> objectMap;
-
+
private ObjectCache() {
objectMap = new HashMap<String, Object>();
}
-
+
public static ObjectCache get(Configuration conf) {
ObjectCache objectCache = CACHE.get(conf);
if (objectCache == null) {
- LOG.debug("No object cache found for conf=" + conf
- + ", instantiating a new object cache");
+ LOG.debug("No object cache found for conf=" + conf
+ + ", instantiating a new object cache");
objectCache = new ObjectCache();
CACHE.put(conf, objectCache);
}
return objectCache;
}
-
+
public Object getObject(String key) {
return objectMap.get(key);
}
-
+
public void setObject(String key, Object value) {
objectMap.put(key, value);
}
}
-
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/PrefixStringMatcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/PrefixStringMatcher.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/PrefixStringMatcher.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/PrefixStringMatcher.java Fri Jan 9 06:34:33 2015
@@ -21,46 +21,47 @@ import java.util.Collection;
import java.util.Iterator;
/**
- * A class for efficiently matching <code>String</code>s against a set
- * of prefixes.
+ * A class for efficiently matching <code>String</code>s against a set of
+ * prefixes.
*/
public class PrefixStringMatcher extends TrieStringMatcher {
/**
* Creates a new <code>PrefixStringMatcher</code> which will match
- * <code>String</code>s with any prefix in the supplied array.
- * Zero-length <code>Strings</code> are ignored.
+ * <code>String</code>s with any prefix in the supplied array. Zero-length
+ * <code>Strings</code> are ignored.
*/
public PrefixStringMatcher(String[] prefixes) {
super();
- for (int i= 0; i < prefixes.length; i++)
+ for (int i = 0; i < prefixes.length; i++)
addPatternForward(prefixes[i]);
}
/**
* Creates a new <code>PrefixStringMatcher</code> which will match
- * <code>String</code>s with any prefix in the supplied
+ * <code>String</code>s with any prefix in the supplied
* <code>Collection</code>.
- *
- * @throws ClassCastException if any <code>Object</code>s in the
- * collection are not <code>String</code>s
+ *
+ * @throws ClassCastException
+ * if any <code>Object</code>s in the collection are not
+ * <code>String</code>s
*/
public PrefixStringMatcher(Collection<String> prefixes) {
super();
- Iterator<String> iter= prefixes.iterator();
+ Iterator<String> iter = prefixes.iterator();
while (iter.hasNext())
addPatternForward(iter.next());
}
/**
- * Returns true if the given <code>String</code> is matched by a
- * prefix in the trie
+ * Returns true if the given <code>String</code> is matched by a prefix in the
+ * trie
*/
public boolean matches(String input) {
- TrieNode node= root;
- for (int i= 0; i < input.length(); i++) {
- node= node.getChild(input.charAt(i));
- if (node == null)
+ TrieNode node = root;
+ for (int i = 0; i < input.length(); i++) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
return false;
if (node.isTerminal())
return true;
@@ -73,13 +74,13 @@ public class PrefixStringMatcher extends
* or <code>null</code> if no match exists.
*/
public String shortestMatch(String input) {
- TrieNode node= root;
- for (int i= 0; i < input.length(); i++) {
- node= node.getChild(input.charAt(i));
- if (node == null)
+ TrieNode node = root;
+ for (int i = 0; i < input.length(); i++) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
return null;
if (node.isTerminal())
- return input.substring(0, i+1);
+ return input.substring(0, i + 1);
}
return null;
}
@@ -89,29 +90,26 @@ public class PrefixStringMatcher extends
* or <code>null</code> if no match exists.
*/
public String longestMatch(String input) {
- TrieNode node= root;
- String result= null;
- for (int i= 0; i < input.length(); i++) {
- node= node.getChild(input.charAt(i));
- if (node == null)
+ TrieNode node = root;
+ String result = null;
+ for (int i = 0; i < input.length(); i++) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
break;
if (node.isTerminal())
- result= input.substring(0, i+1);
+ result = input.substring(0, i + 1);
}
return result;
}
public static final void main(String[] argv) {
- PrefixStringMatcher matcher=
- new PrefixStringMatcher(
- new String[]
- {"abcd", "abc", "aac", "baz", "foo", "foobar"} );
-
- String[] tests= {"a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
- "aaccca", "abaz", "baz", "bazooka", "fo", "foobar",
- "kite", };
+ PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] {
+ "abcd", "abc", "aac", "baz", "foo", "foobar" });
- for (int i= 0; i < tests.length; i++) {
+ String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
+ "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
+
+ for (int i = 0; i < tests.length; i++) {
System.out.println("testing: " + tests[i]);
System.out.println(" matches: " + matcher.matches(tests[i]));
System.out.println(" shortest: " + matcher.shortestMatch(tests[i]));
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java Fri Jan 9 06:34:33 2015
@@ -20,42 +20,42 @@ package org.apache.nutch.util;
import java.nio.ByteBuffer;
/**
- * A collection of String processing utility methods.
+ * A collection of String processing utility methods.
*/
public class StringUtil {
/**
- * Returns a copy of <code>s</code> padded with trailing spaces so
- * that it's length is <code>length</code>. Strings already
- * <code>length</code> characters long or longer are not altered.
+ * Returns a copy of <code>s</code> padded with trailing spaces so that its
+ * length is <code>length</code>. Strings already <code>length</code>
+ * characters long or longer are not altered.
*/
public static String rightPad(String s, int length) {
- StringBuffer sb= new StringBuffer(s);
- for (int i= length - s.length(); i > 0; i--)
+ StringBuffer sb = new StringBuffer(s);
+ for (int i = length - s.length(); i > 0; i--)
sb.append(" ");
return sb.toString();
}
/**
- * Returns a copy of <code>s</code> padded with leading spaces so
- * that it's length is <code>length</code>. Strings already
- * <code>length</code> characters long or longer are not altered.
+ * Returns a copy of <code>s</code> padded with leading spaces so that its
+ * length is <code>length</code>. Strings already <code>length</code>
+ * characters long or longer are not altered.
*/
public static String leftPad(String s, int length) {
- StringBuffer sb= new StringBuffer();
- for (int i= length - s.length(); i > 0; i--)
+ StringBuffer sb = new StringBuffer();
+ for (int i = length - s.length(); i > 0; i--)
sb.append(" ");
sb.append(s);
return sb.toString();
}
-
- private static final char[] HEX_DIGITS =
- {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
+ private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6',
+ '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
/**
* Convenience call for {@link #toHexString(ByteBuffer, String, int)}, where
* <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+ *
* @param buf
*/
public static String toHexString(ByteBuffer buf) {
@@ -65,19 +65,25 @@ public class StringUtil {
/**
* Get a text representation of a ByteBuffer as hexadecimal String, where each
* pair of hexadecimal digits corresponds to consecutive bytes in the array.
- * @param buf input data
- * @param sep separate every pair of hexadecimal digits with this separator, or
- * null if no separation is needed.
- * @param lineLen break the output String into lines containing output for lineLen
- * bytes.
+ *
+ * @param buf
+ * input data
+ * @param sep
+ * separate every pair of hexadecimal digits with this separator, or
+ * null if no separation is needed.
+ * @param lineLen
+ * break the output String into lines containing output for lineLen
+ * bytes.
*/
public static String toHexString(ByteBuffer buf, String sep, int lineLen) {
- return toHexString(buf.array(), buf.arrayOffset() + buf.position(), buf.remaining(), sep, lineLen);
+ return toHexString(buf.array(), buf.arrayOffset() + buf.position(),
+ buf.remaining(), sep, lineLen);
}
/**
* Convenience call for {@link #toHexString(byte[], String, int)}, where
* <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+ *
* @param buf
*/
public static String toHexString(byte[] buf) {
@@ -87,11 +93,15 @@ public class StringUtil {
/**
* Get a text representation of a byte[] as hexadecimal String, where each
* pair of hexadecimal digits corresponds to consecutive bytes in the array.
- * @param buf input data
- * @param sep separate every pair of hexadecimal digits with this separator, or
- * null if no separation is needed.
- * @param lineLen break the output String into lines containing output for lineLen
- * bytes.
+ *
+ * @param buf
+ * input data
+ * @param sep
+ * separate every pair of hexadecimal digits with this separator, or
+ * null if no separation is needed.
+ * @param lineLen
+ * break the output String into lines containing output for lineLen
+ * bytes.
*/
public static String toHexString(byte[] buf, String sep, int lineLen) {
return toHexString(buf, 0, buf.length, sep, lineLen);
@@ -100,39 +110,53 @@ public class StringUtil {
/**
* Get a text representation of a byte[] as hexadecimal String, where each
* pair of hexadecimal digits corresponds to consecutive bytes in the array.
- * @param buf input data
- * @param of the offset into the byte[] to start reading
- * @param cb the number of bytes to read from the byte[]
- * @param sep separate every pair of hexadecimal digits with this separator, or
- * null if no separation is needed.
- * @param lineLen break the output String into lines containing output for lineLen
- * bytes.
- */
- public static String toHexString(byte[] buf, int of, int cb, String sep, int lineLen) {
- if (buf == null) return null;
- if (lineLen <= 0) lineLen = Integer.MAX_VALUE;
+ *
+ * @param buf
+ * input data
+ * @param of
+ * the offset into the byte[] to start reading
+ * @param cb
+ * the number of bytes to read from the byte[]
+ * @param sep
+ * separate every pair of hexadecimal digits with this separator, or
+ * null if no separation is needed.
+ * @param lineLen
+ * break the output String into lines containing output for lineLen
+ * bytes.
+ */
+ public static String toHexString(byte[] buf, int of, int cb, String sep,
+ int lineLen) {
+ if (buf == null)
+ return null;
+ if (lineLen <= 0)
+ lineLen = Integer.MAX_VALUE;
StringBuffer res = new StringBuffer(cb * 2);
for (int c = 0; c < cb; c++) {
int b = buf[of++];
res.append(HEX_DIGITS[(b >> 4) & 0xf]);
res.append(HEX_DIGITS[b & 0xf]);
- if (c > 0 && (c % lineLen) == 0) res.append('\n');
- else if (sep != null && c < lineLen - 1) res.append(sep);
+ if (c > 0 && (c % lineLen) == 0)
+ res.append('\n');
+ else if (sep != null && c < lineLen - 1)
+ res.append(sep);
}
return res.toString();
}
-
+
/**
* Convert a String containing consecutive (no inside whitespace) hexadecimal
- * digits into a corresponding byte array. If the number of digits is not even,
- * a '0' will be appended in the front of the String prior to conversion.
- * Leading and trailing whitespace is ignored.
- * @param text input text
+ * digits into a corresponding byte array. If the number of digits is not
+ * even, a '0' will be prepended to the front of the String prior to
+ * conversion. Leading and trailing whitespace is ignored.
+ *
+ * @param text
+ * input text
* @return converted byte array, or null if unable to convert
*/
public static byte[] fromHexString(String text) {
text = text.trim();
- if (text.length() % 2 != 0) text = "0" + text;
+ if (text.length() % 2 != 0)
+ text = "0" + text;
int resLen = text.length() / 2;
int loNibble, hiNibble;
byte[] res = new byte[resLen];
@@ -140,12 +164,13 @@ public class StringUtil {
int j = i << 1;
hiNibble = charToNibble(text.charAt(j));
loNibble = charToNibble(text.charAt(j + 1));
- if (loNibble == -1 || hiNibble == -1) return null;
- res[i] = (byte)(hiNibble << 4 | loNibble);
+ if (loNibble == -1 || hiNibble == -1)
+ return null;
+ res[i] = (byte) (hiNibble << 4 | loNibble);
}
return res;
}
-
+
private static final int charToNibble(char c) {
if (c >= '0' && c <= '9') {
return c - '0';
@@ -164,11 +189,12 @@ public class StringUtil {
public static boolean isEmpty(String str) {
return (str == null) || (str.equals(""));
}
-
/**
* Takes in a String value and cleans out any offending "�"
- * @param value the dirty String value.
+ *
+ * @param value
+ * the dirty String value.
* @return clean String
*/
public static String cleanField(String value) {
@@ -178,8 +204,8 @@ public class StringUtil {
public static void main(String[] args) {
if (args.length != 1)
System.out.println("Usage: StringUtil <encoding name>");
- else
- System.out.println(args[0] + " is resolved to " +
- EncodingDetector.resolveEncodingAlias(args[0]));
+ else
+ System.out.println(args[0] + " is resolved to "
+ + EncodingDetector.resolveEncodingAlias(args[0]));
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/SuffixStringMatcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/SuffixStringMatcher.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/SuffixStringMatcher.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/SuffixStringMatcher.java Fri Jan 9 06:34:33 2015
@@ -21,8 +21,8 @@ import java.util.Collection;
import java.util.Iterator;
/**
- * A class for efficiently matching <code>String</code>s against a set
- * of suffixes. Zero-length <code>Strings</code> are ignored.
+ * A class for efficiently matching <code>String</code>s against a set of
+ * suffixes. Zero-length <code>Strings</code> are ignored.
*/
public class SuffixStringMatcher extends TrieStringMatcher {
@@ -32,7 +32,7 @@ public class SuffixStringMatcher extends
*/
public SuffixStringMatcher(String[] suffixes) {
super();
- for (int i= 0; i < suffixes.length; i++)
+ for (int i = 0; i < suffixes.length; i++)
addPatternBackward(suffixes[i]);
}
@@ -49,14 +49,14 @@ public class SuffixStringMatcher extends
}
/**
- * Returns true if the given <code>String</code> is matched by a
- * suffix in the trie
+ * Returns true if the given <code>String</code> is matched by a suffix in the
+ * trie
*/
public boolean matches(String input) {
- TrieNode node= root;
- for (int i= input.length() - 1; i >= 0; i--) {
- node= node.getChild(input.charAt(i));
- if (node == null)
+ TrieNode node = root;
+ for (int i = input.length() - 1; i >= 0; i--) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
return false;
if (node.isTerminal())
return true;
@@ -64,16 +64,15 @@ public class SuffixStringMatcher extends
return false;
}
-
/**
* Returns the shortest suffix of <code>input</code> that is matched,
* or <code>null</code> if no match exists.
*/
public String shortestMatch(String input) {
- TrieNode node= root;
- for (int i= input.length() - 1; i >= 0; i--) {
- node= node.getChild(input.charAt(i));
- if (node == null)
+ TrieNode node = root;
+ for (int i = input.length() - 1; i >= 0; i--) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
return null;
if (node.isTerminal())
return input.substring(i);
@@ -86,29 +85,26 @@ public class SuffixStringMatcher extends
* or <code>null</code> if no match exists.
*/
public String longestMatch(String input) {
- TrieNode node= root;
- String result= null;
- for (int i= input.length() - 1; i >= 0; i--) {
- node= node.getChild(input.charAt(i));
- if (node == null)
+ TrieNode node = root;
+ String result = null;
+ for (int i = input.length() - 1; i >= 0; i--) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
break;
if (node.isTerminal())
- result= input.substring(i);
+ result = input.substring(i);
}
return result;
}
public static final void main(String[] argv) {
- SuffixStringMatcher matcher=
- new SuffixStringMatcher(
- new String[]
- {"a", "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar"} );
-
- String[] tests= {"a", "ac", "abcd", "abcdefg", "apple", "aa", "aac",
- "aaccca", "abaz", "baz", "bazooka", "fo", "foobar",
- "kite", };
+ SuffixStringMatcher matcher = new SuffixStringMatcher(new String[] { "a",
+ "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar" });
+
+ String[] tests = { "a", "ac", "abcd", "abcdefg", "apple", "aa", "aac",
+ "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
- for (int i= 0; i < tests.length; i++) {
+ for (int i = 0; i < tests.length; i++) {
System.out.println("testing: " + tests[i]);
System.out.println(" matches: " + matcher.matches(tests[i]));
System.out.println(" shortest: " + matcher.shortestMatch(tests[i]));
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java Fri Jan 9 06:34:33 2015
@@ -33,7 +33,7 @@ public class TableUtil {
* <p>
* E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
* "com.foo.bar:http:8983/to/index.html?a=b".
- *
+ *
* @param url
* url to be reversed
* @return Reversed url
@@ -50,7 +50,7 @@ public class TableUtil {
* <p>
* E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
* "com.foo.bar:http:8983/to/index.html?a=b".
- *
+ *
* @param url
* url to be reversed
* @return Reversed url
@@ -93,8 +93,11 @@ public class TableUtil {
pathBegin = reversedUrl.length();
String sub = reversedUrl.substring(0, pathBegin);
- String[] splits = StringUtils.splitPreserveAllTokens(sub, ':'); // {<reversed host>, <port>, <protocol>}
-
+ String[] splits = StringUtils.splitPreserveAllTokens(sub, ':'); // {<reversed
+ // host>,
+ // <protocol>,
+ // <port>}
+
buf.append(splits[1]); // add protocol
buf.append("://");
reverseAppendSplits(splits[0], buf); // splits[0] is reversed
@@ -110,7 +113,7 @@ public class TableUtil {
/**
* Given a reversed url, returns the reversed host E.g
* "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar"
- *
+ *
* @param reversedUrl
* Reversed url
* @return Reversed host
@@ -120,7 +123,7 @@ public class TableUtil {
}
private static void reverseAppendSplits(String string, StringBuilder buf) {
- String[] splits = StringUtils.split(string,'.');
+ String[] splits = StringUtils.split(string, '.');
if (splits.length > 0) {
for (int i = splits.length - 1; i > 0; i--) {
buf.append(splits[i]);
@@ -136,18 +139,18 @@ public class TableUtil {
StringBuilder buf = new StringBuilder();
reverseAppendSplits(hostName, buf);
return buf.toString();
-
+
}
+
public static String unreverseHost(String reversedHostName) {
return reverseHost(reversedHostName); // Reversible
}
-
-
+
/**
- * Convert given Utf8 instance to String and and cleans out
- * any offending "�" from the String.
- *
- *
+ * Convert given Utf8 instance to String and cleans out any offending "�"
+ * from the String.
+ *
+ *
* @param utf8
* Utf8 object
* @return string-ifed Utf8 object or null if Utf8 instance is null
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/TimingUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/TimingUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/TimingUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/TimingUtil.java Fri Jan 9 06:34:33 2015
@@ -21,35 +21,39 @@ import java.text.NumberFormat;
public class TimingUtil {
- private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 };
+ private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 };
- /**
- * Calculate the elapsed time between two times specified in milliseconds.
- * @param start The start of the time period
- * @param end The end of the time period
- * @return a string of the form "XhYmZs" when the elapsed time is X hours, Y minutes and Z seconds or null if start > end.
- */
- public static String elapsedTime(long start, long end){
- if (start > end) {
- return null;
- }
-
- long[] elapsedTime = new long[TIME_FACTOR.length];
-
- for (int i = 0; i < TIME_FACTOR.length; i++) {
- elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i];
- start += TIME_FACTOR[i] * elapsedTime[i];
- }
-
- NumberFormat nf = NumberFormat.getInstance();
- nf.setMinimumIntegerDigits(2);
- StringBuffer buf = new StringBuffer();
- for (int i = 0; i < elapsedTime.length; i++) {
- if (i > 0) {
- buf.append(":");
- }
- buf.append(nf.format(elapsedTime[i]));
- }
- return buf.toString();
+ /**
+ * Calculate the elapsed time between two times specified in milliseconds.
+ *
+ * @param start
+ * The start of the time period
+ * @param end
+ * The end of the time period
+ * @return a string of the form "HH:MM:SS" (hours, minutes and seconds, each
+ * formatted with at least two digits), or null if start > end.
+ */
+ public static String elapsedTime(long start, long end) {
+ if (start > end) {
+ return null;
}
+
+ long[] elapsedTime = new long[TIME_FACTOR.length];
+
+ for (int i = 0; i < TIME_FACTOR.length; i++) {
+ elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i];
+ start += TIME_FACTOR[i] * elapsedTime[i];
+ }
+
+ NumberFormat nf = NumberFormat.getInstance();
+ nf.setMinimumIntegerDigits(2);
+ StringBuffer buf = new StringBuffer();
+ for (int i = 0; i < elapsedTime.length; i++) {
+ if (i > 0) {
+ buf.append(":");
+ }
+ buf.append(nf.format(elapsedTime[i]));
+ }
+ return buf.toString();
+ }
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/ToolUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/ToolUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/ToolUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/ToolUtil.java Fri Jan 9 06:34:33 2015
@@ -28,14 +28,14 @@ import org.apache.nutch.metadata.Nutch;
public class ToolUtil {
- public static final Map<String,Object> toArgMap(Object... args) {
+ public static final Map<String, Object> toArgMap(Object... args) {
if (args == null) {
return null;
}
if (args.length % 2 != 0) {
throw new RuntimeException("expected pairs of argName argValue");
}
- HashMap<String,Object> res = new HashMap<String,Object>();
+ HashMap<String, Object> res = new HashMap<String, Object>();
for (int i = 0; i < args.length; i += 2) {
if (args[i + 1] != null) {
res.put(String.valueOf(args[i]), args[i + 1]);
@@ -43,20 +43,22 @@ public class ToolUtil {
}
return res;
}
-
+
@SuppressWarnings("unchecked")
- public static final void recordJobStatus(String label, Job job, Map<String,Object> results) {
- Map<String,Object> jobs = (Map<String,Object>)results.get(Nutch.STAT_JOBS);
+ public static final void recordJobStatus(String label, Job job,
+ Map<String, Object> results) {
+ Map<String, Object> jobs = (Map<String, Object>) results
+ .get(Nutch.STAT_JOBS);
if (jobs == null) {
- jobs = new LinkedHashMap<String,Object>();
+ jobs = new LinkedHashMap<String, Object>();
results.put(Nutch.STAT_JOBS, jobs);
}
- Map<String,Object> stats = new HashMap<String,Object>();
- Map<String,Object> countStats = new HashMap<String,Object>();
+ Map<String, Object> stats = new HashMap<String, Object>();
+ Map<String, Object> countStats = new HashMap<String, Object>();
try {
Counters counters = job.getCounters();
for (CounterGroup cg : counters) {
- Map<String,Object> cnts = new HashMap<String,Object>();
+ Map<String, Object> cnts = new HashMap<String, Object>();
countStats.put(cg.getDisplayName(), cnts);
for (Counter c : cg) {
cnts.put(c.getName(), c.getValue());