You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [9/26] - in /nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/pr...
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java Thu Jan 29 05:38:59 2015
@@ -1,19 +1,19 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements. See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.nutch.scoring;
/**
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java Thu Jan 29 05:38:59 2015
@@ -49,7 +49,8 @@ public class ScoringFilters extends Conf
}
/** Calculate a sort value for Generate. */
- public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
+ public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+ throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
initSort = this.filters[i].generatorSortValue(url, datum, initSort);
}
@@ -57,48 +58,59 @@ public class ScoringFilters extends Conf
}
/** Calculate a new initial score, used when adding newly discovered pages. */
- public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
+ public void initialScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].initialScore(url, datum);
}
}
/** Calculate a new initial score, used when injecting new pages. */
- public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
+ public void injectedScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].injectedScore(url, datum);
}
}
/** Calculate updated page score during CrawlDb.update(). */
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException {
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+ List<CrawlDatum> inlinked) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].updateDbScore(url, old, datum, inlinked);
}
}
- public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException {
+ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+ throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].passScoreBeforeParsing(url, datum, content);
}
}
-
- public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException {
+
+ public void passScoreAfterParsing(Text url, Content content, Parse parse)
+ throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].passScoreAfterParsing(url, content, parse);
}
}
-
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException {
+
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
- adjust = this.filters[i].distributeScoreToOutlinks(fromUrl, parseData, targets, adjust, allCount);
+ adjust = this.filters[i].distributeScoreToOutlinks(fromUrl, parseData,
+ targets, adjust, allCount);
}
return adjust;
}
- public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
+ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+ CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+ throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
- initScore = this.filters[i].indexerScore(url, doc, dbDatum, fetchDatum, parse, inlinks, initScore);
+ initScore = this.filters[i].indexerScore(url, doc, dbDatum, fetchDatum,
+ parse, inlinks, initScore);
}
return initScore;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
* The {@link org.apache.nutch.scoring.ScoringFilter ScoringFilter} interface.
*/
package org.apache.nutch.scoring;
+
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java Thu Jan 29 05:38:59 2015
@@ -27,8 +27,7 @@ import org.apache.hadoop.io.Writable;
* A class for holding link information including the url, anchor text, a score,
* the timestamp of the link and a link type.
*/
-public class LinkDatum
- implements Writable {
+public class LinkDatum implements Writable {
public final static byte INLINK = 1;
public final static byte OUTLINK = 2;
@@ -49,7 +48,8 @@ public class LinkDatum
/**
* Creates a LinkDatum with a given url. Timestamp is set to current time.
*
- * @param url The link url.
+ * @param url
+ * The link url.
*/
public LinkDatum(String url) {
this(url, "", System.currentTimeMillis());
@@ -59,8 +59,10 @@ public class LinkDatum
* Creates a LinkDatum with a url and an anchor text. Timestamp is set to
* current time.
*
- * @param url The link url.
- * @param anchor The link anchor text.
+ * @param url
+ * The link url.
+ * @param anchor
+ * The link anchor text.
*/
public LinkDatum(String url, String anchor) {
this(url, anchor, System.currentTimeMillis());
@@ -112,8 +114,7 @@ public class LinkDatum
this.linkType = linkType;
}
- public void readFields(DataInput in)
- throws IOException {
+ public void readFields(DataInput in) throws IOException {
url = Text.readString(in);
anchor = Text.readString(in);
score = in.readFloat();
@@ -121,8 +122,7 @@ public class LinkDatum
linkType = in.readByte();
}
- public void write(DataOutput out)
- throws IOException {
+ public void write(DataOutput out) throws IOException {
Text.writeString(out, url);
Text.writeString(out, anchor != null ? anchor : "");
out.writeFloat(score);
@@ -132,9 +132,9 @@ public class LinkDatum
public String toString() {
- String type = (linkType == INLINK ? "inlink" : (linkType == OUTLINK)
- ? "outlink" : "unknown");
+ String type = (linkType == INLINK ? "inlink"
+ : (linkType == OUTLINK) ? "outlink" : "unknown");
return "url: " + url + ", anchor: " + anchor + ", score: " + score
- + ", timestamp: " + timestamp + ", link type: " + type;
+ + ", timestamp: " + timestamp + ", link type: " + type;
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Thu Jan 29 05:38:59 2015
@@ -67,27 +67,24 @@ import org.apache.nutch.util.TimingUtil;
/**
* The LinkDumper tool creates a database of node to inlink information that can
- * be read using the nested Reader class. This allows the inlink and scoring
- * state of a single url to be reviewed quickly to determine why a given url is
- * ranking a certain way. This tool is to be used with the LinkRank analysis.
+ * be read using the nested Reader class. This allows the inlink and scoring
+ * state of a single url to be reviewed quickly to determine why a given url is
+ * ranking a certain way. This tool is to be used with the LinkRank analysis.
*/
-public class LinkDumper
- extends Configured
- implements Tool {
+public class LinkDumper extends Configured implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(LinkDumper.class);
public static final String DUMP_DIR = "linkdump";
/**
- * Reader class which will print out the url and all of its inlinks to system
- * out. Each inlinkwill be displayed with its node information including
- * score and number of in and outlinks.
+ * Reader class which will print out the url and all of its inlinks to system
+ * out. Each inlink will be displayed with its node information including score
+ * and number of in and outlinks.
*/
public static class Reader {
- public static void main(String[] args)
- throws Exception {
-
+ public static void main(String[] args) throws Exception {
+
if (args == null || args.length < 2) {
System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
return;
@@ -99,20 +96,20 @@ public class LinkDumper
Path webGraphDb = new Path(args[0]);
String url = args[1];
MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
- webGraphDb, DUMP_DIR), conf);
+ webGraphDb, DUMP_DIR), conf);
// get the link nodes for the url
Text key = new Text(url);
LinkNodes nodes = new LinkNodes();
MapFileOutputFormat.getEntry(readers,
- new HashPartitioner<Text, LinkNodes>(), key, nodes);
+ new HashPartitioner<Text, LinkNodes>(), key, nodes);
// print out the link nodes
LinkNode[] linkNodesAr = nodes.getLinks();
System.out.println(url + ":");
for (LinkNode node : linkNodesAr) {
System.out.println(" " + node.getUrl() + " - "
- + node.getNode().toString());
+ + node.getNode().toString());
}
// close the readers
@@ -123,8 +120,7 @@ public class LinkDumper
/**
* Bean class which holds url to node information.
*/
- public static class LinkNode
- implements Writable {
+ public static class LinkNode implements Writable {
private String url = null;
private Node node = null;
@@ -154,15 +150,13 @@ public class LinkDumper
this.node = node;
}
- public void readFields(DataInput in)
- throws IOException {
+ public void readFields(DataInput in) throws IOException {
url = in.readUTF();
node = new Node();
node.readFields(in);
}
- public void write(DataOutput out)
- throws IOException {
+ public void write(DataOutput out) throws IOException {
out.writeUTF(url);
node.write(out);
}
@@ -172,8 +166,7 @@ public class LinkDumper
/**
* Writable class which holds an array of LinkNode objects.
*/
- public static class LinkNodes
- implements Writable {
+ public static class LinkNodes implements Writable {
private LinkNode[] links;
@@ -193,8 +186,7 @@ public class LinkDumper
this.links = links;
}
- public void readFields(DataInput in)
- throws IOException {
+ public void readFields(DataInput in) throws IOException {
int numLinks = in.readInt();
if (numLinks > 0) {
links = new LinkNode[numLinks];
@@ -206,8 +198,7 @@ public class LinkDumper
}
}
- public void write(DataOutput out)
- throws IOException {
+ public void write(DataOutput out) throws IOException {
if (links != null && links.length > 0) {
int numLinks = links.length;
out.writeInt(numLinks);
@@ -222,9 +213,9 @@ public class LinkDumper
* Inverts outlinks from the WebGraph to inlinks and attaches node
* information.
*/
- public static class Inverter
- implements Mapper<Text, Writable, Text, ObjectWritable>,
- Reducer<Text, ObjectWritable, Text, LinkNode> {
+ public static class Inverter implements
+ Mapper<Text, Writable, Text, ObjectWritable>,
+ Reducer<Text, ObjectWritable, Text, LinkNode> {
private JobConf conf;
@@ -236,8 +227,8 @@ public class LinkDumper
* Wraps all values in ObjectWritables.
*/
public void map(Text key, Writable value,
- OutputCollector<Text, ObjectWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+ throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
@@ -245,12 +236,12 @@ public class LinkDumper
}
/**
- * Inverts outlinks to inlinks while attaching node information to the
+ * Inverts outlinks to inlinks while attaching node information to the
* outlink.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
- OutputCollector<Text, LinkNode> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, LinkNode> output, Reporter reporter)
+ throws IOException {
String fromUrl = key.toString();
List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
@@ -262,13 +253,11 @@ public class LinkDumper
ObjectWritable write = values.next();
Object obj = write.get();
if (obj instanceof Node) {
- node = (Node)obj;
- }
- else if (obj instanceof LinkDatum) {
- outlinks.add(WritableUtils.clone((LinkDatum)obj, conf));
- }
- else if (obj instanceof LoopSet) {
- loops = (LoopSet)obj;
+ node = (Node) obj;
+ } else if (obj instanceof LinkDatum) {
+ outlinks.add(WritableUtils.clone((LinkDatum) obj, conf));
+ } else if (obj instanceof LoopSet) {
+ loops = (LoopSet) obj;
}
}
@@ -280,13 +269,13 @@ public class LinkDumper
for (int i = 0; i < outlinks.size(); i++) {
LinkDatum outlink = outlinks.get(i);
String toUrl = outlink.getUrl();
-
+
// remove any url that is in the loopset, same as LinkRank
if (loopSet != null && loopSet.contains(toUrl)) {
continue;
}
-
- // collect the outlink as an inlink with the node
+
+ // collect the outlink as an inlink with the node
output.collect(new Text(toUrl), new LinkNode(fromUrl, node));
}
}
@@ -297,11 +286,11 @@ public class LinkDumper
}
/**
- * Merges LinkNode objects into a single array value per url. This allows
- * all values to be quickly retrieved and printed via the Reader tool.
+ * Merges LinkNode objects into a single array value per url. This allows all
+ * values to be quickly retrieved and printed via the Reader tool.
*/
- public static class Merger
- implements Reducer<Text, LinkNode, Text, LinkNodes> {
+ public static class Merger implements
+ Reducer<Text, LinkNode, Text, LinkNodes> {
private JobConf conf;
private int maxInlinks = 50000;
@@ -314,8 +303,8 @@ public class LinkDumper
* Aggregate all LinkNode objects for a given url.
*/
public void reduce(Text key, Iterator<LinkNode> values,
- OutputCollector<Text, LinkNodes> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, LinkNodes> output, Reporter reporter)
+ throws IOException {
List<LinkNode> nodeList = new ArrayList<LinkNode>();
int numNodes = 0;
@@ -325,8 +314,7 @@ public class LinkDumper
if (numNodes < maxInlinks) {
nodeList.add(WritableUtils.clone(cur, conf));
numNodes++;
- }
- else {
+ } else {
break;
}
}
@@ -342,11 +330,10 @@ public class LinkDumper
}
/**
- * Runs the inverter and merger jobs of the LinkDumper tool to create the
- * url to inlink node database.
+ * Runs the inverter and merger jobs of the LinkDumper tool to create the url
+ * to inlink node database.
*/
- public void dumpLinks(Path webGraphDb)
- throws IOException {
+ public void dumpLinks(Path webGraphDb) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -362,7 +349,7 @@ public class LinkDumper
// run the inverter job
Path tempInverted = new Path(webGraphDb, "inverted-"
- + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf inverter = new NutchJob(conf);
inverter.setJobName("LinkDumper: inverter");
FileInputFormat.addInputPath(inverter, nodeDb);
@@ -384,8 +371,7 @@ public class LinkDumper
LOG.info("LinkDumper: running inverter");
JobClient.runJob(inverter);
LOG.info("LinkDumper: finished inverter");
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
@@ -407,43 +393,41 @@ public class LinkDumper
LOG.info("LinkDumper: running merger");
JobClient.runJob(merger);
LOG.info("LinkDumper: finished merger");
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
fs.delete(tempInverted, true);
long end = System.currentTimeMillis();
- LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
- public static void main(String[] args)
- throws Exception {
+ public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new LinkDumper(),
- args);
+ args);
System.exit(res);
}
/**
- * Runs the LinkDumper tool. This simply creates the database, to read the
+ * Runs the LinkDumper tool. This simply creates the database, to read the
* values the nested Reader tool must be used.
*/
- public int run(String[] args)
- throws Exception {
+ public int run(String[] args) throws Exception {
Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
-
+
OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the web graph database to use");
Option webGraphDbOpts = OptionBuilder.create("webgraphdb");
options.addOption(webGraphDbOpts);
-
+
CommandLineParser parser = new GnuParser();
try {
@@ -457,8 +441,7 @@ public class LinkDumper
String webGraphDb = line.getOptionValue("webgraphdb");
dumpLinks(new Path(webGraphDb));
return 0;
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.error("LinkDumper: " + StringUtils.stringifyException(e));
return -2;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Thu Jan 29 05:38:59 2015
@@ -68,9 +68,7 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
-public class LinkRank
- extends Configured
- implements Tool {
+public class LinkRank extends Configured implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(LinkRank.class);
private static final String NUM_NODES = "_num_nodes_";
@@ -79,14 +77,16 @@ public class LinkRank
* Runs the counter job. The counter job determines the number of links in the
* webgraph. This is used during analysis.
*
- * @param fs The job file system.
- * @param webGraphDb The web graph database to use.
+ * @param fs
+ * The job file system.
+ * @param webGraphDb
+ * The web graph database to use.
*
* @return The number of nodes in the web graph.
- * @throws IOException If an error occurs while running the counter job.
+ * @throws IOException
+ * If an error occurs while running the counter job.
*/
- private int runCounter(FileSystem fs, Path webGraphDb)
- throws IOException {
+ private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {
// configure the counter job
Path numLinksPath = new Path(webGraphDb, NUM_NODES);
@@ -105,14 +105,14 @@ public class LinkRank
counter.setOutputValueClass(LongWritable.class);
counter.setNumReduceTasks(1);
counter.setOutputFormat(TextOutputFormat.class);
- counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+ counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+ false);
// run the counter job, outputs to a single reduce task and file
LOG.info("Starting link counter job");
try {
JobClient.runJob(counter);
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
@@ -125,13 +125,13 @@ public class LinkRank
BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
String numLinksLine = buffer.readLine();
readLinks.close();
-
+
// check if there are links to process, if none, webgraph might be empty
if (numLinksLine == null || numLinksLine.length() == 0) {
fs.delete(numLinksPath, true);
throw new IOException("No links to process, is the webgraph empty?");
}
-
+
// delete temp file and convert and return the number of links as an int
LOG.info("Deleting numlinks temp file");
fs.delete(numLinksPath, true);
@@ -143,13 +143,15 @@ public class LinkRank
* Runs the initializer job. The initializer job sets up the nodes with a
* default starting score for link analysis.
*
- * @param nodeDb The node database to use.
- * @param output The job output directory.
+ * @param nodeDb
+ * The node database to use.
+ * @param output
+ * The job output directory.
*
- * @throws IOException If an error occurs while running the initializer job.
+ * @throws IOException
+ * If an error occurs while running the initializer job.
*/
- private void runInitializer(Path nodeDb, Path output)
- throws IOException {
+ private void runInitializer(Path nodeDb, Path output) throws IOException {
// configure the initializer
JobConf initializer = new NutchJob(getConf());
@@ -163,14 +165,14 @@ public class LinkRank
initializer.setOutputKeyClass(Text.class);
initializer.setOutputValueClass(Node.class);
initializer.setOutputFormat(MapFileOutputFormat.class);
- initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+ initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+ false);
// run the initializer
LOG.info("Starting initialization job");
try {
JobClient.runJob(initializer);
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
@@ -186,15 +188,20 @@ public class LinkRank
* space requirements but it can be very useful in weeding out and eliminating
* link farms and other spam pages.
*
- * @param nodeDb The node database to use.
- * @param outlinkDb The outlink database to use.
- * @param loopDb The loop database to use if it exists.
- * @param output The output directory.
+ * @param nodeDb
+ * The node database to use.
+ * @param outlinkDb
+ * The outlink database to use.
+ * @param loopDb
+ * The loop database to use if it exists.
+ * @param output
+ * The output directory.
*
- * @throws IOException If an error occurs while running the inverter job.
+ * @throws IOException
+ * If an error occurs while running the inverter job.
*/
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
- throws IOException {
+ throws IOException {
// configure the inverter
JobConf inverter = new NutchJob(getConf());
@@ -215,14 +222,14 @@ public class LinkRank
inverter.setOutputKeyClass(Text.class);
inverter.setOutputValueClass(LinkDatum.class);
inverter.setOutputFormat(SequenceFileOutputFormat.class);
- inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+ inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+ false);
// run the inverter job
LOG.info("Starting inverter job");
try {
JobClient.runJob(inverter);
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
@@ -236,23 +243,28 @@ public class LinkRank
* Typically the link analysis job is run a number of times to allow the link
* rank scores to converge.
*
- * @param nodeDb The node database from which we are getting previous link
- * rank scores.
- * @param inverted The inverted inlinks
- * @param output The link analysis output.
- * @param iteration The current iteration number.
- * @param numIterations The total number of link analysis iterations
+ * @param nodeDb
+ * The node database from which we are getting previous link rank
+ * scores.
+ * @param inverted
+ * The inverted inlinks
+ * @param output
+ * The link analysis output.
+ * @param iteration
+ * The current iteration number.
+ * @param numIterations
+ * The total number of link analysis iterations
*
- * @throws IOException If an error occurs during link analysis.
+ * @throws IOException
+ * If an error occurs during link analysis.
*/
private void runAnalysis(Path nodeDb, Path inverted, Path output,
- int iteration, int numIterations, float rankOne)
- throws IOException {
+ int iteration, int numIterations, float rankOne) throws IOException {
JobConf analyzer = new NutchJob(getConf());
analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1)
- + " of " + numIterations);
+ + " of " + numIterations);
FileInputFormat.addInputPath(analyzer, nodeDb);
FileInputFormat.addInputPath(analyzer, inverted);
FileOutputFormat.setOutputPath(analyzer, output);
@@ -265,13 +277,13 @@ public class LinkRank
analyzer.setOutputKeyClass(Text.class);
analyzer.setOutputValueClass(Node.class);
analyzer.setOutputFormat(MapFileOutputFormat.class);
- analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+ analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+ false);
LOG.info("Starting analysis job");
try {
JobClient.runJob(analyzer);
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
@@ -283,9 +295,9 @@ public class LinkRank
* This is used to determine a rank one score for pages with zero inlinks but
* that contain outlinks.
*/
- private static class Counter
- implements Mapper<Text, Node, Text, LongWritable>,
- Reducer<Text, LongWritable, Text, LongWritable> {
+ private static class Counter implements
+ Mapper<Text, Node, Text, LongWritable>,
+ Reducer<Text, LongWritable, Text, LongWritable> {
private static Text numNodes = new Text(NUM_NODES);
private static LongWritable one = new LongWritable(1L);
@@ -297,8 +309,8 @@ public class LinkRank
* Outputs one for every node.
*/
public void map(Text key, Node value,
- OutputCollector<Text, LongWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, LongWritable> output, Reporter reporter)
+ throws IOException {
output.collect(numNodes, one);
}
@@ -306,8 +318,8 @@ public class LinkRank
* Totals the node number and outputs a single total value.
*/
public void reduce(Text key, Iterator<LongWritable> values,
- OutputCollector<Text, LongWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, LongWritable> output, Reporter reporter)
+ throws IOException {
long total = 0;
while (values.hasNext()) {
@@ -320,8 +332,7 @@ public class LinkRank
}
}
- private static class Initializer
- implements Mapper<Text, Node, Text, Node> {
+ private static class Initializer implements Mapper<Text, Node, Text, Node> {
private JobConf conf;
private float initialScore = 1.0f;
@@ -332,8 +343,7 @@ public class LinkRank
}
public void map(Text key, Node node, OutputCollector<Text, Node> output,
- Reporter reporter)
- throws IOException {
+ Reporter reporter) throws IOException {
String url = key.toString();
Node outNode = WritableUtils.clone(node, conf);
@@ -351,9 +361,9 @@ public class LinkRank
* WebGraph. The link analysis process consists of inverting, analyzing and
* scoring, in a loop for a given number of iterations.
*/
- private static class Inverter
- implements Mapper<Text, Writable, Text, ObjectWritable>,
- Reducer<Text, ObjectWritable, Text, LinkDatum> {
+ private static class Inverter implements
+ Mapper<Text, Writable, Text, ObjectWritable>,
+ Reducer<Text, ObjectWritable, Text, LinkDatum> {
private JobConf conf;
@@ -365,8 +375,8 @@ public class LinkRank
* Convert values to ObjectWritable
*/
public void map(Text key, Writable value,
- OutputCollector<Text, ObjectWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+ throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
@@ -379,8 +389,8 @@ public class LinkRank
* within the loopset.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
- OutputCollector<Text, LinkDatum> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, LinkDatum> output, Reporter reporter)
+ throws IOException {
String fromUrl = key.toString();
List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
@@ -392,23 +402,25 @@ public class LinkRank
ObjectWritable write = values.next();
Object obj = write.get();
if (obj instanceof Node) {
- node = (Node)obj;
- }
- else if (obj instanceof LinkDatum) {
- outlinks.add(WritableUtils.clone((LinkDatum)obj, conf));
- }
- else if (obj instanceof LoopSet) {
- loops = (LoopSet)obj;
+ node = (Node) obj;
+ } else if (obj instanceof LinkDatum) {
+ outlinks.add(WritableUtils.clone((LinkDatum) obj, conf));
+ } else if (obj instanceof LoopSet) {
+ loops = (LoopSet) obj;
}
}
- // Check for the possibility of a LoopSet object without Node and LinkDatum objects. This can happen
- // with webgraphs that receive deletes (e.g. link.delete.gone and/or URL filters or normalizers) but
+ // Check for the possibility of a LoopSet object without Node and
+ // LinkDatum objects. This can happen
+ // with webgraphs that receive deletes (e.g. link.delete.gone and/or URL
+ // filters or normalizers) but
// without an updated Loops database.
// See: https://issues.apache.org/jira/browse/NUTCH-1299
if (node == null && loops != null) {
// Nothing to do
- LOG.warn("LoopSet without Node object received for " + key.toString() + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph.");
+ LOG.warn("LoopSet without Node object received for "
+ + key.toString()
+ + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph.");
return;
}
@@ -430,7 +442,7 @@ public class LinkRank
// remove any url that is contained in the loopset
if (loopSet != null && loopSet.contains(toUrl)) {
LOG.debug(fromUrl + ": Skipping inverting inlink from loop "
- + toUrl);
+ + toUrl);
continue;
}
outlink.setUrl(fromUrl);
@@ -439,8 +451,8 @@ public class LinkRank
// collect the inverted outlink
output.collect(new Text(toUrl), outlink);
LOG.debug(toUrl + ": inverting inlink from " + fromUrl
- + " origscore: " + inlinkScore + " numOutlinks: " + numOutlinks
- + " inlinkscore: " + outlinkScore);
+ + " origscore: " + inlinkScore + " numOutlinks: " + numOutlinks
+ + " inlinkscore: " + outlinkScore);
}
}
}
@@ -452,9 +464,9 @@ public class LinkRank
/**
* Runs a single link analysis iteration.
*/
- private static class Analyzer
- implements Mapper<Text, Writable, Text, ObjectWritable>,
- Reducer<Text, ObjectWritable, Text, Node> {
+ private static class Analyzer implements
+ Mapper<Text, Writable, Text, ObjectWritable>,
+ Reducer<Text, ObjectWritable, Text, Node> {
private JobConf conf;
private float dampingFactor = 0.85f;
@@ -471,13 +483,13 @@ public class LinkRank
try {
this.conf = conf;
- this.dampingFactor = conf.getFloat("link.analyze.damping.factor", 0.85f);
+ this.dampingFactor = conf
+ .getFloat("link.analyze.damping.factor", 0.85f);
this.rankOne = conf.getFloat("link.analyze.rank.one", 0.0f);
this.itNum = conf.getInt("link.analyze.iteration", 0);
limitPages = conf.getBoolean("link.ignore.limit.page", true);
limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.error(StringUtils.stringifyException(e));
throw new IllegalArgumentException(e);
}
@@ -487,8 +499,8 @@ public class LinkRank
* Convert values to ObjectWritable
*/
public void map(Text key, Writable value,
- OutputCollector<Text, ObjectWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+ throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(WritableUtils.clone(value, conf));
@@ -500,8 +512,8 @@ public class LinkRank
* stored in a temporary NodeDb which replaces the NodeDb of the WebGraph.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
- OutputCollector<Text, Node> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, Node> output, Reporter reporter)
+ throws IOException {
String url = key.toString();
Set<String> domains = new HashSet<String>();
@@ -517,11 +529,10 @@ public class LinkRank
ObjectWritable next = values.next();
Object value = next.get();
if (value instanceof Node) {
- node = (Node)value;
- }
- else if (value instanceof LinkDatum) {
+ node = (Node) value;
+ } else if (value instanceof LinkDatum) {
- LinkDatum linkDatum = (LinkDatum)value;
+ LinkDatum linkDatum = (LinkDatum) value;
float scoreFromInlink = linkDatum.getScore();
String inlinkUrl = linkDatum.getUrl();
String inLinkDomain = URLUtil.getDomainName(inlinkUrl);
@@ -529,9 +540,9 @@ public class LinkRank
// limit counting duplicate inlinks by pages or domains
if ((limitPages && pages.contains(inLinkPage))
- || (limitDomains && domains.contains(inLinkDomain))) {
+ || (limitDomains && domains.contains(inLinkDomain))) {
LOG.debug(url + ": ignoring " + scoreFromInlink + " from "
- + inlinkUrl + ", duplicate page or domain");
+ + inlinkUrl + ", duplicate page or domain");
continue;
}
@@ -541,16 +552,16 @@ public class LinkRank
domains.add(inLinkDomain);
pages.add(inLinkPage);
LOG.debug(url + ": adding " + scoreFromInlink + " from " + inlinkUrl
- + ", total: " + totalInlinkScore);
+ + ", total: " + totalInlinkScore);
}
}
// calculate linkRank score formula
float linkRankScore = (1 - this.dampingFactor)
- + (this.dampingFactor * totalInlinkScore);
+ + (this.dampingFactor * totalInlinkScore);
LOG.debug(url + ": score: " + linkRankScore + " num inlinks: "
- + numInlinks + " iteration: " + itNum);
+ + numInlinks + " iteration: " + itNum);
// store the score in a temporary NodeDb
Node outNode = WritableUtils.clone(node, conf);
@@ -558,8 +569,7 @@ public class LinkRank
output.collect(key, outNode);
}
- public void close()
- throws IOException {
+ public void close() throws IOException {
}
}
@@ -586,12 +596,13 @@ public class LinkRank
* by default 10. And finally replaces the NodeDb in the WebGraph with the
* link rank output.
*
- * @param webGraphDb The WebGraph to run link analysis on.
+ * @param webGraphDb
+ * The WebGraph to run link analysis on.
*
- * @throws IOException If an error occurs during link analysis.
+ * @throws IOException
+ * If an error occurs during link analysis.
*/
- public void analyze(Path webGraphDb)
- throws IOException {
+ public void analyze(Path webGraphDb) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -621,7 +632,7 @@ public class LinkRank
// initialize all urls with a default score
int numLinks = runCounter(fs, webGraphDb);
runInitializer(wgNodeDb, nodeDb);
- float rankOneScore = (1f / (float)numLinks);
+ float rankOneScore = (1f / (float) numLinks);
if (LOG.isInfoEnabled()) {
LOG.info("Analysis: Number of links: " + numLinks);
@@ -634,9 +645,10 @@ public class LinkRank
for (int i = 0; i < numIterations; i++) {
// the input to inverting is always the previous output from analysis
- LOG.info("Analysis: Starting iteration " + (i + 1) + " of " + numIterations);
+ LOG.info("Analysis: Starting iteration " + (i + 1) + " of "
+ + numIterations);
Path tempRank = new Path(linkRank + "-"
- + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
fs.mkdirs(tempRank);
Path tempInverted = new Path(tempRank, "inverted");
Path tempNodeDb = new Path(tempRank, WebGraph.NODE_DIR);
@@ -644,13 +656,13 @@ public class LinkRank
// run invert and analysis
runInverter(nodeDb, wgOutlinkDb, loopDb, tempInverted);
runAnalysis(nodeDb, tempInverted, tempNodeDb, i, numIterations,
- rankOneScore);
+ rankOneScore);
// replace the temporary NodeDb with the output from analysis
LOG.info("Analysis: Installing new link scores");
FSUtils.replace(fs, linkRank, tempRank, true);
LOG.info("Analysis: finished iteration " + (i + 1) + " of "
- + numIterations);
+ + numIterations);
}
// replace the NodeDb in the WebGraph with the final output of analysis
@@ -660,11 +672,11 @@ public class LinkRank
// remove the temporary link rank folder
fs.delete(linkRank, true);
long end = System.currentTimeMillis();
- LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
- public static void main(String[] args)
- throws Exception {
+ public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new LinkRank(), args);
System.exit(res);
}
@@ -672,15 +684,14 @@ public class LinkRank
/**
* Runs the LinkRank tool.
*/
- public int run(String[] args)
- throws Exception {
+ public int run(String[] args) throws Exception {
Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
-
+
OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the web graph db to use");
@@ -701,8 +712,7 @@ public class LinkRank
analyze(new Path(webGraphDb));
return 0;
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.error("LinkAnalysis: " + StringUtils.stringifyException(e));
return -2;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java Thu Jan 29 05:38:59 2015
@@ -44,35 +44,38 @@ public class LoopReader extends Configur
private FileSystem fs;
private MapFile.Reader[] loopReaders;
-
- public LoopReader() { }
-
+
+ public LoopReader() {
+ }
+
public LoopReader(Configuration conf) {
super(conf);
}
/**
- * Prints loopset for a single url. The loopset information will show any
+ * Prints loopset for a single url. The loopset information will show any
* outlink url that eventually forms a link cycle.
*
- * @param webGraphDb The WebGraph to check for loops
- * @param url The url to check.
+ * @param webGraphDb
+ * The WebGraph to check for loops
+ * @param url
+ * The url to check.
*
- * @throws IOException If an error occurs while printing loopset information.
+ * @throws IOException
+ * If an error occurs while printing loopset information.
*/
- public void dumpUrl(Path webGraphDb, String url)
- throws IOException {
+ public void dumpUrl(Path webGraphDb, String url) throws IOException {
// open the readers
fs = FileSystem.get(getConf());
loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
- Loops.LOOPS_DIR), getConf());
+ Loops.LOOPS_DIR), getConf());
// get the loopset for a given url, if any
Text key = new Text(url);
LoopSet loop = new LoopSet();
MapFileOutputFormat.getEntry(loopReaders,
- new HashPartitioner<Text, LoopSet>(), key, loop);
+ new HashPartitioner<Text, LoopSet>(), key, loop);
// print out each loop url in the set
System.out.println(url + ":");
@@ -85,24 +88,23 @@ public class LoopReader extends Configur
}
/**
- * Runs the LoopReader tool. For this tool to work the loops job must have
+ * Runs the LoopReader tool. For this tool to work the loops job must have
* already been run on the corresponding WebGraph.
*/
- public static void main(String[] args)
- throws Exception {
+ public static void main(String[] args) throws Exception {
Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
-
+
OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the webgraphdb to use");
Option webGraphOpts = OptionBuilder.create("webgraphdb");
options.addOption(webGraphOpts);
-
+
OptionBuilder.withArgName("url");
OptionBuilder.hasOptionalArg();
OptionBuilder.withDescription("the url to dump");
@@ -114,7 +116,7 @@ public class LoopReader extends Configur
CommandLine line = parser.parse(options, args);
if (line.hasOption("help") || !line.hasOption("webgraphdb")
- || !line.hasOption("url")) {
+ || !line.hasOption("url")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("WebGraphReader", options);
return;
@@ -125,8 +127,7 @@ public class LoopReader extends Configur
LoopReader reader = new LoopReader(NutchConfiguration.create());
reader.dumpUrl(new Path(webGraphDb), url);
return;
- }
- catch (Exception e) {
+ } catch (Exception e) {
e.printStackTrace();
return;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Thu Jan 29 05:38:59 2015
@@ -76,9 +76,7 @@ import org.apache.nutch.util.TimingUtil;
* rather small. Because of this the Loops job is optional and if it doesn't
* exist then it won't be factored into the LinkRank program.
*/
-public class Loops
- extends Configured
- implements Tool {
+public class Loops extends Configured implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(Loops.class);
public static final String LOOPS_DIR = "loops";
@@ -87,8 +85,7 @@ public class Loops
/**
* A link path or route looking to identify a link cycle.
*/
- public static class Route
- implements Writable {
+ public static class Route implements Writable {
private String outlinkUrl = null;
private String lookingFor = null;
@@ -122,16 +119,14 @@ public class Loops
this.found = found;
}
- public void readFields(DataInput in)
- throws IOException {
+ public void readFields(DataInput in) throws IOException {
outlinkUrl = Text.readString(in);
lookingFor = Text.readString(in);
found = in.readBoolean();
}
- public void write(DataOutput out)
- throws IOException {
+ public void write(DataOutput out) throws IOException {
Text.writeString(out, outlinkUrl);
Text.writeString(out, lookingFor);
out.writeBoolean(found);
@@ -141,8 +136,7 @@ public class Loops
/**
* A set of loops.
*/
- public static class LoopSet
- implements Writable {
+ public static class LoopSet implements Writable {
private Set<String> loopSet = new HashSet<String>();
@@ -158,8 +152,7 @@ public class Loops
this.loopSet = loopSet;
}
- public void readFields(DataInput in)
- throws IOException {
+ public void readFields(DataInput in) throws IOException {
int numNodes = in.readInt();
loopSet = new HashSet<String>();
@@ -169,8 +162,7 @@ public class Loops
}
}
- public void write(DataOutput out)
- throws IOException {
+ public void write(DataOutput out) throws IOException {
int numNodes = (loopSet != null ? loopSet.size() : 0);
out.writeInt(numNodes);
@@ -191,10 +183,9 @@ public class Loops
/**
* Initializes the Loop routes.
*/
- public static class Initializer
- extends Configured
- implements Mapper<Text, Writable, Text, ObjectWritable>,
- Reducer<Text, ObjectWritable, Text, Route> {
+ public static class Initializer extends Configured implements
+ Mapper<Text, Writable, Text, ObjectWritable>,
+ Reducer<Text, ObjectWritable, Text, Route> {
private JobConf conf;
@@ -222,8 +213,8 @@ public class Loops
* Wraps values in ObjectWritable.
*/
public void map(Text key, Writable value,
- OutputCollector<Text, ObjectWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+ throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
@@ -236,8 +227,8 @@ public class Loops
* the Looper job.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
- OutputCollector<Text, Route> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, Route> output, Reporter reporter)
+ throws IOException {
String url = key.toString();
Node node = null;
@@ -248,10 +239,9 @@ public class Loops
ObjectWritable objWrite = values.next();
Object obj = objWrite.get();
if (obj instanceof LinkDatum) {
- outlinkList.add((LinkDatum)obj);
- }
- else if (obj instanceof Node) {
- node = (Node)obj;
+ outlinkList.add((LinkDatum) obj);
+ } else if (obj instanceof Node) {
+ node = (Node) obj;
}
}
@@ -282,10 +272,9 @@ public class Loops
* Follows a route path looking for the start url of the route. If the start
* url is found then the route is a cyclical path.
*/
- public static class Looper
- extends Configured
- implements Mapper<Text, Writable, Text, ObjectWritable>,
- Reducer<Text, ObjectWritable, Text, Route> {
+ public static class Looper extends Configured implements
+ Mapper<Text, Writable, Text, ObjectWritable>,
+ Reducer<Text, ObjectWritable, Text, Route> {
private JobConf conf;
private boolean last = false;
@@ -315,15 +304,14 @@ public class Loops
* Wrap values in ObjectWritable.
*/
public void map(Text key, Writable value,
- OutputCollector<Text, ObjectWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+ throws IOException {
ObjectWritable objWrite = new ObjectWritable();
Writable cloned = null;
if (value instanceof LinkDatum) {
- cloned = new Text(((LinkDatum)value).getUrl());
- }
- else {
+ cloned = new Text(((LinkDatum) value).getUrl());
+ } else {
cloned = WritableUtils.clone(value, conf);
}
objWrite.set(cloned);
@@ -336,8 +324,8 @@ public class Loops
* passes.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
- OutputCollector<Text, Route> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, Route> output, Reporter reporter)
+ throws IOException {
List<Route> routeList = new ArrayList<Route>();
Set<String> outlinkUrls = new LinkedHashSet<String>();
@@ -348,10 +336,9 @@ public class Loops
ObjectWritable next = values.next();
Object value = next.get();
if (value instanceof Route) {
- routeList.add(WritableUtils.clone((Route)value, conf));
- }
- else if (value instanceof Text) {
- String outlinkUrl = ((Text)value).toString();
+ routeList.add(WritableUtils.clone((Route) value, conf));
+ } else if (value instanceof Text) {
+ String outlinkUrl = ((Text) value).toString();
if (!outlinkUrls.contains(outlinkUrl)) {
outlinkUrls.add(outlinkUrl);
}
@@ -375,16 +362,14 @@ public class Loops
routeIt.remove();
if (route.isFound()) {
output.collect(key, route);
- }
- else {
+ } else {
// if the route start url is found, set route to found and collect
String lookingFor = route.getLookingFor();
if (outlinkUrls.contains(lookingFor)) {
route.setFound(true);
output.collect(key, route);
- }
- else if (!last) {
+ } else if (!last) {
// setup for next pass through the loop
for (String outlink : outlinkUrls) {
@@ -402,10 +387,8 @@ public class Loops
/**
* Finishes the Loops job by aggregating and collecting any found routes.
*/
- public static class Finalizer
- extends Configured
- implements Mapper<Text, Route, Text, Route>,
- Reducer<Text, Route, Text, LoopSet> {
+ public static class Finalizer extends Configured implements
+ Mapper<Text, Route, Text, Route>, Reducer<Text, Route, Text, LoopSet> {
private JobConf conf;
@@ -433,8 +416,7 @@ public class Loops
* Maps out any found routes; those will be the link cycles.
*/
public void map(Text key, Route value, OutputCollector<Text, Route> output,
- Reporter reporter)
- throws IOException {
+ Reporter reporter) throws IOException {
if (value.isFound()) {
String lookingFor = value.getLookingFor();
@@ -443,12 +425,12 @@ public class Loops
}
/**
- * Aggregates all found routes for a given start url into a loopset and
+ * Aggregates all found routes for a given start url into a loopset and
* collects the loopset.
*/
public void reduce(Text key, Iterator<Route> values,
- OutputCollector<Text, LoopSet> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, LoopSet> output, Reporter reporter)
+ throws IOException {
LoopSet loops = new LoopSet();
while (values.hasNext()) {
@@ -465,8 +447,7 @@ public class Loops
/**
* Runs the various loop jobs.
*/
- public void findLoops(Path webGraphDb)
- throws IOException {
+ public void findLoops(Path webGraphDb) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -481,7 +462,7 @@ public class Loops
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
Path routes = new Path(webGraphDb, ROUTES_DIR);
Path tempRoute = new Path(webGraphDb, ROUTES_DIR + "-"
- + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
// run the initializer
JobConf init = new NutchJob(conf);
@@ -504,8 +485,7 @@ public class Loops
LOG.info("Loops: installing initializer " + routes);
FSUtils.replace(fs, routes, tempRoute, true);
LOG.info("Loops: finished initializer");
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
@@ -536,8 +516,7 @@ public class Loops
LOG.info("Loops: installing looper " + routes);
FSUtils.replace(fs, routes, tempRoute, true);
LOG.info("Loops: finished looper");
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
@@ -561,17 +540,16 @@ public class Loops
LOG.info("Loops: starting finalizer");
JobClient.runJob(finalizer);
LOG.info("Loops: finished finalizer");
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
long end = System.currentTimeMillis();
- LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
- public static void main(String[] args)
- throws Exception {
+ public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new Loops(), args);
System.exit(res);
}
@@ -579,15 +557,14 @@ public class Loops
/**
* Runs the Loops tool.
*/
- public int run(String[] args)
- throws Exception {
+ public int run(String[] args) throws Exception {
Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
-
+
OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the web graph database to use");
@@ -607,8 +584,7 @@ public class Loops
String webGraphDb = line.getOptionValue("webgraphdb");
findLoops(new Path(webGraphDb));
return 0;
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.error("Loops: " + StringUtils.stringifyException(e));
return -2;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java Thu Jan 29 05:38:59 2015
@@ -25,12 +25,11 @@ import org.apache.nutch.metadata.Metadat
/**
* A class which holds the number of inlinks and outlinks for a given url along
- * with an inlink score from a link analysis program and any metadata.
+ * with an inlink score from a link analysis program and any metadata.
*
* The Node is the core unit of the NodeDb in the WebGraph.
*/
-public class Node
- implements Writable {
+public class Node implements Writable {
private int numInlinks = 0;
private int numOutlinks = 0;
@@ -77,8 +76,7 @@ public class Node
this.metadata = metadata;
}
- public void readFields(DataInput in)
- throws IOException {
+ public void readFields(DataInput in) throws IOException {
numInlinks = in.readInt();
numOutlinks = in.readInt();
@@ -87,8 +85,7 @@ public class Node
metadata.readFields(in);
}
- public void write(DataOutput out)
- throws IOException {
+ public void write(DataOutput out) throws IOException {
out.writeInt(numInlinks);
out.writeInt(numOutlinks);
@@ -98,8 +95,8 @@ public class Node
public String toString() {
return "num inlinks: " + numInlinks + ", num outlinks: " + numOutlinks
- + ", inlink score: " + inlinkScore + ", outlink score: "
- + getOutlinkScore() + ", metadata: " + metadata.toString();
+ + ", inlink score: " + inlinkScore + ", outlink score: "
+ + getOutlinkScore() + ", metadata: " + metadata.toString();
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java Thu Jan 29 05:38:59 2015
@@ -63,26 +63,20 @@ import org.apache.nutch.util.URLUtil;
* have been run. For link analysis score a program such as LinkRank will need
* to have been run which updates the NodeDb of the WebGraph.
*/
-public class NodeDumper
- extends Configured
- implements Tool {
+public class NodeDumper extends Configured implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(NodeDumper.class);
private static enum DumpType {
- INLINKS,
- OUTLINKS,
- SCORES
+ INLINKS, OUTLINKS, SCORES
}
private static enum AggrType {
- SUM,
- MAX
+ SUM, MAX
}
private static enum NameType {
- HOST,
- DOMAIN
+ HOST, DOMAIN
}
/**
@@ -90,10 +84,9 @@ public class NodeDumper
* on the command line, the top urls could be for number of inlinks, for
* number of outlinks, or for link analysis score.
*/
- public static class Sorter
- extends Configured
- implements Mapper<Text, Node, FloatWritable, Text>,
- Reducer<FloatWritable, Text, Text, FloatWritable> {
+ public static class Sorter extends Configured implements
+ Mapper<Text, Node, FloatWritable, Text>,
+ Reducer<FloatWritable, Text, Text, FloatWritable> {
private JobConf conf;
private boolean inlinks = false;
@@ -121,17 +114,15 @@ public class NodeDumper
* score.
*/
public void map(Text key, Node node,
- OutputCollector<FloatWritable, Text> output, Reporter reporter)
- throws IOException {
+ OutputCollector<FloatWritable, Text> output, Reporter reporter)
+ throws IOException {
float number = 0;
if (inlinks) {
number = node.getNumInlinks();
- }
- else if (outlinks) {
+ } else if (outlinks) {
number = node.getNumOutlinks();
- }
- else {
+ } else {
number = node.getInlinkScore();
}
@@ -143,8 +134,8 @@ public class NodeDumper
* Flips and collects the url and numeric sort value.
*/
public void reduce(FloatWritable key, Iterator<Text> values,
- OutputCollector<Text, FloatWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, FloatWritable> output, Reporter reporter)
+ throws IOException {
// take the negative of the negative to get the original value; sometimes 0
// values are a little weird
@@ -162,14 +153,13 @@ public class NodeDumper
}
/**
- * Outputs the hosts or domains with an associated value. This value consists of either
- * the number of inlinks, the number of outlinks or the score. The computed value is then
- * either the sum of all parts or the top value.
+ * Outputs the hosts or domains with an associated value. This value consists
+ * of either the number of inlinks, the number of outlinks or the score. The
+ * computed value is then either the sum of all parts or the top value.
*/
- public static class Dumper
- extends Configured
- implements Mapper<Text, Node, Text, FloatWritable>,
- Reducer<Text, FloatWritable, Text, FloatWritable> {
+ public static class Dumper extends Configured implements
+ Mapper<Text, Node, Text, FloatWritable>,
+ Reducer<Text, FloatWritable, Text, FloatWritable> {
private JobConf conf;
private boolean inlinks = false;
@@ -197,21 +187,19 @@ public class NodeDumper
}
/**
- * Outputs the host or domain as key for this record and numInlinks, numOutlinks
- * or score as the value.
+ * Outputs the host or domain as key for this record and numInlinks,
+ * numOutlinks or score as the value.
*/
public void map(Text key, Node node,
- OutputCollector<Text, FloatWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, FloatWritable> output, Reporter reporter)
+ throws IOException {
float number = 0;
if (inlinks) {
number = node.getNumInlinks();
- }
- else if (outlinks) {
+ } else if (outlinks) {
number = node.getNumOutlinks();
- }
- else {
+ } else {
number = node.getInlinkScore();
}
@@ -228,8 +216,8 @@ public class NodeDumper
* Outputs either the sum or the top value for this record.
*/
public void reduce(Text key, Iterator<FloatWritable> values,
- OutputCollector<Text, FloatWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, FloatWritable> output, Reporter reporter)
+ throws IOException {
long numCollected = 0;
float sumOrMax = 0;
@@ -256,16 +244,19 @@ public class NodeDumper
/**
* Runs the process to dump the top urls out to a text file.
- *
- * @param webGraphDb The WebGraph from which to pull values.
- *
+ *
+ * @param webGraphDb
+ * The WebGraph from which to pull values.
+ *
* @param topN
* @param output
- *
- * @throws IOException If an error occurs while dumping the top values.
+ *
+ * @throws IOException
+ * If an error occurs while dumping the top values.
*/
- public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile)
- throws Exception {
+ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
+ boolean asEff, NameType nameType, AggrType aggrType,
+ boolean asSequenceFile) throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -320,77 +311,76 @@ public class NodeDumper
try {
LOG.info("NodeDumper: running");
JobClient.runJob(dumper);
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
long end = System.currentTimeMillis();
- LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
- public static void main(String[] args)
- throws Exception {
+ public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new NodeDumper(),
- args);
+ args);
System.exit(res);
}
/**
* Runs the node dumper tool.
*/
- public int run(String[] args)
- throws Exception {
+ public int run(String[] args) throws Exception {
Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
-
+
OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the web graph database to use");
Option webGraphDbOpts = OptionBuilder.create("webgraphdb");
options.addOption(webGraphDbOpts);
-
+
OptionBuilder.withArgName("inlinks");
OptionBuilder.withDescription("show highest inlinks");
Option inlinkOpts = OptionBuilder.create("inlinks");
options.addOption(inlinkOpts);
-
+
OptionBuilder.withArgName("outlinks");
OptionBuilder.withDescription("show highest outlinks");
Option outlinkOpts = OptionBuilder.create("outlinks");
options.addOption(outlinkOpts);
-
+
OptionBuilder.withArgName("scores");
OptionBuilder.withDescription("show highest scores");
Option scoreOpts = OptionBuilder.create("scores");
options.addOption(scoreOpts);
-
+
OptionBuilder.withArgName("topn");
OptionBuilder.hasOptionalArg();
OptionBuilder.withDescription("show topN scores");
Option topNOpts = OptionBuilder.create("topn");
options.addOption(topNOpts);
-
+
OptionBuilder.withArgName("output");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the output directory to use");
Option outputOpts = OptionBuilder.create("output");
options.addOption(outputOpts);
-
+
OptionBuilder.withArgName("asEff");
- OptionBuilder.withDescription("Solr ExternalFileField compatible output format");
+ OptionBuilder
+ .withDescription("Solr ExternalFileField compatible output format");
Option effOpts = OptionBuilder.create("asEff");
options.addOption(effOpts);
-
+
OptionBuilder.hasArgs(2);
OptionBuilder.withDescription("group <host|domain> <sum|max>");
Option groupOpts = OptionBuilder.create("group");
options.addOption(groupOpts);
-
+
OptionBuilder.withArgName("asSequenceFile");
OptionBuilder.withDescription("whether to output as a sequencefile");
Option sequenceFileOpts = OptionBuilder.create("asSequenceFile");
@@ -410,32 +400,32 @@ public class NodeDumper
boolean inlinks = line.hasOption("inlinks");
boolean outlinks = line.hasOption("outlinks");
- long topN = (line.hasOption("topn")
- ? Long.parseLong(line.getOptionValue("topn")) : Long.MAX_VALUE);
+ long topN = (line.hasOption("topn") ? Long.parseLong(line
+ .getOptionValue("topn")) : Long.MAX_VALUE);
// get the correct dump type
String output = line.getOptionValue("output");
- DumpType type = (inlinks ? DumpType.INLINKS : outlinks
- ? DumpType.OUTLINKS : DumpType.SCORES);
+ DumpType type = (inlinks ? DumpType.INLINKS
+ : outlinks ? DumpType.OUTLINKS : DumpType.SCORES);
NameType nameType = null;
AggrType aggrType = null;
String[] group = line.getOptionValues("group");
if (group != null && group.length == 2) {
- nameType = (group[0].equals("host") ? NameType.HOST : group[0].equals("domain")
- ? NameType.DOMAIN : null);
- aggrType = (group[1].equals("sum") ? AggrType.SUM : group[1].equals("sum")
- ? AggrType.MAX : null);
+ nameType = (group[0].equals("host") ? NameType.HOST : group[0]
+ .equals("domain") ? NameType.DOMAIN : null);
+ aggrType = (group[1].equals("sum") ? AggrType.SUM : group[1]
+ .equals("sum") ? AggrType.MAX : null);
}
// Use ExternalFileField?
boolean asEff = line.hasOption("asEff");
boolean asSequenceFile = line.hasOption("asSequenceFile");
- dumpNodes(new Path(webGraphDb), type, topN, new Path(output), asEff, nameType, aggrType, asSequenceFile);
+ dumpNodes(new Path(webGraphDb), type, topN, new Path(output), asEff,
+ nameType, aggrType, asSequenceFile);
return 0;
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.error("NodeDumper: " + StringUtils.stringifyException(e));
return -2;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java Thu Jan 29 05:38:59 2015
@@ -37,7 +37,7 @@ import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
/**
- * Reads and prints to system out information for a single node from the NodeDb
+ * Reads and prints to system out information for a single node from the NodeDb
* in the WebGraph.
*/
public class NodeReader extends Configured {
@@ -46,33 +46,35 @@ public class NodeReader extends Configur
private MapFile.Reader[] nodeReaders;
public NodeReader() {
-
+
}
-
+
public NodeReader(Configuration conf) {
super(conf);
}
-
+
/**
* Prints the content of the Node represented by the url to system out.
*
- * @param webGraphDb The webgraph from which to get the node.
- * @param url The url of the node.
+ * @param webGraphDb
+ * The webgraph from which to get the node.
+ * @param url
+ * The url of the node.
*
- * @throws IOException If an error occurs while getting the node.
+ * @throws IOException
+ * If an error occurs while getting the node.
*/
- public void dumpUrl(Path webGraphDb, String url)
- throws IOException {
+ public void dumpUrl(Path webGraphDb, String url) throws IOException {
fs = FileSystem.get(getConf());
nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
- WebGraph.NODE_DIR), getConf());
+ WebGraph.NODE_DIR), getConf());
// open the readers, get the node, print out the info, and close the readers
Text key = new Text(url);
Node node = new Node();
MapFileOutputFormat.getEntry(nodeReaders,
- new HashPartitioner<Text, Node>(), key, node);
+ new HashPartitioner<Text, Node>(), key, node);
System.out.println(url + ":");
System.out.println(" inlink score: " + node.getInlinkScore());
System.out.println(" outlink score: " + node.getOutlinkScore());
@@ -82,25 +84,24 @@ public class NodeReader extends Configur
}
/**
- * Runs the NodeReader tool. The command line arguments must contain a
- * webgraphdb path and a url. The url must match the normalized url that is
+ * Runs the NodeReader tool. The command line arguments must contain a
+ * webgraphdb path and a url. The url must match the normalized url that is
* contained in the NodeDb of the WebGraph.
*/
- public static void main(String[] args)
- throws Exception {
+ public static void main(String[] args) throws Exception {
Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
-
+
OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the webgraphdb to use");
Option webGraphOpts = OptionBuilder.create("webgraphdb");
options.addOption(webGraphOpts);
-
+
OptionBuilder.withArgName("url");
OptionBuilder.hasOptionalArg();
OptionBuilder.withDescription("the url to dump");
@@ -113,7 +114,7 @@ public class NodeReader extends Configur
// command line must take a webgraphdb and a url
CommandLine line = parser.parse(options, args);
if (line.hasOption("help") || !line.hasOption("webgraphdb")
- || !line.hasOption("url")) {
+ || !line.hasOption("url")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("WebGraphReader", options);
return;
@@ -124,10 +125,9 @@ public class NodeReader extends Configur
String url = line.getOptionValue("url");
NodeReader reader = new NodeReader(NutchConfiguration.create());
reader.dumpUrl(new Path(webGraphDb), url);
-
+
return;
- }
- catch (Exception e) {
+ } catch (Exception e) {
e.printStackTrace();
return;
}
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java Thu Jan 29 05:38:59 2015
@@ -58,13 +58,12 @@ import org.apache.nutch.util.TimingUtil;
/**
* Updates the score from the WebGraph node database into the crawl database.
- * Any score that is not in the node database is set to the clear score in the
+ * Any score that is not in the node database is set to the clear score in the
* crawl database.
*/
-public class ScoreUpdater
- extends Configured
- implements Tool, Mapper<Text, Writable, Text, ObjectWritable>,
- Reducer<Text, ObjectWritable, Text, CrawlDatum> {
+public class ScoreUpdater extends Configured implements Tool,
+ Mapper<Text, Writable, Text, ObjectWritable>,
+ Reducer<Text, ObjectWritable, Text, CrawlDatum> {
public static final Logger LOG = LoggerFactory.getLogger(ScoreUpdater.class);
@@ -80,8 +79,8 @@ public class ScoreUpdater
* Changes input into ObjectWritables.
*/
public void map(Text key, Writable value,
- OutputCollector<Text, ObjectWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+ throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
@@ -93,8 +92,8 @@ public class ScoreUpdater
* with a cleared score.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
- OutputCollector<Text, CrawlDatum> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+ throws IOException {
String url = key.toString();
Node node = null;
@@ -106,34 +105,31 @@ public class ScoreUpdater
ObjectWritable next = values.next();
Object value = next.get();
if (value instanceof Node) {
- node = (Node)value;
- }
- else if (value instanceof CrawlDatum) {
- datum = (CrawlDatum)value;
+ node = (Node) value;
+ } else if (value instanceof CrawlDatum) {
+ datum = (CrawlDatum) value;
}
}
- // datum should never be null, could happen if somehow the url was
+ // datum should never be null, could happen if somehow the url was
// normalized or changed after being pulled from the crawldb
if (datum != null) {
if (node != null) {
-
+
// set the inlink score in the nodedb
float inlinkScore = node.getInlinkScore();
datum.setScore(inlinkScore);
LOG.debug(url + ": setting to score " + inlinkScore);
- }
- else {
-
+ } else {
+
// clear out the score in the crawldb
datum.setScore(clearScore);
LOG.debug(url + ": setting to clear score of " + clearScore);
}
output.collect(key, datum);
- }
- else {
+ } else {
LOG.debug(url + ": no datum");
}
}
@@ -142,16 +138,18 @@ public class ScoreUpdater
}
/**
- * Updates the inlink score in the web graph node databsae into the crawl
+ * Updates the inlink score in the web graph node database into the crawl
* database.
*
- * @param crawlDb The crawl database to update
- * @param webGraphDb The webgraph database to use.
+ * @param crawlDb
+ * The crawl database to update
+ * @param webGraphDb
+ * The webgraph database to use.
*
- * @throws IOException If an error occurs while updating the scores.
+ * @throws IOException
+ * If an error occurs while updating the scores.
*/
- public void update(Path crawlDb, Path webGraphDb)
- throws IOException {
+ public void update(Path crawlDb, Path webGraphDb) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -164,8 +162,8 @@ public class ScoreUpdater
LOG.info("Running crawldb update " + crawlDb);
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
- Path newCrawlDb = new Path(crawlDb,
- Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random()
+ .nextInt(Integer.MAX_VALUE)));
// run the updater job outputting to the temp crawl database
JobConf updater = new NutchJob(conf);
@@ -184,10 +182,9 @@ public class ScoreUpdater
try {
JobClient.runJob(updater);
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
-
+
// remove the temp crawldb on error
if (fs.exists(newCrawlDb)) {
fs.delete(newCrawlDb, true);
@@ -200,34 +197,33 @@ public class ScoreUpdater
CrawlDb.install(updater, crawlDb);
long end = System.currentTimeMillis();
- LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
- public static void main(String[] args)
- throws Exception {
+ public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new ScoreUpdater(),
- args);
+ args);
System.exit(res);
}
/**
* Runs the ScoreUpdater tool.
*/
- public int run(String[] args)
- throws Exception {
+ public int run(String[] args) throws Exception {
Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
-
+
OptionBuilder.withArgName("crawldb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the crawldb to use");
Option crawlDbOpts = OptionBuilder.create("crawldb");
options.addOption(crawlDbOpts);
-
+
OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the webgraphdb to use");
@@ -239,7 +235,7 @@ public class ScoreUpdater
CommandLine line = parser.parse(options, args);
if (line.hasOption("help") || !line.hasOption("webgraphdb")
- || !line.hasOption("crawldb")) {
+ || !line.hasOption("crawldb")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("ScoreUpdater", options);
return -1;
@@ -249,8 +245,7 @@ public class ScoreUpdater
String webGraphDb = line.getOptionValue("webgraphdb");
update(new Path(crawlDb), new Path(webGraphDb));
return 0;
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.error("ScoreUpdater: " + StringUtils.stringifyException(e));
return -1;
}