You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/01/31 15:17:28 UTC
svn commit: r1238590 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
Author: markus
Date: Tue Jan 31 14:17:27 2012
New Revision: 1238590
URL: http://svn.apache.org/viewvc?rev=1238590&view=rev
Log:
NUTCH-1256 WebGraph to dump host + score. Most if not all WebGraph options have been added to nutch-default as well.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1238590&r1=1238589&r2=1238590&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jan 31 14:17:27 2012
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1256 WebGraph to dump host + score (markus)
+
* NUTCH-1260 Fetcher should log fetching of redirects (Sebastian Nagel via markus)
* NUTCH-1255 Change ivy.xml of all plugins to remove "nutch.root" property (ferdy)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1238590&r1=1238589&r2=1238590&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Jan 31 14:17:27 2012
@@ -1180,6 +1180,68 @@
</description>
</property>
+<!-- linkrank scoring properties -->
+
+<property>
+ <name>link.ignore.internal.host</name>
+ <value>true</value>
+ <description>Ignore outlinks to the same hostname.</description>
+</property>
+
+<property>
+ <name>link.ignore.internal.domain</name>
+ <value>true</value>
+ <description>Ignore outlinks to the same domain.</description>
+</property>
+
+<property>
+ <name>link.ignore.limit.page</name>
+ <value>true</value>
+ <description>Limit to only a single outlink to the same page.</description>
+</property>
+
+<property>
+ <name>link.ignore.limit.domain</name>
+ <value>true</value>
+ <description>Limit to only a single outlink to the same domain.</description>
+</property>
+
+<property>
+ <name>link.analyze.num.iterations</name>
+ <value>10</value>
+ <description>The number of LinkRank iterations to run.</description>
+</property>
+
+<property>
+ <name>link.analyze.initial.score</name>
+ <value>1.0f</value>
+ <description>The initial score.</description>
+</property>
+
+<property>
+ <name>link.analyze.damping.factor</name>
+ <value>0.85f</value>
+ <description>The damping factor.</description>
+</property>
+
+<property>
+ <name>link.delete.gone</name>
+ <value>false</value>
+ <description>Whether to delete gone pages from the web graph.</description>
+</property>
+
+<property>
+ <name>link.loops.depth</name>
+ <value>2</value>
+ <description>The depth for the loops algorithm.</description>
+</property>
+
+<property>
+ <name>link.score.updater.clear.score</name>
+ <value>0.0f</value>
+ <description>The default score for URLs that are not in the web graph.</description>
+</property>
+
<property>
<name>mapreduce.fileoutputcommitter.marksuccessfuljobs</name>
<value>false</value>
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java?rev=1238590&r1=1238589&r2=1238590&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java Tue Jan 31 14:17:27 2012
@@ -44,6 +44,7 @@ import org.apache.hadoop.mapred.OutputCo
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
@@ -51,6 +52,7 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
/**
* A tools that dumps out the top urls by number of inlinks, number of outlinks,
@@ -73,6 +75,16 @@ public class NodeDumper
SCORES
}
+ private static enum AggrType { // how the values of one host/domain are combined
+ SUM, // add up the values of all nodes in the group
+ MAX // keep only the largest value seen in the group
+ }
+
+ private static enum NameType { // what the node urls are grouped by
+ HOST, // group by the result of URLUtil.getHost
+ DOMAIN // group by the result of URLUtil.getDomainName
+ }
+
/**
* Outputs the top urls sorted in descending order. Depending on the flag set
* on the command line, the top urls could be for number of inlinks, for
@@ -142,7 +154,7 @@ public class NodeDumper
// collect all values, this time with the url as key
while (values.hasNext() && (numCollected < topn)) {
- Text url = (Text)WritableUtils.clone(values.next(), conf);
+ Text url = WritableUtils.clone(values.next(), conf);
output.collect(url, number);
numCollected++;
}
@@ -150,17 +162,110 @@ public class NodeDumper
}
/**
+ * Outputs the hosts or domains with an associated value. This value consists of either
+ * the number of inlinks, the number of outlinks or the score. The computed value is then
+ * either the sum of all parts or the top value. At most "topn" values are aggregated
+ * per host or domain.
+ */
+ public static class Dumper
+ extends Configured
+ implements Mapper<Text, Node, Text, FloatWritable>,
+ Reducer<Text, FloatWritable, Text, FloatWritable> {
+
+ private JobConf conf;
+ private boolean inlinks = false;
+ private boolean outlinks = false;
+ private boolean scores = false;
+ private long topn = Long.MAX_VALUE;
+ private boolean host = false;
+ private boolean domain = false;
+ private boolean sum = false;
+ private boolean max = false;
+
+ /**
+ * Reads the dump type, grouping and aggregation flags from the job configuration.
+ */
+ public void configure(JobConf conf) {
+ this.conf = conf;
+ this.inlinks = conf.getBoolean("inlinks", false);
+ this.outlinks = conf.getBoolean("outlinks", false);
+ this.scores = conf.getBoolean("scores", true);
+ this.topn = conf.getLong("topn", Long.MAX_VALUE);
+ this.host = conf.getBoolean("host", false);
+ this.domain = conf.getBoolean("domain", false);
+ this.sum = conf.getBoolean("sum", false);
+ this.max = conf.getBoolean("max", false);
+ }
+
+ public void close() {
+ }
+
+ /**
+ * Outputs the host or domain as key for this record and numInlinks, numOutlinks
+ * or score as the value. Anything other than "host" groups by domain.
+ */
+ public void map(Text key, Node node,
+ OutputCollector<Text, FloatWritable> output, Reporter reporter)
+ throws IOException {
+
+ float number;
+ if (inlinks) {
+ number = node.getNumInlinks();
+ }
+ else if (outlinks) {
+ number = node.getNumOutlinks();
+ }
+ else {
+ number = node.getInlinkScore();
+ }
+
+ String url = key.toString();
+ String name = host ? URLUtil.getHost(url) : URLUtil.getDomainName(url);
+
+ // NOTE(review): presumably URLUtil yields null for urls it cannot parse - confirm.
+ // Such records are skipped here instead of failing in Text.set.
+ if (name == null) {
+ return;
+ }
+
+ // the input key is reused as the output key
+ key.set(name);
+ output.collect(key, new FloatWritable(number));
+ }
+
+ /**
+ * Outputs either the sum or the top value for this record.
+ */
+ public void reduce(Text key, Iterator<FloatWritable> values,
+ OutputCollector<Text, FloatWritable> output, Reporter reporter)
+ throws IOException {
+
+ long numCollected = 0;
+ float sumOrMax = 0;
+
+ // fold at most topn values of this host or domain into a single number
+ while (values.hasNext() && (numCollected < topn)) {
+ float val = values.next().get();
+
+ if (sum) {
+ sumOrMax += val;
+ } else if (numCollected == 0 || sumOrMax < val) {
+ // seed the maximum with the first value so that a group holding only
+ // negative values is not reported as 0
+ sumOrMax = val;
+ }
+
+ numCollected++;
+ }
+
+ // stays 0 when nothing was collected (topn of 0)
+ output.collect(key, new FloatWritable(sumOrMax));
+ }
+ }
+
+ /**
* Runs the process to dump the top urls out to a text file.
- *
+ *
* @param webGraphDb The WebGraph from which to pull values.
- *
+ *
* @param topN
* @param output
- *
+ *
* @throws IOException If an error occurs while dumping the top values.
*/
- public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff)
- throws IOException {
+ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile)
+ throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -172,18 +277,39 @@ public class NodeDumper
dumper.setJobName("NodeDumper: " + webGraphDb);
FileInputFormat.addInputPath(dumper, nodeDb);
dumper.setInputFormat(SequenceFileInputFormat.class);
- dumper.setMapperClass(Sorter.class);
- dumper.setReducerClass(Sorter.class);
- dumper.setMapOutputKeyClass(FloatWritable.class);
- dumper.setMapOutputValueClass(Text.class);
+
+ if (nameType == null) {
+ dumper.setMapperClass(Sorter.class);
+ dumper.setReducerClass(Sorter.class);
+ dumper.setMapOutputKeyClass(FloatWritable.class);
+ dumper.setMapOutputValueClass(Text.class);
+ } else {
+ dumper.setMapperClass(Dumper.class);
+ dumper.setReducerClass(Dumper.class);
+ dumper.setMapOutputKeyClass(Text.class);
+ dumper.setMapOutputValueClass(FloatWritable.class);
+ }
+
dumper.setOutputKeyClass(Text.class);
dumper.setOutputValueClass(FloatWritable.class);
FileOutputFormat.setOutputPath(dumper, output);
- dumper.setOutputFormat(TextOutputFormat.class);
+
+ if (asSequenceFile) {
+ dumper.setOutputFormat(SequenceFileOutputFormat.class);
+ } else {
+ dumper.setOutputFormat(TextOutputFormat.class);
+ }
+
dumper.setNumReduceTasks(1);
dumper.setBoolean("inlinks", type == DumpType.INLINKS);
dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
dumper.setBoolean("scores", type == DumpType.SCORES);
+
+ dumper.setBoolean("host", nameType == NameType.HOST);
+ dumper.setBoolean("domain", nameType == NameType.DOMAIN);
+ dumper.setBoolean("sum", aggrType == AggrType.SUM);
+ dumper.setBoolean("max", aggrType == AggrType.MAX);
+
dumper.setLong("topn", topN);
// Set equals-sign as separator for Solr's ExternalFileField
@@ -233,6 +359,11 @@ public class NodeDumper
"the output directory to use").create("output");
Option effOpts = OptionBuilder.withArgName("asEff").withDescription(
"Solr ExternalFileField compatible output format").create("asEff");
+ Option groupOpts = OptionBuilder.hasArgs(2).withDescription(
+ "group <host|domain> <sum|max>").create("group");
+ Option sequenceFileOpts = OptionBuilder.withArgName("asSequenceFile").withDescription(
+ "whether to output as a sequencefile").create("asSequenceFile");
+
options.addOption(helpOpts);
options.addOption(webGraphDbOpts);
options.addOption(inlinkOpts);
@@ -241,6 +372,8 @@ public class NodeDumper
options.addOption(topNOpts);
options.addOption(outputOpts);
options.addOption(effOpts);
+ options.addOption(groupOpts);
+ options.addOption(sequenceFileOpts);
CommandLineParser parser = new GnuParser();
try {
@@ -256,6 +389,7 @@ public class NodeDumper
boolean inlinks = line.hasOption("inlinks");
boolean outlinks = line.hasOption("outlinks");
boolean scores = line.hasOption("scores");
+
long topN = (line.hasOption("topn")
? Long.parseLong(line.getOptionValue("topn")) : Long.MAX_VALUE);
@@ -264,10 +398,21 @@ public class NodeDumper
DumpType type = (inlinks ? DumpType.INLINKS : outlinks
? DumpType.OUTLINKS : DumpType.SCORES);
+ // Optional grouping, given as: -group <host|domain> <sum|max>.
+ // Both stay null when the option is absent or does not carry exactly two values.
+ NameType nameType = null;
+ AggrType aggrType = null;
+ String[] group = line.getOptionValues("group");
+ if (group != null && group.length == 2) {
+ nameType = (group[0].equals("host") ? NameType.HOST : group[0].equals("domain")
+ ? NameType.DOMAIN : null);
+ // the second value selects the aggregation; "max" has to map to AggrType.MAX
+ aggrType = (group[1].equals("sum") ? AggrType.SUM : group[1].equals("max")
+ ? AggrType.MAX : null);
+ }
+
// Use ExternalFileField?
boolean asEff = line.hasOption("asEff");
+ boolean asSequenceFile = line.hasOption("asSequenceFile");
- dumpNodes(new Path(webGraphDb), type, topN, new Path(output), asEff);
+ dumpNodes(new Path(webGraphDb), type, topN, new Path(output), asEff, nameType, aggrType, asSequenceFile);
return 0;
}
catch (Exception e) {
@@ -275,4 +420,4 @@ public class NodeDumper
return -2;
}
}
-}
+}
\ No newline at end of file