You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2023/08/22 08:32:35 UTC
[nutch] branch master updated: NUTCH-2993 ScoringDepth plugin to skip depth check based on URL Pattern - apply patch contributed by Markus Jelsma
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new eae3c52a8 NUTCH-2993 ScoringDepth plugin to skip depth check based on URL Pattern - apply patch contributed by Markus Jelsma
eae3c52a8 is described below
commit eae3c52a8140344dff46c448664a2467d631cefc
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Thu Jul 20 13:44:26 2023 +0200
NUTCH-2993 ScoringDepth plugin to skip depth check based on URL Pattern
- apply patch contributed by Markus Jelsma
---
conf/nutch-default.xml | 16 ++++++++++++++
.../nutch/scoring/depth/DepthScoringFilter.java | 25 ++++++++++++++++++++++
2 files changed, 41 insertions(+)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 273cfccc5..379b5ef5d 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1918,6 +1918,22 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>
+<property>
+ <name>scoring.depth.override.pattern</name>
+ <value></value>
+ <description>URLs matching this pattern (a Java regular expression) pass
+ the max depth value configured in scoring.depth.max.override to their
+ outlinks.
+ </description>
+</property>
+
+<property>
+ <name>scoring.depth.max.override</name>
+ <value></value>
+ <description>This max depth value is passed to the outlinks of URLs matching
+ the pattern configured in scoring.depth.override.pattern. If the property
+ is not set, a value of 10 is used.
+ </description>
+</property>
+
<!-- scoring similarity properties
Add scoring-similarity to the list of active plugins
in the parameter 'plugin.includes' in order to use it.
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
index e6aa7a642..6fdf9edd6 100644
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -21,6 +21,8 @@ import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -59,6 +61,8 @@ public class DepthScoringFilter extends Configured implements ScoringFilter {
public static final int DEFAULT_MAX_DEPTH = 1000;
private int defaultMaxDepth;
+ private Pattern depthOverridePattern = null;
+ private int maxDepthOverride = -1;
@Override
public void setConf(Configuration conf) {
@@ -69,6 +73,16 @@ public class DepthScoringFilter extends Configured implements ScoringFilter {
if (defaultMaxDepth <= 0) {
defaultMaxDepth = DEFAULT_MAX_DEPTH;
}
+ String depthOverrideStr = conf.get("scoring.depth.override.pattern");
+ if (depthOverrideStr != null && !depthOverrideStr.isEmpty()) {
+ try {
+ depthOverridePattern = Pattern.compile(depthOverrideStr);
+ maxDepthOverride = conf.getInt("scoring.depth.max.override", 10);
+ } catch (Exception e) {
+ LOG.warn("Unable to compile scoring.depth.override.pattern because: {}",
+ e.getMessage(), e);
+ }
+ }
}
@Override
@@ -93,6 +107,17 @@ public class DepthScoringFilter extends Configured implements ScoringFilter {
curMaxDepth = Integer.parseInt(maxDepthString);
customMaxDepth = new IntWritable(curMaxDepth);
}
+ // If URL matches the pattern, we'll override maxDepth
+ if (depthOverridePattern != null) {
+ Matcher matcher = depthOverridePattern.matcher(fromUrl.toString());
+ if (matcher.find()) {
+ curMaxDepth = maxDepthOverride;
+ customMaxDepth = new IntWritable(maxDepthOverride);
+ } else {
+ curMaxDepth = defaultMaxDepth;
+ customMaxDepth = new IntWritable(curMaxDepth);
+ }
+ }
if (curDepth >= curMaxDepth) {
// depth exceeded - throw away
LOG.info("Depth limit (" + curMaxDepth