You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2022/12/12 15:13:55 UTC
[nutch] branch master updated: NUTCH-2924 Generate maxCount expr evaluated only once
This is an automated email from the ASF dual-hosted git repository.
markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 7d3900450 NUTCH-2924 Generate maxCount expr evaluated only once
7d3900450 is described below
commit 7d390045049036541d2fd94302ab97c8cb3e3cb1
Author: Markus Jelsma <ma...@apache.org>
AuthorDate: Mon Dec 12 16:13:40 2022 +0100
NUTCH-2924 Generate maxCount expr evaluated only once
---
src/java/org/apache/nutch/crawl/Generator.java | 103 +++++++++++--------------
1 file changed, 44 insertions(+), 59 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 0fce6b3b0..8a2f87ba4 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -311,27 +311,30 @@ public class Generator extends NutchTool implements Tool {
private SequenceFile.Reader[] hostdbReaders = null;
private JexlScript maxCountExpr = null;
private JexlScript fetchDelayExpr = null;
-
- public void open() {
- if (conf.get(GENERATOR_HOSTDB) != null) {
- try {
- Path path = new Path(conf.get(GENERATOR_HOSTDB), "current");
- hostdbReaders = SegmentReaderUtil.getReaders(path, conf);
- } catch (IOException e) {
- LOG.error("Error reading HostDB because {}", e.getMessage());
- }
+ private Map<String, HostDatum> hostDatumCache = new HashMap<>();
+
+ public void readHostDb() throws IOException {
+ if (conf.get(GENERATOR_HOSTDB) == null) {
+ return;
}
- }
-
- public void close() {
- if (hostdbReaders != null) {
- try {
- for (int i = 0; i < hostdbReaders.length; i++) {
- hostdbReaders[i].close();
+
+ Path path = new Path(conf.get(GENERATOR_HOSTDB), "current");
+ hostdbReaders = SegmentReaderUtil.getReaders(path, conf);
+
+ try {
+ Text key = new Text();
+ HostDatum value = new HostDatum();
+ for (int i = 0; i < hostdbReaders.length; i++) {
+ while (hostdbReaders[i].next(key, value)) {
+ hostDatumCache.put(key.toString(), (HostDatum)value.clone());
}
- } catch (IOException e) {
- LOG.error("Error closing HostDB because {}", e.getMessage());
}
+ } catch (Exception e) {
+ throw new IOException(e);
+ }
+
+ for (int i = 0; i < hostdbReaders.length; i++) {
+ hostdbReaders[i].close();
}
}
@@ -402,6 +405,8 @@ public class Generator extends NutchTool implements Tool {
fetchDelayExpr = JexlUtil
.parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
}
+
+ readHostDb();
}
@Override
@@ -414,7 +419,7 @@ public class Generator extends NutchTool implements Tool {
public void reduce(FloatWritable key, Iterable<SelectorEntry> values,
Context context) throws IOException, InterruptedException {
- String hostname = null;
+ String currentHostname = null;
HostDatum host = null;
LongWritable variableFetchDelayWritable = null; // in millis
Text variableFetchDelayKey = new Text("_variableFetchDelay_");
@@ -425,33 +430,31 @@ public class Generator extends NutchTool implements Tool {
String urlString = url.toString();
URL u = null;
- // Do this only once per queue
- if (host == null) {
- try {
- hostname = URLUtil.getHost(urlString);
- host = getHostDatum(hostname);
- } catch (Exception e) {
- }
+ String hostname = URLUtil.getHost(urlString);
+ if (!hostname.equals(currentHostname)) {
+ currentHostname = hostname;
+ host = hostDatumCache.get(hostname);
// Got it?
- if (host == null) {
- // Didn't work, prevent future lookups
- host = new HostDatum();
- } else {
+ if (host != null) {
if (maxCountExpr != null) {
- long variableMaxCount = Math
- .round((double) maxCountExpr.execute(createContext(host)));
- LOG.info("Generator: variable maxCount: {} for {}",
- variableMaxCount, hostname);
- maxCount = (int) variableMaxCount;
+ try {
+ long variableMaxCount = Math.round((double)maxCountExpr.execute(createContext(host)));
+ LOG.debug("Generator: variable maxCount: {} for {}", variableMaxCount, hostname);
+ maxCount = (int)variableMaxCount;
+ } catch (Exception e) {
+ LOG.error("Unable to execute variable maxCount expression because: " + e.getMessage(), e);
+ }
}
if (fetchDelayExpr != null) {
- long variableFetchDelay = Math
- .round((double) fetchDelayExpr.execute(createContext(host)));
- LOG.info("Generator: variable fetchDelay: {} ms for {}",
- variableFetchDelay, hostname);
- variableFetchDelayWritable = new LongWritable(variableFetchDelay);
+ try {
+ long variableFetchDelay = Math.round((double)fetchDelayExpr.execute(createContext(host)));
+ LOG.debug("Generator: variable fetchDelay: {} ms for {}", variableFetchDelay, hostname);
+ variableFetchDelayWritable = new LongWritable(variableFetchDelay);
+ } catch (Exception e) {
+ LOG.error("Unable to execute fetch delay expression because: " + e.getMessage(), e);
+ }
}
}
}
@@ -551,24 +554,6 @@ public class Generator extends NutchTool implements Tool {
private String generateFileName(SelectorEntry entry) {
return "fetchlist-" + entry.segnum.toString() + "/part";
}
-
- private HostDatum getHostDatum(String host) throws Exception {
- Text key = new Text();
- HostDatum value = new HostDatum();
-
- open();
- for (int i = 0; i < hostdbReaders.length; i++) {
- while (hostdbReaders[i].next(key, value)) {
- if (host.equals(key.toString())) {
- close();
- return value;
- }
- }
- }
-
- close();
- return null;
- }
}
public static class DecreasingFloatComparator
@@ -1072,7 +1057,7 @@ public class Generator extends NutchTool implements Tool {
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.out.println(
- "Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]");
+ "Usage: Generator <crawldb> <segments_dir> [-hostdb <hostdb>] [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]");
return -1;
}