You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:48 UTC
[32/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java
new file mode 100644
index 0000000..6c1bd9e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Extracts some very basic statistics about domains from the crawldb
+ */
+public class DomainStatistics extends Configured implements Tool {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DomainStatistics.class);
+
+ private static final Text FETCHED_TEXT = new Text("FETCHED");
+ private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED");
+
+ public static enum MyCounter {
+ FETCHED, NOT_FETCHED, EMPTY_RESULT
+ };
+
+ private static final int MODE_HOST = 1;
+ private static final int MODE_DOMAIN = 2;
+ private static final int MODE_SUFFIX = 3;
+ private static final int MODE_TLD = 4;
+
+ private int mode = 0;
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 3) {
+ System.err.println("Usage: DomainStatistics inputDirs outDir mode [numOfReducer]");
+
+ System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
+ System.err.println("\t\t\tE.g.: crawl/crawldb/");
+
+ System.err.println("\toutDir\t\tOutput directory where results should be dumped");
+
+ System.err.println("\tmode\t\tSet statistics gathering mode");
+ System.err.println("\t\t\t\thost\tGather statistics by host");
+ System.err.println("\t\t\t\tdomain\tGather statistics by domain");
+ System.err.println("\t\t\t\tsuffix\tGather statistics by suffix");
+ System.err.println("\t\t\t\ttld\tGather statistics by top level directory");
+
+ System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+
+ return 1;
+ }
+ String inputDir = args[0];
+ String outputDir = args[1];
+ int numOfReducers = 1;
+
+ if (args.length > 3) {
+ numOfReducers = Integer.parseInt(args[3]);
+ }
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("DomainStatistics: starting at " + sdf.format(start));
+
+ int mode = 0;
+ String jobName = "DomainStatistics";
+ if (args[2].equals("host")) {
+ jobName = "Host statistics";
+ mode = MODE_HOST;
+ } else if (args[2].equals("domain")) {
+ jobName = "Domain statistics";
+ mode = MODE_DOMAIN;
+ } else if (args[2].equals("suffix")) {
+ jobName = "Suffix statistics";
+ mode = MODE_SUFFIX;
+ } else if (args[2].equals("tld")) {
+ jobName = "TLD statistics";
+ mode = MODE_TLD;
+ }
+
+ Configuration conf = getConf();
+ conf.setInt("domain.statistics.mode", mode);
+ conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+ Job job = Job.getInstance(conf, jobName);
+ job.setJarByClass(DomainStatistics.class);
+
+ String[] inputDirsSpecs = inputDir.split(",");
+ for (int i = 0; i < inputDirsSpecs.length; i++) {
+ File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
+ FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+ }
+
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ FileOutputFormat.setOutputPath(job, new Path(outputDir));
+ job.setOutputFormatClass(TextOutputFormat.class);
+
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(LongWritable.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(LongWritable.class);
+
+ job.setMapperClass(DomainStatisticsMapper.class);
+ job.setReducerClass(DomainStatisticsReducer.class);
+ job.setCombinerClass(DomainStatisticsCombiner.class);
+ job.setNumReduceTasks(numOfReducers);
+
+ try {
+ job.waitForCompletion(true);
+ } catch (Exception e) {
+ throw e;
+ }
+
+ long end = System.currentTimeMillis();
+ LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
+ return 0;
+ }
+
+ static class DomainStatisticsMapper extends
+ Mapper<Text, CrawlDatum, Text, LongWritable> {
+ int mode = 0;
+
+ public void setup(Context context) {
+ mode = context.getConfiguration().getInt("domain.statistics.mode",
+ MODE_DOMAIN);
+ }
+
+ public void map(Text urlText, CrawlDatum datum, Context context)
+ throws IOException, InterruptedException {
+
+ if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
+ || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+
+ try {
+ URL url = new URL(urlText.toString());
+ String out = null;
+ switch (mode) {
+ case MODE_HOST:
+ out = url.getHost();
+ break;
+ case MODE_DOMAIN:
+ out = URLUtil.getDomainName(url);
+ break;
+ case MODE_SUFFIX:
+ out = URLUtil.getDomainSuffix(url).getDomain();
+ break;
+ case MODE_TLD:
+ out = URLUtil.getTopLevelDomainName(url);
+ break;
+ }
+ if (out.trim().equals("")) {
+ LOG.info("url : " + url);
+ context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
+ }
+
+ context.write(new Text(out), new LongWritable(1));
+ } catch (Exception ex) {
+ }
+
+ context.getCounter(MyCounter.FETCHED).increment(1);
+ context.write(FETCHED_TEXT, new LongWritable(1));
+ } else {
+ context.getCounter(MyCounter.NOT_FETCHED).increment(1);
+ context.write(NOT_FETCHED_TEXT, new LongWritable(1));
+ }
+ }
+ }
+
+ static class DomainStatisticsReducer extends
+ Reducer<Text, LongWritable, LongWritable, Text> {
+ public void reduce(Text key, Iterable<LongWritable> values, Context context)
+ throws IOException, InterruptedException {
+ long total = 0;
+
+ for (LongWritable val : values) {
+ total += val.get();
+ }
+
+ context.write(new LongWritable(total), key);
+ }
+ }
+
+ public static class DomainStatisticsCombiner extends
+ Reducer<Text, LongWritable, Text, LongWritable> {
+ public void reduce(Text key, Iterable<LongWritable> values, Context context)
+ throws IOException, InterruptedException {
+ long total = 0;
+
+ for (LongWritable val : values) {
+ total += val.get();
+ }
+ context.write(key, new LongWritable(total));
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(NutchConfiguration.create(), new DomainStatistics(), args);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
new file mode 100644
index 0000000..d40ebe9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * This class represents the last part of the host name, which is operated by
+ * authorities, not individuals. This information is needed to find the domain
+ * name of a host. The domain name of a host is defined to be the last part
+ * before the domain suffix, w/o subdomain names. As an example the domain name
+ * of <br>
+ * <code> http://lucene.apache.org/
+ * </code><br>
+ * is <code> apache.org</code> <br>
+ * This class holds three fields: the <strong>domain</strong> field represents
+ * the suffix (such as "co.uk"), <strong>boost</strong> is a float for boosting
+ * the score of URLs with this suffix, and the <strong>status</strong> field
+ * represents the domain's status. Instances are immutable.
+ * <p>
+ * For more information please see conf/domain-suffixes.xml.
+ *
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ * @see TopLevelDomain
+ */
+public class DomainSuffix {
+
+  /**
+   * Enumeration of the status of the tld. Please see domain-suffixes.xml.
+   */
+  public enum Status {
+    INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
+  }
+
+  /** Boost used by {@link #DomainSuffix(String)}. */
+  public static final float DEFAULT_BOOST = 1.0f;
+  /** Status used by {@link #DomainSuffix(String)}. */
+  public static final Status DEFAULT_STATUS = Status.IN_USE;
+
+  // Set once in the constructor and never changed afterwards.
+  private final String domain;
+  private final Status status;
+  private final float boost;
+
+  /**
+   * Creates a suffix with an explicit status and boost.
+   *
+   * @param domain
+   *          the suffix itself, such as "co.uk"
+   * @param status
+   *          status of the suffix
+   * @param boost
+   *          score boost for URLs with this suffix
+   */
+  public DomainSuffix(String domain, Status status, float boost) {
+    this.domain = domain;
+    this.status = status;
+    this.boost = boost;
+  }
+
+  /**
+   * Creates a suffix with {@link #DEFAULT_STATUS} and {@link #DEFAULT_BOOST}.
+   *
+   * @param domain
+   *          the suffix itself, such as "co.uk"
+   */
+  public DomainSuffix(String domain) {
+    this(domain, DEFAULT_STATUS, DEFAULT_BOOST);
+  }
+
+  /** @return the suffix string this object was created with */
+  public String getDomain() {
+    return domain;
+  }
+
+  /** @return the status of this suffix */
+  public Status getStatus() {
+    return status;
+  }
+
+  /** @return the boost of this suffix */
+  public float getBoost() {
+    return boost;
+  }
+
+  /** @return the suffix string, same as {@link #getDomain()} */
+  @Override
+  public String toString() {
+    return domain;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
new file mode 100644
index 0000000..765457e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.InputStream;
+import java.util.HashMap;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Storage class for <code>DomainSuffix</code> objects, keyed by their suffix
+ * string. Note: this class is a singleton; the entries are loaded once from
+ * the classpath resource <code>domain-suffixes.xml</code>.
+ *
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class DomainSuffixes {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainSuffixes.class);
+
+  /** Name of the classpath resource holding the suffix definitions. */
+  private static final String SUFFIXES_FILE = "domain-suffixes.xml";
+
+  private final HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>();
+
+  private static DomainSuffixes instance;
+
+  /**
+   * Private ctor: loads the suffix definitions. Loading problems are logged
+   * and leave the instance with whatever entries were read so far.
+   */
+  private DomainSuffixes() {
+    InputStream input = this.getClass().getClassLoader()
+        .getResourceAsStream(SUFFIXES_FILE);
+    if (input == null) {
+      // getResourceAsStream returns null when the resource is missing; say
+      // so explicitly instead of handing a null stream to the XML parser.
+      LOG.warn("Resource " + SUFFIXES_FILE
+          + " not found on the classpath, no domain suffixes loaded");
+      return;
+    }
+    try {
+      new DomainSuffixesReader().read(this, input);
+    } catch (Exception ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+    } finally {
+      try {
+        input.close();
+      } catch (java.io.IOException ex) {
+        LOG.warn("Could not close " + SUFFIXES_FILE + ": " + ex);
+      }
+    }
+  }
+
+  /**
+   * Singleton instance, lazy instantiation. Synchronized so that concurrent
+   * first callers share one fully loaded instance.
+   *
+   * @return returns the domain suffix instance
+   */
+  public static synchronized DomainSuffixes getInstance() {
+    if (instance == null) {
+      instance = new DomainSuffixes();
+    }
+    return instance;
+  }
+
+  /** Registers a suffix under its {@link DomainSuffix#getDomain()} string. */
+  void addDomainSuffix(DomainSuffix tld) {
+    domains.put(tld.getDomain(), tld);
+  }
+
+  /** return whether the extension is a registered domain entry */
+  public boolean isDomainSuffix(String extension) {
+    return domains.containsKey(extension);
+  }
+
+  /**
+   * Return the {@link DomainSuffix} object for the extension, if extension is a
+   * top level domain returned object will be an instance of
+   * {@link TopLevelDomain}
+   *
+   * @param extension
+   *          of the domain
+   * @return the registered entry, or <code>null</code> if there is none
+   */
+  public DomainSuffix get(String extension) {
+    return domains.get(extension);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
new file mode 100644
index 0000000..a2a60e2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.domain.DomainSuffix.Status;
+import org.apache.nutch.util.domain.TopLevelDomain.Type;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * For parsing xml files containing domain suffix definitions. Parsed xml files
+ * should validate against <code>domain-suffixes.xsd</code>
+ *
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+class DomainSuffixesReader {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainSuffixesReader.class);
+
+  /**
+   * Parses the given stream and adds every top level domain and suffix found
+   * to <code>tldEntries</code>.
+   *
+   * @param tldEntries
+   *          container receiving the parsed entries
+   * @param input
+   *          xml document with a <code>domains</code> root element
+   * @throws IOException
+   *           if the document cannot be parsed or a mandatory section is
+   *           missing
+   */
+  void read(DomainSuffixes tldEntries, InputStream input) throws IOException {
+    try {
+
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      factory.setIgnoringComments(true);
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(new InputSource(input));
+
+      Element root = document.getDocumentElement();
+
+      if (root == null || !root.getTagName().equals("domains")) {
+        throw new IOException("xml file is not valid");
+      }
+
+      Element tlds = requiredSection(root, "tlds");
+
+      // read tlds
+      readITLDs(tldEntries, requiredSection(tlds, "itlds"));
+      readGTLDs(tldEntries, requiredSection(tlds, "gtlds"));
+      readCCTLDs(tldEntries, requiredSection(tlds, "cctlds"));
+
+      readSuffixes(tldEntries, requiredSection(root, "suffixes"));
+    } catch (ParserConfigurationException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      // keep the original exception as cause so the stack trace is not lost
+      throw new IOException(ex.getMessage(), ex);
+    } catch (SAXException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw new IOException(ex.getMessage(), ex);
+    }
+  }
+
+  /**
+   * Returns the first descendant element with the given tag name, or fails
+   * with a descriptive error instead of a later NullPointerException.
+   */
+  private static Element requiredSection(Element parent, String tagName)
+      throws IOException {
+    Element section = (Element) parent.getElementsByTagName(tagName).item(0);
+    if (section == null) {
+      throw new IOException("xml file is not valid: missing <" + tagName
+          + "> element");
+    }
+    return section;
+  }
+
+  /** Adds every <code>tld</code> below <code>el</code> as infrastructure TLD. */
+  void readITLDs(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
+          Type.INFRASTRUCTURE));
+    }
+  }
+
+  /** Adds every <code>tld</code> below <code>el</code> as generic TLD. */
+  void readGTLDs(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
+          Type.GENERIC));
+    }
+  }
+
+  /** Adds every <code>tld</code> below <code>el</code> as country code TLD. */
+  void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readCCTLD((Element) children.item(i)));
+    }
+  }
+
+  /** Builds a TLD of the given type from a <code>tld</code> element. */
+  TopLevelDomain readGTLD(Element el, Type type) {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    return new TopLevelDomain(domain, type, status, boost);
+  }
+
+  /**
+   * Builds a country code TLD from a <code>tld</code> element.
+   *
+   * @throws IOException
+   *           if the element has no <code>country</code> child
+   */
+  TopLevelDomain readCCTLD(Element el) throws IOException {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    String countryName = readCountryName(el);
+    return new TopLevelDomain(domain, status, boost, countryName);
+  }
+
+  /** read optional field status */
+  Status readStatus(Element el) {
+    NodeList list = el.getElementsByTagName("status");
+    if (list == null || list.getLength() == 0) {
+      return DomainSuffix.DEFAULT_STATUS;
+    }
+    // getTextContent() also copes with an empty element, where
+    // getFirstChild() would be null
+    return Status.valueOf(list.item(0).getTextContent().trim());
+  }
+
+  /** read optional field boost */
+  float readBoost(Element el) {
+    NodeList list = el.getElementsByTagName("boost");
+    if (list == null || list.getLength() == 0) {
+      return DomainSuffix.DEFAULT_BOOST;
+    }
+    return Float.parseFloat(list.item(0).getTextContent().trim());
+  }
+
+  /**
+   * read field countryname
+   *
+   * @throws IOException
+   *           if the element has no <code>country</code> child
+   */
+  String readCountryName(Element el) throws IOException {
+    NodeList list = el.getElementsByTagName("country");
+    if (list == null || list.getLength() == 0) {
+      throw new IOException("Country name should be given");
+    }
+    // The node is an Element, whose getNodeValue() is always null; the name
+    // is the element's text content.
+    return list.item(0).getTextContent().trim();
+  }
+
+  /** Adds every <code>suffix</code> element below <code>el</code>. */
+  void readSuffixes(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("suffix");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readSuffix((Element) children.item(i)));
+    }
+  }
+
+  /** Builds a plain domain suffix from a <code>suffix</code> element. */
+  DomainSuffix readSuffix(Element el) {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    return new DomainSuffix(domain, status, boost);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java
new file mode 100644
index 0000000..f442d1f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * (From wikipedia) A top-level domain (TLD) is the last part of an Internet
+ * domain name; that is, the letters which follow the final dot of any domain
+ * name. For example, in the domain name <code>www.website.com</code>, the
+ * top-level domain is <code>com</code>.
+ *
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ *
+ * @see <a href="http://www.iana.org/"> iana.org</a>
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">
+ *      Top-level_domain</a>
+ */
+public class TopLevelDomain extends DomainSuffix {
+
+  /** Kind of top-level domain. */
+  public enum Type {
+    INFRASTRUCTURE, GENERIC, COUNTRY
+  }
+
+  // Set once in the constructors and never changed afterwards.
+  private final Type type;
+  private final String countryName;
+
+  /**
+   * Creates a TLD of the given type without a country name.
+   *
+   * @param domain
+   *          the TLD string
+   * @param type
+   *          kind of the TLD
+   * @param status
+   *          status of the TLD
+   * @param boost
+   *          score boost for URLs below this TLD
+   */
+  public TopLevelDomain(String domain, Type type, Status status, float boost) {
+    super(domain, status, boost);
+    this.type = type;
+    this.countryName = null;
+  }
+
+  /**
+   * Creates a TLD of type {@link Type#COUNTRY}.
+   *
+   * @param domain
+   *          the TLD string
+   * @param status
+   *          status of the TLD
+   * @param boost
+   *          score boost for URLs below this TLD
+   * @param countryName
+   *          name of the country the TLD belongs to
+   */
+  public TopLevelDomain(String domain, Status status, float boost,
+      String countryName) {
+    super(domain, status, boost);
+    this.type = Type.COUNTRY;
+    this.countryName = countryName;
+  }
+
+  /** @return the kind of this TLD */
+  public Type getType() {
+    return type;
+  }
+
+  /**
+   * Returns the country name if TLD is Country Code TLD
+   *
+   * @return country name or null
+   */
+  public String getCountryName() {
+    return countryName;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html b/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html
new file mode 100644
index 0000000..49e0e6a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html
@@ -0,0 +1,14 @@
+<html>
+<body>
+<h2>Classes for domain name analysis.</h2>
+
+For more information, please refer to the following URLs:
+<ul>
+<li><a href="http://en.wikipedia.org/wiki/DNS">http://en.wikipedia.org/wiki/DNS</a></li>
+<li><a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a></li>
+<li><a href="http://wiki.mozilla.org/TLD_List">http://wiki.mozilla.org/TLD_List</a></li>
+<li><a href="http://publicsuffix.org/">http://publicsuffix.org/</a></li>
+</ul>
+
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/package-info.java b/nutch-core/src/main/java/org/apache/nutch/util/package-info.java
new file mode 100644
index 0000000..053dbc1
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Miscellaneous utility classes.
+ */
+package org.apache.nutch.util;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java
new file mode 100644
index 0000000..6fd2396
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui;
+
+import org.apache.nutch.webui.pages.DashboardPage;
+import org.apache.nutch.webui.pages.assets.NutchUiCssReference;
+import org.apache.wicket.markup.html.WebPage;
+import org.apache.wicket.protocol.http.WebApplication;
+import org.apache.wicket.spring.injection.annot.SpringComponentInjector;
+import org.springframework.beans.BeansException;
+import org.springframework.context.ApplicationContext;
+import org.springframework.context.ApplicationContextAware;
+import org.springframework.stereotype.Component;
+
+import de.agilecoders.wicket.core.Bootstrap;
+import de.agilecoders.wicket.core.markup.html.themes.bootstrap.BootstrapCssReference;
+import de.agilecoders.wicket.core.settings.BootstrapSettings;
+import de.agilecoders.wicket.core.settings.SingleThemeProvider;
+import de.agilecoders.wicket.core.settings.Theme;
+import de.agilecoders.wicket.extensions.markup.html.bootstrap.icon.FontAwesomeCssReference;
+
+@Component
+public class NutchUiApplication extends WebApplication implements
+    ApplicationContextAware {
+  private static final String THEME_NAME = "bootstrap";
+  private ApplicationContext context;
+
+  /**
+   * The dashboard is the landing page of the web UI.
+   *
+   * @see org.apache.wicket.Application#getHomePage()
+   */
+  @Override
+  public Class<? extends WebPage> getHomePage() {
+    return DashboardPage.class;
+  }
+
+  /**
+   * Installs Bootstrap with the single UI theme and registers the Spring
+   * component injector backed by the application context handed in through
+   * {@link #setApplicationContext(ApplicationContext)}.
+   *
+   * @see org.apache.wicket.Application#init()
+   */
+  @Override
+  public void init() {
+    super.init();
+
+    BootstrapSettings bootstrapSettings = new BootstrapSettings();
+    Bootstrap.install(this, bootstrapSettings);
+    // the theme provider is set after the settings have been installed
+    bootstrapSettings.setThemeProvider(new SingleThemeProvider(createTheme()));
+
+    getComponentInstantiationListeners().add(
+        new SpringComponentInjector(this, context));
+  }
+
+  /** Builds the theme from the Bootstrap, FontAwesome and Nutch UI css. */
+  private Theme createTheme() {
+    return new Theme(THEME_NAME, BootstrapCssReference.instance(),
+        FontAwesomeCssReference.instance(), NutchUiCssReference.instance());
+  }
+
+  /** Stores the Spring context for later use by {@link #init()}. */
+  @Override
+  public void setApplicationContext(ApplicationContext applicationContext)
+      throws BeansException {
+    context = applicationContext;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
new file mode 100644
index 0000000..4c62939
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
@@ -0,0 +1,63 @@
+#############################################################################
+#Licensed to the Apache Software Foundation (ASF) under one or more
+#contributor license agreements. See the NOTICE file distributed with
+#this work for additional information regarding copyright ownership.
+#The ASF licenses this file to You under the Apache License, Version 2.0
+#(the "License"); you may not use this file except in compliance with
+#the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#############################################################################
+
+navbar.menu.dashboard = Dashboard
+navbar.menu.statistics = Statistics
+navbar.menu.instances = Instances
+navbar.menu.settings = Settings
+navbar.menu.crawls = Crawls
+navbar.menu.scheduling = Scheduling
+navbar.menu.search = Search
+navbar.menu.url = URLs upload
+navbar.menu.seedLists = Seed lists
+
+page.header.seedList = Seed list
+
+navbar.userMenu.settings = Settings
+navbar.userMenu.logout = Log out
+
+menu.settings=Settings
+menu.instances=Instances
+
+connected=Connected
+disconnected=Disconnected
+
+##ENUMS
+ConnectionStatus.CONNECTING=Connecting
+ConnectionStatus.CONNECTED=Connected
+ConnectionStatus.DISCONNECTED=Disconnected
+
+CrawlStatus.NEW=New
+CrawlStatus.ERROR=Error
+CrawlStatus.CRAWLING=Crawling
+CrawlStatus.FINISHED=Finished
+
+instances=Instances
+instances.header.name=Instance name
+instances.header.hostname=Hostname
+instances.header.status=Status
+instances.header.username=Username
+instances.label.name=Instance name
+instances.label.hostname=Hostname
+instances.label.port=Port
+instances.label.username=Username
+instances.label.password=Password
+instances.buttons.addInstance=Add instance
+
+settings=Settings
+settings.header.name = Name
+settings.header.value = Value
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java
new file mode 100644
index 0000000..d534b8f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.wicket.protocol.http.WicketFilter;
+import org.apache.wicket.spring.SpringWebApplicationFactory;
+import org.mortbay.jetty.Handler;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.servlet.Context;
+import org.mortbay.jetty.servlet.DefaultServlet;
+import org.mortbay.jetty.servlet.FilterHolder;
+import org.springframework.web.context.ContextLoaderListener;
+import org.springframework.web.context.WebApplicationContext;
+import org.springframework.web.context.request.RequestContextListener;
+import org.springframework.web.context.support.AnnotationConfigWebApplicationContext;
+
+/**
+ * Command-line entry point that starts the Nutch web UI (Wicket + Spring)
+ * inside an embedded Jetty server.
+ */
+public class NutchUiServer {
+  private static final String APP_FACTORY_NAME = SpringWebApplicationFactory.class
+      .getName();
+  private static final String CONFIG_LOCATION = "org.apache.nutch.webui";
+  private static final String CMD_PORT = "port";
+  private static Integer port = 8080;
+
+  /**
+   * Parses the command line and starts the server.
+   *
+   * @param args
+   *          command-line arguments; supports {@code -help} and
+   *          {@code -port <port number>}
+   * @throws Exception
+   *           if the port value is not a number or the server fails to start
+   */
+  public static void main(String[] args) throws Exception {
+    CommandLineParser parser = new GnuParser();
+    Options options = createWebAppOptions();
+    HelpFormatter formatter = new HelpFormatter();
+    CommandLine commandLine;
+    try {
+      commandLine = parser.parse(options, args);
+    } catch (Exception e) {
+      // Nothing can be started without a parsed command line: show the usage
+      // and the reason, then stop instead of dereferencing a null result.
+      formatter.printHelp("NutchUiServer", options, true);
+      System.err.println(StringUtils.stringifyException(e));
+      return;
+    }
+
+    if (commandLine.hasOption("help")) {
+      formatter.printHelp("NutchUiServer", options, true);
+      return;
+    }
+    // The port argument is optional, so the option may be present without a
+    // value; in that case the default port is kept.
+    String portValue = commandLine.getOptionValue(CMD_PORT);
+    if (portValue != null) {
+      port = Integer.parseInt(portValue);
+    }
+    startServer();
+  }
+
+  /**
+   * Configures Jetty with the Spring context listeners and the Wicket filter,
+   * starts it and blocks until the server thread terminates.
+   */
+  private static void startServer() throws Exception {
+    Server server = new Server(port);
+    Context context = new Context(server, "/", Context.SESSIONS);
+    context.addServlet(DefaultServlet.class, "/*");
+
+    context.addEventListener(new ContextLoaderListener(getContext()));
+    context.addEventListener(new RequestContextListener());
+
+    WicketFilter filter = new WicketFilter();
+    filter.setFilterPath("/");
+    FilterHolder holder = new FilterHolder(filter);
+    holder.setInitParameter("applicationFactoryClassName", APP_FACTORY_NAME);
+    context.addFilter(holder, "/*", Handler.DEFAULT);
+
+    server.setHandler(context);
+    server.start();
+    server.join();
+  }
+
+  /**
+   * Creates the Spring web application context configured with the
+   * {@code org.apache.nutch.webui} config location.
+   */
+  private static WebApplicationContext getContext() {
+    AnnotationConfigWebApplicationContext context = new AnnotationConfigWebApplicationContext();
+    context.setConfigLocation(CONFIG_LOCATION);
+    return context;
+  }
+
+  /**
+   * Builds the supported command-line options: {@code help} and {@code port}.
+   */
+  private static Options createWebAppOptions() {
+    Options options = new Options();
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    OptionBuilder.withDescription("Port to run the WebApplication on.");
+    OptionBuilder.hasOptionalArg();
+    OptionBuilder.withArgName("port number");
+    options.addOption(OptionBuilder.create(CMD_PORT));
+    options.addOption(helpOpt);
+    return options;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java
new file mode 100644
index 0000000..3f8887d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client;
+
+import java.util.Map;
+
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.JobConfig;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.apache.nutch.webui.client.model.NutchStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.model.SeedList;
+
+/**
+ * Client-side view of a single Nutch server used by the web UI.
+ */
+public interface NutchClient {
+
+  /**
+   * @return the Nutch instance this client talks to
+   */
+  NutchInstance getNutchInstance();
+
+  /**
+   * @return the status reported by the Nutch server
+   */
+  NutchStatus getNutchStatus();
+
+  /**
+   * @return the current state of the connection to the Nutch server
+   */
+  ConnectionStatus getConnectionStatus();
+
+  /**
+   * Submits a job to the Nutch server.
+   *
+   * @param jobConfig
+   *          description of the job to run
+   * @return the identifier of the created job
+   */
+  String executeJob(JobConfig jobConfig);
+
+  /**
+   * @param jobId
+   *          identifier of a previously submitted job
+   * @return the information the server holds about that job
+   */
+  JobInfo getJobInfo(String jobId);
+
+  /**
+   * @param config
+   *          identifier of the configuration to fetch
+   * @return the configuration as key/value pairs
+   */
+  Map<String, String> getNutchConfig(String config);
+
+  /**
+   * Creates a seed list on the server.
+   *
+   * @param seedList
+   *          the seed list to create
+   * @return the location of the seed directory
+   */
+  String createSeed(SeedList seedList);
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java
new file mode 100644
index 0000000..32da00e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client;
+
+import java.util.concurrent.ExecutionException;
+
+import org.apache.nutch.webui.client.impl.NutchClientImpl;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.springframework.stereotype.Component;
+
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
+
+@Component
+public class NutchClientFactory {
+ private LoadingCache<NutchInstance, NutchClient> cache;
+
+ public NutchClientFactory() {
+ cache = CacheBuilder.newBuilder().build(new NutchClientCacheLoader());
+ }
+
+ public NutchClient getClient(NutchInstance instance) {
+ try {
+ return cache.get(instance);
+ } catch (ExecutionException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ private static class NutchClientCacheLoader extends
+ CacheLoader<NutchInstance, NutchClient> {
+ @Override
+ public NutchClient load(NutchInstance key) throws Exception {
+ return new NutchClientImpl(key);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
new file mode 100644
index 0000000..2482c06
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import java.util.List;
+
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.apache.nutch.webui.client.model.JobInfo.State;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+
+/**
+ * This class implements crawl cycle as in crawl script
+ *
+ * @author feodor
+ *
+ */
+public class CrawlingCycle {
+ private Logger log = LoggerFactory.getLogger(CrawlingCycle.class);
+
+ private CrawlingCycleListener listener;
+ private RemoteCommandExecutor executor;
+ private Crawl crawl;
+
+ private List<RemoteCommand> remoteCommands;
+ private List<RemoteCommand> executedCommands = Lists.newArrayList();
+
+ public CrawlingCycle(CrawlingCycleListener listener,
+ RemoteCommandExecutor executor, Crawl crawl, List<RemoteCommand> commands) {
+ this.listener = listener;
+ this.executor = executor;
+ this.crawl = crawl;
+ this.remoteCommands = commands;
+ }
+
+ public synchronized void executeCrawlCycle() {
+ listener.crawlingStarted(crawl);
+
+ for (RemoteCommand command : remoteCommands) {
+ JobInfo jobInfo = executor.executeRemoteJob(command);
+ command.setJobInfo(jobInfo);
+
+ log.info("Executed remote command data: {}", command);
+
+ if (jobInfo.getState() == State.FAILED) {
+ listener.onCrawlError(crawl, jobInfo.getMsg());
+ return;
+ }
+
+ executedCommands.add(command);
+ listener.commandExecuted(crawl, command, calculateProgress());
+ }
+ listener.crawlingFinished(crawl);
+ }
+
+ private int calculateProgress() {
+ if (CollectionUtils.isEmpty(remoteCommands)) {
+ return 0;
+ }
+ return (int) ((float) executedCommands.size()
+ / (float) remoteCommands.size() * 100);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
new file mode 100644
index 0000000..c2abde5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import org.apache.nutch.webui.client.model.Crawl;
+
+/**
+ * Callbacks through which {@link CrawlingCycle} reports the progress of a
+ * crawl.
+ */
+public interface CrawlingCycleListener {
+
+ /**
+  * Invoked by {@link CrawlingCycle} before the first command is executed.
+  */
+ void crawlingStarted(Crawl crawl);
+
+ /**
+  * Invoked by {@link CrawlingCycle} when a job ends in the failed state;
+  * {@code msg} is the message carried by the failed job's info.
+  */
+ void onCrawlError(Crawl crawl, String msg);
+
+ /**
+  * Invoked by {@link CrawlingCycle} after each command that did not fail;
+  * {@code progress} is the percentage of commands executed so far.
+  */
+ void commandExecuted(Crawl crawl, RemoteCommand command, int progress);
+
+ /**
+  * Invoked by {@link CrawlingCycle} once every command has been executed
+  * without a failure.
+  */
+ void crawlingFinished(Crawl crawl);
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
new file mode 100644
index 0000000..1a577f9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import static javax.ws.rs.core.MediaType.APPLICATION_JSON;
+
+import java.util.Map;
+
+import org.apache.nutch.webui.client.NutchClient;
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.JobConfig;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.apache.nutch.webui.client.model.NutchStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.model.SeedList;
+
+import com.sun.jersey.api.client.Client;
+import com.sun.jersey.api.client.WebResource;
+import com.sun.jersey.api.client.config.ClientConfig;
+import com.sun.jersey.api.client.config.DefaultClientConfig;
+import com.sun.jersey.api.json.JSONConfiguration;
+
+/**
+ * {@link NutchClient} that talks JSON over HTTP to the REST endpoints of a
+ * Nutch instance.
+ */
+public class NutchClientImpl implements NutchClient {
+  private Client client;
+  private WebResource nutchResource;
+  private NutchInstance instance;
+
+  public NutchClientImpl(NutchInstance instance) {
+    this.instance = instance;
+    createClient();
+  }
+
+  /**
+   * Creates the HTTP client with POJO/JSON mapping enabled and points the
+   * root resource at the URL of the instance.
+   */
+  public void createClient() {
+    ClientConfig jsonConfig = new DefaultClientConfig();
+    jsonConfig.getFeatures().put(JSONConfiguration.FEATURE_POJO_MAPPING, true);
+    client = Client.create(jsonConfig);
+    nutchResource = client.resource(instance.getUrl());
+  }
+
+  @Override
+  public NutchInstance getNutchInstance() {
+    return instance;
+  }
+
+  @Override
+  public NutchStatus getNutchStatus() {
+    WebResource admin = nutchResource.path("/admin");
+    return admin.type(APPLICATION_JSON).get(NutchStatus.class);
+  }
+
+  @Override
+  public ConnectionStatus getConnectionStatus() {
+    // The status request only serves as a probe; its result is not used.
+    getNutchStatus();
+    // TODO implement disconnected status
+    return ConnectionStatus.CONNECTED;
+  }
+
+  @Override
+  public String executeJob(JobConfig jobConfig) {
+    WebResource jobCreation = nutchResource.path("/job/create");
+    JobInfo created = jobCreation.type(APPLICATION_JSON).post(JobInfo.class,
+        jobConfig);
+    return created.getId();
+  }
+
+  @Override
+  public JobInfo getJobInfo(String jobId) {
+    WebResource job = nutchResource.path("/job/" + jobId);
+    return job.type(APPLICATION_JSON).get(JobInfo.class);
+  }
+
+  @SuppressWarnings("unchecked")
+  @Override
+  public Map<String, String> getNutchConfig(String config) {
+    WebResource configuration = nutchResource.path("/config/" + config);
+    return configuration.type(APPLICATION_JSON).get(Map.class);
+  }
+
+  @Override
+  public String createSeed(SeedList seedList) {
+    WebResource seedCreation = nutchResource.path("/seed/create");
+    return seedCreation.type(APPLICATION_JSON).post(String.class, seedList);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
new file mode 100644
index 0000000..ea19a8a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import java.io.Serializable;
+import java.text.MessageFormat;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.nutch.webui.client.model.JobConfig;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.joda.time.Duration;
+
+/**
+ * A job to run on a remote Nutch server: its configuration, the most recent
+ * job info received for it, and the time allowed for it to complete.
+ */
+public class RemoteCommand implements Serializable {
+  private JobConfig jobConfig;
+  private JobInfo jobInfo = new JobInfo();
+  private Duration timeout;
+
+  /**
+   * Use {@link RemoteCommandBuilder} instead
+   */
+  @SuppressWarnings("unused")
+  private RemoteCommand() {
+  }
+
+  public RemoteCommand(JobConfig jobConfig) {
+    this.jobConfig = jobConfig;
+  }
+
+  public JobConfig getJobConfig() {
+    return jobConfig;
+  }
+
+  public void setJobConfig(JobConfig jobConfig) {
+    this.jobConfig = jobConfig;
+  }
+
+  public JobInfo getJobInfo() {
+    return jobInfo;
+  }
+
+  public void setJobInfo(JobInfo jobInfo) {
+    this.jobInfo = jobInfo;
+  }
+
+  public Duration getTimeout() {
+    return timeout;
+  }
+
+  public void setTimeout(Duration timeout) {
+    this.timeout = timeout;
+  }
+
+  /**
+   * @return the job type followed by the job state, e.g. for logging; a
+   *         missing job configuration or job info is tolerated
+   */
+  @Override
+  public String toString() {
+    String statusInfo = StringUtils.EMPTY;
+    if (jobInfo != null) {
+      statusInfo = MessageFormat.format("{0}", jobInfo.getState());
+    }
+    // jobConfig is null after the no-arg constructor or setJobConfig(null);
+    // logging such a command must not throw.
+    Object jobType = null;
+    if (jobConfig != null) {
+      jobType = jobConfig.getType();
+    }
+    return MessageFormat.format("{0} status: {1}", jobType, statusInfo);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
new file mode 100644
index 0000000..d6b1767
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import org.apache.nutch.webui.client.model.JobConfig;
+import org.apache.nutch.webui.client.model.JobInfo.JobType;
+import org.joda.time.Duration;
+
+/**
+ * Fluent builder for {@link RemoteCommand}. Commands get a timeout of ten
+ * seconds unless {@link #withTimeout(Duration)} is called.
+ */
+public class RemoteCommandBuilder {
+  private JobConfig jobConfig;
+  private Duration timeout;
+
+  private RemoteCommandBuilder() {
+    this.jobConfig = new JobConfig();
+    this.timeout = Duration.standardSeconds(10);
+  }
+
+  /**
+   * Starts building a command for the given job type.
+   */
+  public static RemoteCommandBuilder instance(JobType jobType) {
+    RemoteCommandBuilder builder = new RemoteCommandBuilder();
+    builder.withJobType(jobType);
+    return builder;
+  }
+
+  public RemoteCommandBuilder withJobType(JobType jobType) {
+    this.jobConfig.setType(jobType);
+    return this;
+  }
+
+  public RemoteCommandBuilder withConfigId(String configId) {
+    this.jobConfig.setConfId(configId);
+    return this;
+  }
+
+  public RemoteCommandBuilder withCrawlId(String crawlId) {
+    this.jobConfig.setCrawlId(crawlId);
+    return this;
+  }
+
+  public RemoteCommandBuilder withArgument(String key, String value) {
+    this.jobConfig.setArgument(key, value);
+    return this;
+  }
+
+  public RemoteCommandBuilder withTimeout(Duration timeout) {
+    this.timeout = timeout;
+    return this;
+  }
+
+  /**
+   * @return a command carrying the configured job config and timeout
+   */
+  public RemoteCommand build() {
+    RemoteCommand built = new RemoteCommand(jobConfig);
+    built.setTimeout(timeout);
+    return built;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
new file mode 100644
index 0000000..e1eefc2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.nutch.webui.client.NutchClient;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.apache.nutch.webui.client.model.JobInfo.State;
+import org.joda.time.DateTimeConstants;
+import org.joda.time.Duration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class executes remote job and waits for success/failure result
+ *
+ * @author feodor
+ *
+ */
+public class RemoteCommandExecutor {
+  private Logger log = LoggerFactory.getLogger(RemoteCommandExecutor.class);
+
+  private static final int DEFAULT_TIMEOUT_SEC = 60;
+  private Duration requestDelay = new Duration(500);
+
+  private NutchClient client;
+  private ExecutorService executor;
+
+  public RemoteCommandExecutor(NutchClient client) {
+    this.client = client;
+    this.executor = Executors.newSingleThreadExecutor();
+  }
+
+  /**
+   * Submits the job of the given command and polls the server until the job
+   * leaves the running/idle states or the command's timeout expires.
+   *
+   * @param command
+   *          the command to execute
+   * @return the final job info; on any failure (including a timeout) a job
+   *         info in state {@code FAILED} whose message is the stack trace
+   */
+  public JobInfo executeRemoteJob(RemoteCommand command) {
+    Future<JobInfo> checkerFuture = null;
+    try {
+      String jobId = client.executeJob(command.getJobConfig());
+      checkerFuture = executor.submit(new JobStateChecker(jobId));
+      return checkerFuture.get(getTimeout(command), TimeUnit.MILLISECONDS);
+    } catch (Exception e) {
+      // Stop the poller: otherwise it keeps occupying the single worker
+      // thread and every later command queues behind it until it times out.
+      if (checkerFuture != null) {
+        checkerFuture.cancel(true);
+      }
+      if (e instanceof InterruptedException) {
+        // Preserve the caller's interrupt status.
+        Thread.currentThread().interrupt();
+      }
+      log.error("Remote command failed", e);
+      JobInfo jobInfo = new JobInfo();
+      jobInfo.setState(State.FAILED);
+      jobInfo.setMsg(ExceptionUtils.getStackTrace(e));
+      return jobInfo;
+    }
+  }
+
+  /**
+   * @return the command's timeout in milliseconds, or the default of
+   *         {@value #DEFAULT_TIMEOUT_SEC} seconds when none is set
+   */
+  private long getTimeout(RemoteCommand command) {
+    if (command.getTimeout() == null) {
+      return DEFAULT_TIMEOUT_SEC * DateTimeConstants.MILLIS_PER_SECOND;
+    }
+    return command.getTimeout().getMillis();
+  }
+
+  /**
+   * Sets the pause between two job status requests.
+   */
+  public void setRequestDelay(Duration requestDelay) {
+    this.requestDelay = requestDelay;
+  }
+
+  /**
+   * Polls the job status until the job is no longer running, pending or
+   * idle.
+   */
+  public class JobStateChecker implements Callable<JobInfo> {
+
+    private String jobId;
+
+    public JobStateChecker(String jobId) {
+      this.jobId = jobId;
+    }
+
+    /**
+     * @return the job info once the job has reached a final state
+     * @throws InterruptedException
+     *           if the polling thread is interrupted before that
+     * @throws IllegalStateException
+     *           if the server returns no job info or no state
+     */
+    @Override
+    public JobInfo call() throws Exception {
+      while (!Thread.interrupted()) {
+        JobInfo jobInfo = client.getJobInfo(jobId);
+        checkState(jobInfo != null, "Cannot get job info!");
+
+        State state = jobInfo.getState();
+        checkState(state != null, "Unknown job state!");
+
+        if (state == State.RUNNING || state == State.ANY || state == State.IDLE) {
+          Thread.sleep(requestDelay.getMillis());
+          continue;
+        }
+
+        return jobInfo;
+      }
+      // Signal the interruption instead of returning null, which callers
+      // would otherwise dereference as if it were a finished job.
+      throw new InterruptedException("Interrupted while waiting for job "
+          + jobId);
+    }
+
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
new file mode 100644
index 0000000..cef56a5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.client.model.JobInfo.JobType;
+import org.joda.time.Duration;
+import org.springframework.beans.factory.config.BeanDefinition;
+import org.springframework.context.annotation.Scope;
+import org.springframework.stereotype.Component;
+
+import com.google.common.collect.Lists;
+
+@Component
+@Scope(BeanDefinition.SCOPE_PROTOTYPE)
+public class RemoteCommandsBatchFactory {
+
+ private List<RemoteCommand> remoteCommands;
+ private Crawl crawl;
+
+ private String batchId;
+
+ public List<RemoteCommand> createCommands(Crawl crawl) {
+ this.crawl = crawl;
+ this.remoteCommands = Lists.newArrayList();
+
+ remoteCommands.add(inject());
+ for (int i = 0; i < crawl.getNumberOfRounds(); i++) {
+ remoteCommands.addAll(createBatchCommands());
+ }
+ return remoteCommands;
+ }
+
+ private List<RemoteCommand> createBatchCommands() {
+ this.batchId = UUID.randomUUID().toString();
+ List<RemoteCommand> batchCommands = Lists.newArrayList();
+
+ batchCommands.add(createGenerateCommand());
+ batchCommands.add(createFetchCommand());
+ batchCommands.add(createParseCommand());
+ batchCommands.add(createUpdateDbCommand());
+ batchCommands.add(createIndexCommand());
+
+ return batchCommands;
+ }
+
+ private RemoteCommand inject() {
+ RemoteCommandBuilder builder = RemoteCommandBuilder
+ .instance(JobType.INJECT).withCrawlId(crawl.getCrawlId())
+ .withArgument("url_dir", crawl.getSeedDirectory());
+ return builder.build();
+ }
+
+ private RemoteCommand createGenerateCommand() {
+ return createBuilder(JobType.GENERATE).build();
+ }
+
+ private RemoteCommand createFetchCommand() {
+ return createBuilder(JobType.FETCH).withTimeout(
+ Duration.standardSeconds(50)).build();
+ }
+
+ private RemoteCommand createParseCommand() {
+ return createBuilder(JobType.PARSE).build();
+ }
+
+ private RemoteCommand createIndexCommand() {
+ return createBuilder(JobType.INDEX).build();
+ }
+
+ private RemoteCommand createUpdateDbCommand() {
+ return createBuilder(JobType.UPDATEDB).build();
+ }
+
+ private RemoteCommandBuilder createBuilder(JobType jobType) {
+ return RemoteCommandBuilder.instance(jobType)
+ .withCrawlId(crawl.getCrawlId()).withArgument("batch", batchId);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
new file mode 100644
index 0000000..d834612
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
@@ -0,0 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.model;
+
+public enum ConnectionStatus {
+ CONNECTING, CONNECTED, DISCONNECTED;
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java
new file mode 100644
index 0000000..6057f7f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.model;
+
+import java.io.Serializable;
+
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.GeneratedValue;
+import javax.persistence.Id;
+
+import org.apache.nutch.webui.model.SeedList;
+
+import com.j256.ormlite.field.DatabaseField;
+
+@Entity
+public class Crawl implements Serializable {
+ public enum CrawlStatus {
+ NEW, CRAWLING, FINISHED, ERROR
+ }
+
+ @Id
+ @GeneratedValue
+ private Long id;
+
+ @Column
+ private String crawlId;
+
+ @Column
+ private String crawlName;
+
+ @Column
+ private CrawlStatus status = CrawlStatus.NEW;
+
+ @Column
+ private Integer numberOfRounds = 1;
+
+ @Column
+ @DatabaseField(foreign = true, foreignAutoRefresh = true)
+ private SeedList seedList;
+
+ @Column
+ private String seedDirectory;
+
+ @Column
+ private int progress;
+
+ public Integer getNumberOfRounds() {
+ return numberOfRounds;
+ }
+
+ public void setNumberOfRounds(Integer numberOfRounds) {
+ this.numberOfRounds = numberOfRounds;
+ }
+
+ public String getCrawlId() {
+ return crawlId;
+ }
+
+ public void setCrawlId(String crawlId) {
+ this.crawlId = crawlId;
+ }
+
+ public CrawlStatus getStatus() {
+ return status;
+ }
+
+ public void setStatus(CrawlStatus status) {
+ this.status = status;
+ }
+
+ public String getCrawlName() {
+ return crawlName;
+ }
+
+ public void setCrawlName(String crawlName) {
+ this.crawlName = crawlName;
+ }
+
+ public SeedList getSeedList() {
+ return seedList;
+ }
+
+ public void setSeedList(SeedList seedList) {
+ this.seedList = seedList;
+ }
+
+ public Long getId() {
+ return id;
+ }
+
+ public void setId(Long id) {
+ this.id = id;
+ }
+
+ public String getSeedDirectory() {
+ return seedDirectory;
+ }
+
+ public void setSeedDirectory(String seedDirectory) {
+ this.seedDirectory = seedDirectory;
+ }
+
+ public int getProgress() {
+ return progress;
+ }
+
+ public void setProgress(int progress) {
+ this.progress = progress;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java
new file mode 100644
index 0000000..80df279
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java
@@ -0,0 +1,77 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.webui.client.model;
+
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.nutch.webui.client.model.JobInfo.JobType;
+
+import com.google.common.collect.Maps;
+
+/**
+ * Serializable bean holding the settings of a job: crawl id, job type,
+ * configuration id (initially {@code "default"}), job class name and a map
+ * of arguments.
+ * <p>
+ * The argument map is owned by this object: it is only changed through
+ * {@link #setArgument(String, String)} and {@link #setArgs(Map)}, and
+ * {@link #getArgs()} hands out a read-only view.
+ */
+public class JobConfig implements Serializable {
+  private String crawlId;
+  private JobType type;
+  private String confId = "default";
+  private String jobClassName;
+  private Map<String, Object> args = Maps.newHashMap();
+
+  /**
+   * Adds or replaces a single argument.
+   *
+   * @param key
+   *          argument name
+   * @param value
+   *          argument value
+   */
+  public void setArgument(String key, String value) {
+    args.put(key, value);
+  }
+
+  /** @return the crawl id */
+  public String getCrawlId() {
+    return crawlId;
+  }
+
+  /** @param crawlId the crawl id to store */
+  public void setCrawlId(String crawlId) {
+    this.crawlId = crawlId;
+  }
+
+  /** @return the job type */
+  public JobType getType() {
+    return type;
+  }
+
+  /** @param type the job type to store */
+  public void setType(JobType type) {
+    this.type = type;
+  }
+
+  /** @return the configuration id */
+  public String getConfId() {
+    return confId;
+  }
+
+  /** @param confId the configuration id to store */
+  public void setConfId(String confId) {
+    this.confId = confId;
+  }
+
+  /**
+   * @return an unmodifiable view of the current arguments; never
+   *         {@code null}
+   */
+  public Map<String, Object> getArgs() {
+    return Collections.unmodifiableMap(args);
+  }
+
+  /**
+   * Replaces all arguments with a copy of the given map.
+   * <p>
+   * The map is copied rather than stored so that later changes to the
+   * caller's map do not leak into this object, and so that
+   * {@link #setArgument(String, String)} keeps working when the caller
+   * passes an immutable map. A {@code null} argument resets the arguments
+   * to an empty map instead of leaving the object in a state where
+   * {@link #getArgs()} and {@link #setArgument(String, String)} would throw
+   * {@link NullPointerException}.
+   *
+   * @param args
+   *          the new arguments, may be {@code null}
+   */
+  public void setArgs(Map<String, Object> args) {
+    if (args == null) {
+      this.args = Maps.newHashMap();
+    } else {
+      this.args = Maps.newHashMap(args);
+    }
+  }
+
+  /** @return the job class name */
+  public String getJobClassName() {
+    return jobClassName;
+  }
+
+  /** @param jobClass the job class name to store */
+  public void setJobClassName(String jobClass) {
+    this.jobClassName = jobClass;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java
new file mode 100644
index 0000000..312118a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java
@@ -0,0 +1,104 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.webui.client.model;
+
+import java.io.Serializable;
+import java.util.Map;
+
+/**
+ * Serializable bean carrying the data of one job: id, type, configuration
+ * id, argument and result maps, state, message and crawl id. All accessors
+ * read or write the backing field directly; no copies are made.
+ */
+public class JobInfo implements Serializable {
+
+  /** Job type constants. */
+  public enum JobType {
+    INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS
+  }
+
+  /** Job state constants. */
+  public enum State {
+    IDLE, RUNNING, FINISHED, FAILED, KILLED, STOPPING, KILLING, ANY
+  }
+
+  private String id;
+  /** Kept as a plain string, not as a {@link JobType}. */
+  private String type;
+  private String confId;
+  private Map<String, Object> args;
+  private Map<String, Object> result;
+  private State state;
+  private String msg;
+  private String crawlId;
+
+  /** @return the job id */
+  public String getId() {
+    return this.id;
+  }
+
+  /** @param id the job id to store */
+  public void setId(String id) {
+    this.id = id;
+  }
+
+  /** @return the job type string */
+  public String getType() {
+    return this.type;
+  }
+
+  /** @param type the job type string to store */
+  public void setType(String type) {
+    this.type = type;
+  }
+
+  /** @return the configuration id */
+  public String getConfId() {
+    return this.confId;
+  }
+
+  /** @param confId the configuration id to store */
+  public void setConfId(String confId) {
+    this.confId = confId;
+  }
+
+  /** @return the argument map as stored, possibly {@code null} */
+  public Map<String, Object> getArgs() {
+    return this.args;
+  }
+
+  /** @param args the argument map to store (kept by reference) */
+  public void setArgs(Map<String, Object> args) {
+    this.args = args;
+  }
+
+  /** @return the result map as stored, possibly {@code null} */
+  public Map<String, Object> getResult() {
+    return this.result;
+  }
+
+  /** @param result the result map to store (kept by reference) */
+  public void setResult(Map<String, Object> result) {
+    this.result = result;
+  }
+
+  /** @return the job state */
+  public State getState() {
+    return this.state;
+  }
+
+  /** @param state the job state to store */
+  public void setState(State state) {
+    this.state = state;
+  }
+
+  /** @return the message */
+  public String getMsg() {
+    return this.msg;
+  }
+
+  /** @param msg the message to store */
+  public void setMsg(String msg) {
+    this.msg = msg;
+  }
+
+  /** @return the crawl id */
+  public String getCrawlId() {
+    return this.crawlId;
+  }
+
+  /** @param crawlId the crawl id to store */
+  public void setCrawlId(String crawlId) {
+    this.crawlId = crawlId;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java
new file mode 100644
index 0000000..0c5c425
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java
@@ -0,0 +1,62 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.webui.client.model;
+
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Set;
+
+/**
+ * Serializable bean with four properties: a start date, a set of
+ * configuration strings, a collection of jobs and a collection of running
+ * jobs. Values are stored and returned by reference.
+ */
+public class NutchStatus implements Serializable {
+
+  private Date startDate;
+  private Set<String> configuration;
+  private Collection<JobInfo> jobs;
+  private Collection<JobInfo> runningJobs;
+
+  /** @return the start date as stored, possibly {@code null} */
+  public Date getStartDate() {
+    return this.startDate;
+  }
+
+  /** @param startDate the start date to store */
+  public void setStartDate(Date startDate) {
+    this.startDate = startDate;
+  }
+
+  /** @return the configuration strings as stored, possibly {@code null} */
+  public Set<String> getConfiguration() {
+    return this.configuration;
+  }
+
+  /** @param configuration the configuration strings to store */
+  public void setConfiguration(Set<String> configuration) {
+    this.configuration = configuration;
+  }
+
+  /** @return the jobs as stored, possibly {@code null} */
+  public Collection<JobInfo> getJobs() {
+    return this.jobs;
+  }
+
+  /** @param jobs the jobs to store */
+  public void setJobs(Collection<JobInfo> jobs) {
+    this.jobs = jobs;
+  }
+
+  /** @return the running jobs as stored, possibly {@code null} */
+  public Collection<JobInfo> getRunningJobs() {
+    return this.runningJobs;
+  }
+
+  /** @param runningJobs the running jobs to store */
+  public void setRunningJobs(Collection<JobInfo> runningJobs) {
+    this.runningJobs = runningJobs;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java
new file mode 100644
index 0000000..09c2d6a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.config;
+
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import com.j256.ormlite.dao.Dao;
+import com.j256.ormlite.spring.DaoFactory;
+import com.j256.ormlite.support.ConnectionSource;
+
+/**
+ * Creates ORMLite DAOs on a single {@link ConnectionSource} and remembers
+ * every DAO it has created so that they can be listed afterwards via
+ * {@link #getCreatedDaos()}.
+ */
+public class CustomDaoFactory {
+  private final ConnectionSource connectionSource;
+  // Synchronized wrapper: single add() calls are already thread-safe.
+  private final List<Dao<?, ?>> registredDaos = Collections
+      .synchronizedList(new ArrayList<Dao<?, ?>>());
+
+  /**
+   * @param connectionSource
+   *          connection source handed to every DAO created by this factory
+   */
+  public CustomDaoFactory(ConnectionSource connectionSource) {
+    this.connectionSource = connectionSource;
+  }
+
+  /**
+   * Creates a DAO for the given class and records it.
+   *
+   * @param clazz
+   *          the data class the DAO is created for
+   * @return the newly created DAO
+   * @throws RuntimeException
+   *           wrapping the {@link SQLException} if DAO creation fails
+   */
+  public <T, ID> Dao<T, ID> createDao(Class<T> clazz) {
+    try {
+      Dao<T, ID> dao = DaoFactory.createDao(connectionSource, clazz);
+      registredDaos.add(dao);
+      return dao;
+    } catch (SQLException e) {
+      throw new RuntimeException("Could not create DAO for " + clazz, e);
+    }
+  }
+
+  /**
+   * Returns the DAOs created so far.
+   * <p>
+   * The result is an unmodifiable snapshot taken while holding the list's
+   * lock. Returning a live view of the synchronized list instead would let
+   * callers iterate it without the lock while {@link #createDao(Class)}
+   * adds to it from another thread. DAOs created after this call are not
+   * reflected in a previously returned list.
+   *
+   * @return unmodifiable copy of the created DAOs, in creation order
+   */
+  public List<Dao<?, ?>> getCreatedDaos() {
+    synchronized (registredDaos) {
+      List<Dao<?, ?>> snapshot = new ArrayList<Dao<?, ?>>(registredDaos);
+      return Collections.unmodifiableList(snapshot);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java
new file mode 100644
index 0000000..9b31d73
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.config;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import com.j256.ormlite.dao.BaseDaoImpl;
+import com.j256.ormlite.dao.Dao;
+import com.j256.ormlite.support.ConnectionSource;
+import com.j256.ormlite.table.DatabaseTableConfig;
+import com.j256.ormlite.table.TableUtils;
+
+/**
+ * Creates, at construction time, the database table of every DAO in a given
+ * list, skipping tables that already exist
+ * ({@code TableUtils.createTableIfNotExists}).
+ */
+public class CustomTableCreator {
+
+  private ConnectionSource connectionSource;
+  private List<Dao<?, ?>> configuredDaos;
+
+  /**
+   * Stores both arguments and immediately creates the missing tables.
+   *
+   * @param connectionSource
+   *          connection source the tables are created on
+   * @param configuredDaos
+   *          DAOs whose tables should exist
+   * @throws IllegalStateException
+   *           if {@code configuredDaos} is {@code null}
+   * @throws RuntimeException
+   *           wrapping any {@link SQLException} raised while resolving a
+   *           table configuration or creating a table
+   */
+  public CustomTableCreator(ConnectionSource connectionSource,
+      List<Dao<?, ?>> configuredDaos) {
+    this.connectionSource = connectionSource;
+    this.configuredDaos = configuredDaos;
+
+    if (this.configuredDaos == null) {
+      throw new IllegalStateException("configuredDaos was not set in "
+          + getClass().getSimpleName());
+    }
+    for (Dao<?, ?> configured : this.configuredDaos) {
+      ensureTableFor(configured);
+    }
+  }
+
+  /**
+   * Resolves the table configuration of one DAO and creates its table if it
+   * does not exist yet. The configuration is taken from the DAO itself when
+   * it is a {@link BaseDaoImpl} that has one; otherwise it is derived from
+   * the DAO's data class.
+   */
+  private void ensureTableFor(Dao<?, ?> dao) {
+    Class<?> dataClass = dao.getDataClass();
+
+    DatabaseTableConfig<?> config = null;
+    if (dao instanceof BaseDaoImpl) {
+      config = ((BaseDaoImpl<?, ?>) dao).getTableConfig();
+    }
+
+    try {
+      if (config == null) {
+        config = DatabaseTableConfig.fromClass(connectionSource, dataClass);
+      }
+      TableUtils.createTableIfNotExists(connectionSource, config);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
new file mode 100644
index 0000000..8b76440
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.config;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.NutchInstance;
+
+/**
+ * Bean with a single property: a list of {@link NutchInstance} objects.
+ */
+public class NutchGuiConfiguration {
+  private List<NutchInstance> instances;
+
+  /** @return the instance list as stored, possibly {@code null} */
+  public List<NutchInstance> getInstances() {
+    return this.instances;
+  }
+
+  /** @param instances the instance list to store (kept by reference) */
+  public void setInstances(List<NutchInstance> instances) {
+    this.instances = instances;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java
new file mode 100644
index 0000000..1687cee
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.config;
+
+import java.sql.SQLException;
+import java.util.concurrent.Executor;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.model.SeedUrl;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.scheduling.annotation.AsyncConfigurer;
+import org.springframework.scheduling.annotation.EnableAsync;
+import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
+
+import com.j256.ormlite.dao.Dao;
+import com.j256.ormlite.db.H2DatabaseType;
+import com.j256.ormlite.jdbc.JdbcConnectionSource;
+
+/**
+ * Spring configuration class with asynchronous execution enabled. It
+ * declares the async executor, the H2-backed ORMLite connection source, the
+ * DAO factory, one DAO bean per model class and the table creator.
+ * <p>
+ * NOTE(review): several bean methods call other bean methods of this class
+ * directly; confirm these calls resolve to the shared bean instances rather
+ * than creating a new connection source / DAO factory each time.
+ */
+@Configuration
+@EnableAsync
+public class SpringConfiguration implements AsyncConfigurer {
+
+  /**
+   * Builds and initializes the thread pool returned through
+   * {@link AsyncConfigurer}.
+   *
+   * @return an initialized {@link ThreadPoolTaskExecutor}
+   */
+  @Override
+  public Executor getAsyncExecutor() {
+    // TODO move magic numbers to properties file
+    ThreadPoolTaskExecutor pool = new ThreadPoolTaskExecutor();
+    pool.setThreadNamePrefix("SpringExecutor-");
+    pool.setCorePoolSize(7);
+    pool.setMaxPoolSize(42);
+    pool.setQueueCapacity(11);
+    pool.initialize();
+    return pool;
+  }
+
+  /**
+   * Opens the connection source on the H2 URL
+   * {@code jdbc:h2:~/.nutch/config}.
+   *
+   * @return the initialized connection source
+   * @throws SQLException
+   *           if the connection source cannot be set up
+   */
+  @Bean
+  public JdbcConnectionSource getConnectionSource() throws SQLException {
+    H2DatabaseType databaseType = new H2DatabaseType();
+    JdbcConnectionSource connectionSource = new JdbcConnectionSource(
+        "jdbc:h2:~/.nutch/config", databaseType);
+    connectionSource.initialize();
+    return connectionSource;
+  }
+
+  /**
+   * @return a DAO factory bound to {@link #getConnectionSource()}
+   * @throws SQLException
+   *           if the connection source cannot be obtained
+   */
+  @Bean
+  public CustomDaoFactory getDaoFactory() throws SQLException {
+    JdbcConnectionSource connectionSource = getConnectionSource();
+    return new CustomDaoFactory(connectionSource);
+  }
+
+  /** @return DAO for {@link NutchInstance}, created by the DAO factory */
+  @Bean
+  public Dao<NutchInstance, Long> createNutchDao() throws SQLException {
+    CustomDaoFactory factory = getDaoFactory();
+    return factory.createDao(NutchInstance.class);
+  }
+
+  /** @return DAO for {@link SeedList}, created by the DAO factory */
+  @Bean
+  public Dao<SeedList, Long> createSeedListDao() throws SQLException {
+    CustomDaoFactory factory = getDaoFactory();
+    return factory.createDao(SeedList.class);
+  }
+
+  /** @return DAO for {@link SeedUrl}, created by the DAO factory */
+  @Bean
+  public Dao<SeedUrl, Long> createSeedUrlDao() throws SQLException {
+    CustomDaoFactory factory = getDaoFactory();
+    return factory.createDao(SeedUrl.class);
+  }
+
+  /** @return DAO for {@link Crawl}, created by the DAO factory */
+  @Bean
+  public Dao<Crawl, Long> createCrawlDao() throws SQLException {
+    CustomDaoFactory factory = getDaoFactory();
+    return factory.createDao(Crawl.class);
+  }
+
+  /**
+   * Builds the table creator from the connection source and the DAOs the
+   * factory reports as created at the time of this call.
+   *
+   * @return the table creator
+   * @throws SQLException
+   *           if the connection source or DAO factory cannot be obtained
+   */
+  @Bean
+  public CustomTableCreator createTableCreator() throws SQLException {
+    JdbcConnectionSource connectionSource = getConnectionSource();
+    CustomDaoFactory factory = getDaoFactory();
+    return new CustomTableCreator(connectionSource, factory.getCreatedDaos());
+  }
+
+}