You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:49 UTC
[33/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
new file mode 100644
index 0000000..a73187b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
@@ -0,0 +1,279 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+// JDK imports
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Tika imports
+import org.apache.tika.Tika;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
+
+// Slf4j logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// imported for Javadoc
+import org.apache.nutch.protocol.ProtocolOutput;
+
+/**
+ * @author mattmann
+ * @since NUTCH-608
+ * 
+ *        <p>
+ *        This is a facade class to insulate Nutch from its underlying Mime Type
+ *        substrate library, <a href="http://incubator.apache.org/tika/">Apache
+ *        Tika</a>. Any mime handling code should be placed in this utility
+ *        class, and hidden from the Nutch classes that rely on it.
+ *        </p>
+ */
+public final class MimeUtil {
+
+  private static final String SEPARATOR = ";";
+
+  /* our Tika mime type registry */
+  private MimeTypes mimeTypes;
+
+  /* the tika detectors */
+  private Tika tika;
+
+  /* whether or not mime magic detection should be employed */
+  private boolean mimeMagic;
+
+  /* our log stream */
+  private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class
+      .getName());
+
+  public MimeUtil(Configuration conf) {
+    tika = new Tika();
+    ObjectCache objectCache = ObjectCache.get(conf);
+    MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
+        .getName());
+    if (mimeTypez == null) {
+      try {
+        String customMimeTypeFile = conf.get("mime.types.file");
+        if (customMimeTypeFile != null
+            && customMimeTypeFile.equals("") == false) {
+          try {
+            LOG.info("Using custom mime.types.file: {}", customMimeTypeFile);
+            mimeTypez = MimeTypesFactory.create(conf
+                .getConfResourceAsInputStream(customMimeTypeFile));
+          } catch (Exception e) {
+            LOG.error("Can't load mime.types.file : " + customMimeTypeFile
+                + " using Tika's default");
+          }
+        }
+        if (mimeTypez == null)
+          mimeTypez = MimeTypes.getDefaultMimeTypes();
+      } catch (Exception e) {
+        LOG.error("Exception in MimeUtil " + e.getMessage());
+        throw new RuntimeException(e);
+      }
+      objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
+    }
+
+    this.mimeTypes = mimeTypez;
+    this.mimeMagic = conf.getBoolean("mime.type.magic", true);
+  }
+
+  /**
+   * Cleans a mime type string by extracting the actual {@link MimeType} name
+   * (dropping any parameters) from a string of the form:
+   * 
+   * <pre>
+   *      &lt;primary type&gt;/&lt;sub type&gt; ; &lt;optional params&gt;
+   * </pre>
+   * 
+   * @param origType
+   *          The original mime type string to be cleaned.
+   * @return The primary type and subtype, concatenated, i.e., the actual mime
+   *         type.
+   */
+  public static String cleanMimeType(String origType) {
+    if (origType == null)
+      return null;
+
+    // take the origType and split it on ';'
+    String[] tokenizedMimeType = origType.split(SEPARATOR);
+    if (tokenizedMimeType.length > 1) {
+      // there was a ';' in there, take the first value
+      return tokenizedMimeType[0];
+    } else {
+      // there wasn't a ';', so just return the orig type
+      return origType;
+    }
+  }
+
+  /**
+   * A facade interface for trying all the possible mime type resolution
+   * strategies available within Tika. First, the mime type provided in
+   * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. Then
+   * the cleaned mime type is looked up in the underlying Tika {@link MimeTypes}
+   * registry, by its cleaned name. If the {@link MimeType} is found, then that
+   * mime type is used, otherwise URL resolution is used to try and determine
+   * the mime type. However, if <code>mime.type.magic</code> is enabled in
+   * {@link NutchConfiguration}, then mime type magic resolution is used to try
+   * and obtain a better-than-the-default approximation of the {@link MimeType}.
+   * 
+   * @param typeName
+   *          The original mime type, returned from a {@link ProtocolOutput}.
+   * @param url
+   *          The given url that Nutch was trying to crawl.
+   * @param data
+   *          The byte data, returned from the crawl, if any.
+   * @return The correctly, automatically guessed {@link MimeType} name.
+   */
+  public String autoResolveContentType(String typeName, String url, byte[] data) {
+    String retType = null;
+    MimeType type = null;
+    String cleanedMimeType = null;
+
+    cleanedMimeType = MimeUtil.cleanMimeType(typeName);
+    // first try to get the type from the cleaned type name
+    if (cleanedMimeType != null) {
+      try {
+        type = mimeTypes.forName(cleanedMimeType);
+        cleanedMimeType = type.getName();
+      } catch (MimeTypeException mte) {
+        // Seems to be a malformed mime type name...
+        cleanedMimeType = null;
+      }
+    }
+
+    // if returned null, or if it's the default type then try url resolution
+    if (type == null
+        || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
+      // If no mime-type header, or cannot find a corresponding registered
+      // mime-type, then guess a mime-type from the url pattern
+      try {
+        retType = tika.detect(url) != null ? tika.detect(url) : null;
+      } catch (Exception e) {
+        String message = "Problem loading default Tika configuration";
+        LOG.error(message, e);
+        throw new RuntimeException(e);
+      }
+    } else {
+      retType = type.getName();
+    }
+
+    // if magic is enabled, use mime magic to guess the type from the content;
+    // if the magic guess differs from the type that has been set so far and
+    // is neither the default (octet-stream) type nor plain text, then go with
+    // the mime type returned by the magic
+    if (this.mimeMagic) {
+      String magicType = null;
+      // pass URL (file name) and (cleansed) content type from protocol to Tika
+      Metadata tikaMeta = new Metadata();
+      tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url);
+      tikaMeta.add(Metadata.CONTENT_TYPE,
+          (cleanedMimeType != null ? cleanedMimeType : typeName));
+      try {
+        InputStream stream = TikaInputStream.get(data);
+        try {
+          magicType = mimeTypes.detect(stream, tikaMeta).toString();
+        } finally {
+          stream.close();
+        }
+      } catch (IOException ignore) {
+      }
+
+      if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
+          && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null
+          && !retType.equals(magicType)) {
+
+        // If magic enabled and the current mime type differs from that of the
+        // one returned from the magic, take the magic mimeType
+        retType = magicType;
+      }
+
+      // if type is STILL null after all the resolution strategies, go for the
+      // default type
+      if (retType == null) {
+        try {
+          retType = MimeTypes.OCTET_STREAM;
+        } catch (Exception ignore) {
+        }
+      }
+    }
+
+    return retType;
+  }
+
+  /**
+   * Facade interface to Tika's underlying {@link Tika#detect(String)}
+   * method.
+   * 
+   * @param url
+   *          A string representation of the document {@link URL} to sense the
+   *          {@link MimeType} for.
+   * @return The detected mime type, as a string, identified from the given
+   *         document url in string form.
+   */
+  public String getMimeType(String url) {
+    return tika.detect(url);
+  }
+
+  /**
+   * A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
+   * method.
+   * 
+   * @param name
+   *          The name of a valid {@link MimeType} in the Tika mime registry.
+   * @return The string representation of the {@link MimeType}, or null if
+   *         looking up the name raises a {@link MimeTypeException}.
+   */
+  public String forName(String name) {
+    try {
+      return this.mimeTypes.forName(name).toString();
+    } catch (MimeTypeException e) {
+      LOG.error("Exception getting mime type by name: [" + name
+          + "]: Message: " + e.getMessage());
+      return null;
+    }
+  }
+
+  /**
+   * Facade interface to Tika's underlying {@link Tika#detect(File)}
+   * method.
+   * 
+   * @param f
+   *          The {@link File} to sense the {@link MimeType} for.
+   * @return The {@link MimeType} of the given {@link File}, or null if it
+   *         cannot be determined.
+   */
+  public String getMimeType(File f) {
+    try {
+      return tika.detect(f);
+    } catch (Exception e) {
+      LOG.error("Exception getting mime type for file: [" + f.getPath()
+          + "]: Message: " + e.getMessage());
+      return null;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java b/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
new file mode 100644
index 0000000..c99bae0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.Stack;
+
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * <p>
+ * A utility class that allows the walking of any DOM tree using a stack instead
+ * of recursion. As the node tree is walked the next node is popped off of the
+ * stack and all of its children are automatically added to the stack to be
+ * called in tree order.
+ * </p>
+ * 
+ * <p>
+ * Currently this class is not thread safe. It is assumed that only one thread
+ * will be accessing the <code>NodeWalker</code> at any given time.
+ * </p>
+ */
+public class NodeWalker {
+
+  // the current node, its children, and the stack holding the nodes to visit
+  private Node currentNode;
+  private NodeList currentChildren;
+  private Stack<Node> nodes;
+
+  /**
+   * Starts the <code>Node</code> tree from the root node.
+   * 
+   * @param rootNode
+   */
+  public NodeWalker(Node rootNode) {
+
+    nodes = new Stack<Node>();
+    nodes.add(rootNode);
+  }
+
+  /**
+   * <p>
+   * Returns the next <code>Node</code> on the stack and pushes all of its
+   * children onto the stack, allowing us to walk the node tree without the use
+   * of recursion. If there are no more nodes on the stack then null is
+   * returned.
+   * </p>
+   * 
+   * @return Node The next <code>Node</code> on the stack or null if there isn't
+   *         a next node.
+   */
+  public Node nextNode() {
+
+    // if no next node return null
+    if (!hasNext()) {
+      return null;
+    }
+
+    // pop the next node off of the stack and push all of its children onto
+    // the stack
+    currentNode = nodes.pop();
+    currentChildren = currentNode.getChildNodes();
+    int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
+
+    // push children last to first, so that the first child is popped first
+    for (int i = childLen - 1; i >= 0; i--) {
+      nodes.add(currentChildren.item(i));
+    }
+
+    return currentNode;
+  }
+
+  /**
+   * <p>
+   * Skips over and removes from the node stack the children of the last node.
+   * When getting a next node from the walker, that node's children are
+   * automatically added to the stack. You can call this method to remove those
+   * children from the stack.
+   * </p>
+   * 
+   * <p>
+   * This is useful when you don't want to process deeper into the current path
+   * of the node tree but you want to continue processing sibling nodes.
+   * </p>
+   * 
+   */
+  public void skipChildren() {
+
+    int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
+
+    for (int i = 0; i < childLen; i++) {
+      Node child = nodes.peek();
+      if (child.equals(currentChildren.item(i))) {
+        nodes.pop();
+      }
+    }
+  }
+
+  /**
+   * Return the current node.
+   * 
+   * @return Node
+   */
+  public Node getCurrentNode() {
+    return currentNode;
+  }
+
+  /**
+   * @return returns true if there are more nodes on the current stack.
+   * 
+   */
+  public boolean hasNext() {
+    return (nodes.size() > 0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
new file mode 100644
index 0000000..ac71a93
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Map.Entry;
+import java.util.Properties;
+import java.util.UUID;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Utility to create Hadoop {@link Configuration}s that include Nutch-specific
+ * resources.
+ */
+public class NutchConfiguration {
+  public static final String UUID_KEY = "nutch.conf.uuid";
+
+  private NutchConfiguration() {
+  } // utility class, not instantiable
+
+  /*
+   * Configuration.hashCode() doesn't return values that correspond to a unique
+   * set of parameters. This is a workaround so that we can track instances of
+   * Configuration created by Nutch.
+   */
+  private static void setUUID(Configuration conf) {
+    UUID uuid = UUID.randomUUID();
+    conf.set(UUID_KEY, uuid.toString());
+  }
+
+  /**
+   * Retrieve a Nutch UUID of this configuration object, or null if the
+   * configuration was created elsewhere.
+   * 
+   * @param conf
+   *          configuration instance
+   * @return uuid or null
+   */
+  public static String getUUID(Configuration conf) {
+    return conf.get(UUID_KEY);
+  }
+
+  /**
+   * Create a {@link Configuration} for Nutch. This will load the standard Nutch
+   * resources, <code>nutch-default.xml</code> and <code>nutch-site.xml</code>
+   * overrides.
+   */
+  public static Configuration create() {
+    Configuration conf = new Configuration();
+    setUUID(conf);
+    addNutchResources(conf);
+    return conf;
+  }
+
+  /**
+   * Create a {@link Configuration} from supplied properties.
+   * 
+   * @param addNutchResources
+   *          if true, then first <code>nutch-default.xml</code>, and then
+   *          <code>nutch-site.xml</code> will be loaded prior to applying the
+   *          properties. Otherwise these resources won't be used.
+   * @param nutchProperties
+   *          a set of properties to define (or override)
+   */
+  public static Configuration create(boolean addNutchResources,
+      Properties nutchProperties) {
+    Configuration conf = new Configuration();
+    setUUID(conf);
+    if (addNutchResources) {
+      addNutchResources(conf);
+    }
+    for (Entry<Object, Object> e : nutchProperties.entrySet()) {
+      conf.set(e.getKey().toString(), e.getValue().toString());
+    }
+    return conf;
+  }
+
+  /**
+   * Add the standard Nutch resources to {@link Configuration}.
+   * 
+   * @param conf
+   *          Configuration object to which configuration is to be added.
+   */
+  private static Configuration addNutchResources(Configuration conf) {
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site.xml");
+    return conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
new file mode 100644
index 0000000..8b4f8e0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+
+/** A {@link JobConf} for Nutch jobs. */
+public class NutchJob extends JobConf {
+
+  public NutchJob(Configuration conf) {
+    super(conf, NutchJob.class);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
new file mode 100644
index 0000000..8e75177
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.nutch.metadata.Nutch;
+
+public abstract class NutchTool extends Configured {
+
+  protected HashMap<String, Object> results = new HashMap<String, Object>();
+  protected Map<String, Object> status = Collections
+      .synchronizedMap(new HashMap<String, Object>());
+  protected Job currentJob;
+  protected int numJobs;
+  protected int currentJobNum;
+
+  /**
+   * Runs the tool, using a map of arguments. May return results, or null.
+   */
+  public abstract Map<String, Object> run(Map<String, Object> args, String crawlId)
+      throws Exception;
+
+  public NutchTool(Configuration conf){
+    super(conf);
+  }
+
+  public NutchTool(){
+    super(null);
+  }
+
+  /** Returns relative progress of the tool, a float in range [0,1]. */
+  public float getProgress() {
+    float res = 0;
+    if (currentJob != null) {
+      try {
+        res = (currentJob.mapProgress() + currentJob.reduceProgress()) / 2.0f;
+      } catch (IOException e) {
+        e.printStackTrace();
+        res = 0;
+      } catch (IllegalStateException ile) {
+        ile.printStackTrace();
+        res = 0;
+      }
+    }
+    // take into account multiple jobs
+    if (numJobs > 1) {
+      res = (currentJobNum + res) / (float) numJobs;
+    }
+    status.put(Nutch.STAT_PROGRESS, res);
+    return res;
+  }
+
+  /** Returns current status of the running tool. */
+  public Map<String, Object> getStatus() {
+    return status;
+  }
+
+  /**
+   * Stop the job with the possibility to resume. Subclasses should override
+   * this, since by default it calls {@link #killJob()}.
+   * 
+   * @return true if succeeded, false otherwise
+   */
+  public boolean stopJob() throws Exception {
+    return killJob();
+  }
+
+  /**
+   * Kill the job immediately. Clients should assume that any results that the
+   * job produced so far are in inconsistent state or missing.
+   * 
+   * @return true if succeeded, false otherwise.
+   * @throws Exception
+   */
+  public boolean killJob() throws Exception {
+    if (currentJob != null && !currentJob.isComplete()) {
+      try {
+        currentJob.killJob();
+        return true;
+      } catch (Exception e) {
+        e.printStackTrace();
+        return false;
+      }
+    }
+    return false;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java b/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
new file mode 100644
index 0000000..0277ee6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.HashMap;
+import java.util.WeakHashMap;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+
+public class ObjectCache {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ObjectCache.class);
+
+  private static final WeakHashMap<Configuration, ObjectCache> CACHE = new WeakHashMap<Configuration, ObjectCache>();
+
+  private final HashMap<String, Object> objectMap;
+
+  private ObjectCache() {
+    objectMap = new HashMap<String, Object>();
+  }
+
+  public synchronized static ObjectCache get(Configuration conf) {
+    ObjectCache objectCache = CACHE.get(conf);
+    if (objectCache == null) {
+      LOG.debug("No object cache found for conf=" + conf
+          + ", instantiating a new object cache");
+      objectCache = new ObjectCache();
+      CACHE.put(conf, objectCache);
+    }
+    return objectCache;
+  }
+
+  public synchronized Object getObject(String key) {
+    return objectMap.get(key);
+  }
+
+  public synchronized void setObject(String key, Object value) {
+    objectMap.put(key, value);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
new file mode 100644
index 0000000..e323b67
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+/**
+ * A class for efficiently matching <code>String</code>s against a set of
+ * prefixes.
+ */
+public class PrefixStringMatcher extends TrieStringMatcher {
+
+  /**
+   * Creates a new <code>PrefixStringMatcher</code> which will match
+   * <code>String</code>s with any prefix in the supplied array. Zero-length
+   * <code>Strings</code> are ignored.
+   */
+  public PrefixStringMatcher(String[] prefixes) {
+    super();
+    for (int i = 0; i < prefixes.length; i++)
+      addPatternForward(prefixes[i]);
+  }
+
+  /**
+   * Creates a new <code>PrefixStringMatcher</code> which will match
+   * <code>String</code>s with any prefix in the supplied
+   * <code>Collection</code>.
+   * 
+   * @throws ClassCastException
+   *           if any <code>Object</code>s in the collection are not
+   *           <code>String</code>s
+   */
+  public PrefixStringMatcher(Collection<String> prefixes) {
+    super();
+    Iterator<String> iter = prefixes.iterator();
+    while (iter.hasNext())
+      addPatternForward(iter.next());
+  }
+
+  /**
+   * Returns true if the given <code>String</code> is matched by a prefix in the
+   * trie
+   */
+  public boolean matches(String input) {
+    TrieNode node = root;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        return false;
+      if (node.isTerminal())
+        return true;
+    }
+    return false;
+  }
+
+  /**
+   * Returns the shortest prefix of <code>input</code> that is matched,
+   * or <code>null</code> if no match exists.
+   */
+  public String shortestMatch(String input) {
+    TrieNode node = root;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        return null;
+      if (node.isTerminal())
+        return input.substring(0, i + 1);
+    }
+    return null;
+  }
+
+  /**
+   * Returns the longest prefix of <code>input</code> that is matched,
+   * or <code>null</code> if no match exists.
+   */
+  public String longestMatch(String input) {
+    TrieNode node = root;
+    String result = null;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        break;
+      if (node.isTerminal())
+        result = input.substring(0, i + 1);
+    }
+    return result;
+  }
+
+  public static final void main(String[] argv) {
+    PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] {
+        "abcd", "abc", "aac", "baz", "foo", "foobar" });
+
+    String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
+        "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
+
+    for (int i = 0; i < tests.length; i++) {
+      System.out.println("testing: " + tests[i]);
+      System.out.println("   matches: " + matcher.matches(tests[i]));
+      System.out.println("  shortest: " + matcher.shortestMatch(tests[i]));
+      System.out.println("   longest: " + matcher.longestMatch(tests[i]));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
new file mode 100644
index 0000000..d26cbfc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.metadata.Nutch;
+
+/**
+ * Extracts protocol status code information from the crawl database.
+ *
+ * ProtocolStatusStatistics will give you information on the count
+ * of all status codes encountered on your crawl. This can be useful
+ * for checking a number of things.
+ *
+ * An example output run showing the number of encountered status
+ * codes such as 200, 300, and a count of un-fetched record.
+ *
+ * 38	200
+ * 19	301
+ * 2	302
+ * 665	UNFETCHED
+ *
+ */
+public class ProtocolStatusStatistics extends Configured implements Tool {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ProtocolStatusStatistics.class);
+
+  private static final Text UNFETCHED_TEXT = new Text("UNFETCHED");
+
+  public static Configuration conf;
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println("Usage: ProtocolStatistics inputDirs outDir [numOfReducer]");
+
+      System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
+      System.err.println("\t\t\tE.g.: crawl/crawldb/");
+
+      System.err.println("\toutDir\t\tOutput directory where results should be dumped");
+
+      System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+      return 1;
+    }
+    String inputDir = args[0];
+    String outputDir = args[1];
+
+    int numOfReducers = 1;
+
+    if (args.length > 3) {
+      numOfReducers = Integer.parseInt(args[3]);
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("ProtocolStatistics: starting at " + sdf.format(start));
+
+    String jobName = "ProtocolStatistics";
+
+    conf = getConf();
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    Job job = Job.getInstance(conf, jobName);
+    job.setJarByClass(ProtocolStatusStatistics.class);
+
+    String[] inputDirsSpecs = inputDir.split(",");
+    for (int i = 0; i < inputDirsSpecs.length; i++) {
+      File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
+      FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+    }
+
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    FileOutputFormat.setOutputPath(job, new Path(outputDir));
+    job.setOutputFormatClass(TextOutputFormat.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(LongWritable.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(LongWritable.class);
+
+    job.setMapperClass(ProtocolStatusStatisticsMapper.class);
+    job.setReducerClass(ProtocolStatusStatisticsReducer.class);
+    job.setCombinerClass(ProtocolStatusStatisticsCombiner.class);
+    job.setNumReduceTasks(numOfReducers);
+
+    try {
+      job.waitForCompletion(true);
+    } catch (Exception e) {
+      throw e;
+    }
+
+    long end = System.currentTimeMillis();
+    LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+    return 0;
+  }
+
+  static class ProtocolStatusStatisticsMapper extends
+      Mapper<Text, CrawlDatum, Text, LongWritable> {
+
+    public void map(Text urlText, CrawlDatum datum, Context context)
+        throws IOException, InterruptedException {
+      if (datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+        context.write((Text) datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY), new LongWritable(1));
+      } else {
+        context.write(UNFETCHED_TEXT, new LongWritable(1));
+      }
+    }
+  }
+
+  static class ProtocolStatusStatisticsReducer extends
+      Reducer<Text, LongWritable, LongWritable, Text> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+
+      context.write(new LongWritable(total), key);
+    }
+  }
+
+  public static class ProtocolStatusStatisticsCombiner extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+      context.write(key, new LongWritable(total));
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(NutchConfiguration.create(), new ProtocolStatusStatistics(), args);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
new file mode 100644
index 0000000..149269f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+/**
+ * A collection of String processing utility methods.
+ */
+public class StringUtil {
+
+  /**
+   * Returns a copy of <code>s</code> padded with trailing spaces so that it's
+   * length is <code>length</code>. Strings already <code>length</code>
+   * characters long or longer are not altered.
+   */
+  public static String rightPad(String s, int length) {
+    StringBuffer sb = new StringBuffer(s);
+    for (int i = length - s.length(); i > 0; i--)
+      sb.append(" ");
+    return sb.toString();
+  }
+
+  /**
+   * Returns a copy of <code>s</code> padded with leading spaces so that it's
+   * length is <code>length</code>. Strings already <code>length</code>
+   * characters long or longer are not altered.
+   */
+  public static String leftPad(String s, int length) {
+    StringBuffer sb = new StringBuffer();
+    for (int i = length - s.length(); i > 0; i--)
+      sb.append(" ");
+    sb.append(s);
+    return sb.toString();
+  }
+
+  private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6',
+      '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
+
+  /**
+   * Convenience call for {@link #toHexString(byte[], String, int)}, where
+   * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+   * 
+   * @param buf
+   */
+  public static String toHexString(byte[] buf) {
+    return toHexString(buf, null, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Get a text representation of a byte[] as hexadecimal String, where each
+   * pair of hexadecimal digits corresponds to consecutive bytes in the array.
+   * 
+   * @param buf
+   *          input data
+   * @param sep
+   *          separate every pair of hexadecimal digits with this separator, or
+   *          null if no separation is needed.
+   * @param lineLen
+   *          break the output String into lines containing output for lineLen
+   *          bytes.
+   */
+  public static String toHexString(byte[] buf, String sep, int lineLen) {
+    if (buf == null)
+      return null;
+    if (lineLen <= 0)
+      lineLen = Integer.MAX_VALUE;
+    StringBuffer res = new StringBuffer(buf.length * 2);
+    for (int i = 0; i < buf.length; i++) {
+      int b = buf[i];
+      res.append(HEX_DIGITS[(b >> 4) & 0xf]);
+      res.append(HEX_DIGITS[b & 0xf]);
+      if (i > 0 && (i % lineLen) == 0)
+        res.append('\n');
+      else if (sep != null && i < lineLen - 1)
+        res.append(sep);
+    }
+    return res.toString();
+  }
+
+  /**
+   * Convert a String containing consecutive (no inside whitespace) hexadecimal
+   * digits into a corresponding byte array. If the number of digits is not
+   * even, a '0' will be appended in the front of the String prior to
+   * conversion. Leading and trailing whitespace is ignored.
+   * 
+   * @param text
+   *          input text
+   * @return converted byte array, or null if unable to convert
+   */
+  public static byte[] fromHexString(String text) {
+    text = text.trim();
+    if (text.length() % 2 != 0)
+      text = "0" + text;
+    int resLen = text.length() / 2;
+    int loNibble, hiNibble;
+    byte[] res = new byte[resLen];
+    for (int i = 0; i < resLen; i++) {
+      int j = i << 1;
+      hiNibble = charToNibble(text.charAt(j));
+      loNibble = charToNibble(text.charAt(j + 1));
+      if (loNibble == -1 || hiNibble == -1)
+        return null;
+      res[i] = (byte) (hiNibble << 4 | loNibble);
+    }
+    return res;
+  }
+
+  private static final int charToNibble(char c) {
+    if (c >= '0' && c <= '9') {
+      return c - '0';
+    } else if (c >= 'a' && c <= 'f') {
+      return 0xa + (c - 'a');
+    } else if (c >= 'A' && c <= 'F') {
+      return 0xA + (c - 'A');
+    } else {
+      return -1;
+    }
+  }
+
+  /**
+   * Checks if a string is empty (ie is null or empty).
+   */
+  public static boolean isEmpty(String str) {
+    return (str == null) || (str.equals(""));
+  }
+
+  /**
+   * Simple character substitution which cleans all \ufffd chars from a given String.
+   */
+  public static String cleanField(String value) {
+    return value.replaceAll("\ufffd", "");
+  }
+
+  public static void main(String[] args) {
+    if (args.length != 1)
+      System.out.println("Usage: StringUtil <encoding name>");
+    else
+      System.out.println(args[0] + " is resolved to "
+          + EncodingDetector.resolveEncodingAlias(args[0]));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
new file mode 100644
index 0000000..a967c01
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+/**
+ * A class for efficiently matching <code>String</code>s against a set of
+ * suffixes. Patterns are stored back to front, so an input is checked by
+ * walking it from its last character towards its first. Zero-length
+ * <code>Strings</code> are ignored.
+ */
+public class SuffixStringMatcher extends TrieStringMatcher {
+
+  /**
+   * Creates a new <code>SuffixStringMatcher</code> which will match
+   * <code>String</code>s with any suffix in the supplied array.
+   */
+  public SuffixStringMatcher(String[] suffixes) {
+    super();
+    for (String suffix : suffixes) {
+      addPatternBackward(suffix);
+    }
+  }
+
+  /**
+   * Creates a new <code>SuffixStringMatcher</code> which will match
+   * <code>String</code>s with any suffix in the supplied
+   * <code>Collection</code>
+   */
+  public SuffixStringMatcher(Collection<String> suffixes) {
+    super();
+    for (String suffix : suffixes) {
+      addPatternBackward(suffix);
+    }
+  }
+
+  /**
+   * Walks <code>input</code> backwards through the trie and returns the index
+   * at which the shortest matching suffix starts, or -1 if there is none.
+   */
+  private int shortestSuffixStart(String input) {
+    TrieNode current = root;
+    int pos = input.length();
+    while (pos > 0) {
+      pos--;
+      current = current.getChild(input.charAt(pos));
+      if (current == null) {
+        return -1;
+      }
+      if (current.isTerminal()) {
+        return pos;
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Returns true if the given <code>String</code> is matched by a suffix in the
+   * trie
+   */
+  public boolean matches(String input) {
+    return shortestSuffixStart(input) >= 0;
+  }
+
+  /**
+   * Returns the shortest suffix of <code>input</code> that is matched,
+   * or <code>null</code> if no match exists.
+   */
+  public String shortestMatch(String input) {
+    int from = shortestSuffixStart(input);
+    return from < 0 ? null : input.substring(from);
+  }
+
+  /**
+   * Returns the longest suffix of <code>input</code> that is matched,
+   * or <code>null</code> if no match exists.
+   */
+  public String longestMatch(String input) {
+    String best = null;
+    TrieNode current = root;
+    int pos = input.length();
+    while (pos > 0) {
+      pos--;
+      current = current.getChild(input.charAt(pos));
+      if (current == null) {
+        // no deeper path; keep the longest suffix seen so far
+        return best;
+      }
+      if (current.isTerminal()) {
+        best = input.substring(pos);
+      }
+    }
+    return best;
+  }
+
+  /**
+   * Small command-line demo: prints the result of every query method for a
+   * fixed set of suffixes and sample inputs.
+   */
+  public static final void main(String[] argv) {
+    String[] suffixes = { "a", "abcd", "bcd", "bcdefg", "defg", "aac", "baz",
+        "foo", "foobar" };
+    SuffixStringMatcher matcher = new SuffixStringMatcher(suffixes);
+
+    String[] tests = { "a", "ac", "abcd", "abcdefg", "apple", "aa", "aac",
+        "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
+
+    for (String test : tests) {
+      System.out.println("testing: " + test);
+      System.out.println("   matches: " + matcher.matches(test));
+      System.out.println("  shortest: " + matcher.shortestMatch(test));
+      System.out.println("   longest: " + matcher.longestMatch(test));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
new file mode 100644
index 0000000..68ded69
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
@@ -0,0 +1,161 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.util;
+
+import org.apache.commons.lang.StringUtils;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+
+/**
+ * Helpers for converting URLs and host names to and from the "reversed" form
+ * <code>reversed.host:protocol[:port]/file</code>.
+ */
+public class TableUtil {
+
+  public static final ByteBuffer YES_VAL = ByteBuffer.wrap(new byte[] { 'y' });
+
+  /**
+   * Reverses a url's domain. This form is better for storing in hbase. Because
+   * scans within the same domain are faster.
+   * <p>
+   * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
+   * "com.foo.bar:http:8983/to/index.html?a=b".
+   * 
+   * @param urlString
+   *          url to be reversed
+   * @return Reversed url
+   * @throws MalformedURLException
+   *           if <code>urlString</code> cannot be parsed as a URL
+   */
+  public static String reverseUrl(String urlString)
+      throws MalformedURLException {
+    URL parsed = new URL(urlString);
+    return reverseUrl(parsed);
+  }
+
+  /**
+   * Reverses a url's domain. This form is better for storing in hbase. Because
+   * scans within the same domain are faster.
+   * <p>
+   * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
+   * "com.foo.bar:http:8983/to/index.html?a=b".
+   * 
+   * @param url
+   *          url to be reversed
+   * @return Reversed url
+   */
+  public static String reverseUrl(URL url) {
+    StringBuilder reversed = new StringBuilder();
+
+    /* reversed host comes first, then the protocol */
+    reverseAppendSplits(url.getHost(), reversed);
+    reversed.append(':').append(url.getProtocol());
+
+    /* the port is only written when the url names one explicitly */
+    int port = url.getPort();
+    if (port != -1) {
+      reversed.append(':').append(port);
+    }
+
+    /* path (and query); make sure it is introduced by a slash */
+    String file = url.getFile();
+    if (file.length() > 0 && file.charAt(0) != '/') {
+      reversed.append('/');
+    }
+    reversed.append(file);
+
+    return reversed.toString();
+  }
+
+  /**
+   * Turns a reversed url back into its normal form, e.g.
+   * "com.foo.bar:http:8983/to/index.html?a=b" becomes
+   * "http://bar.foo.com:8983/to/index.html?a=b".
+   * 
+   * @param reversedUrl
+   *          Reversed url
+   * @return the un-reversed url
+   * @throws ArrayIndexOutOfBoundsException
+   *           if the part before the first '/' contains no ':'
+   */
+  public static String unreverseUrl(String reversedUrl) {
+    StringBuilder url = new StringBuilder(reversedUrl.length() + 2);
+
+    // everything before the first slash is "<reversed host>:<protocol>[:<port>]"
+    int slash = reversedUrl.indexOf('/');
+    int pathBegin = (slash == -1) ? reversedUrl.length() : slash;
+    String authority = reversedUrl.substring(0, pathBegin);
+
+    // {<reversed host>, <protocol>, <port>}
+    String[] parts = StringUtils.splitPreserveAllTokens(authority, ':');
+
+    url.append(parts[1]); // protocol
+    url.append("://");
+    reverseAppendSplits(parts[0], url); // un-reverse the host
+    if (parts.length == 3) { // has a port
+      url.append(':');
+      url.append(parts[2]);
+    }
+    url.append(reversedUrl.substring(pathBegin));
+    return url.toString();
+  }
+
+  /**
+   * Given a reversed url, returns the reversed host E.g
+   * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar"
+   * 
+   * @param reversedUrl
+   *          Reversed url
+   * @return Reversed host
+   * @throws StringIndexOutOfBoundsException
+   *           if <code>reversedUrl</code> contains no ':'
+   */
+  public static String getReversedHost(String reversedUrl) {
+    int firstColon = reversedUrl.indexOf(':');
+    return reversedUrl.substring(0, firstColon);
+  }
+
+  /**
+   * Appends the '.'-separated parts of <code>string</code> to <code>buf</code>
+   * in reverse order; a string without any parts is appended unchanged.
+   */
+  private static void reverseAppendSplits(String string, StringBuilder buf) {
+    String[] parts = StringUtils.split(string, '.');
+    if (parts.length == 0) {
+      buf.append(string);
+      return;
+    }
+    for (int i = parts.length - 1; i >= 0; i--) {
+      buf.append(parts[i]);
+      if (i > 0) {
+        buf.append('.');
+      }
+    }
+  }
+
+  /**
+   * Reverses the order of the '.'-separated parts of a host name, e.g.
+   * "bar.foo.com" becomes "com.foo.bar".
+   */
+  public static String reverseHost(String hostName) {
+    StringBuilder reversed = new StringBuilder();
+    reverseAppendSplits(hostName, reversed);
+    return reversed.toString();
+  }
+
+  /**
+   * Undoes {@link #reverseHost(String)}; reversing the parts a second time
+   * restores the original order.
+   */
+  public static String unreverseHost(String reversedHostName) {
+    return reverseHost(reversedHostName); // Reversible
+  }
+
+  /**
+   * Converts the given character sequence to a String and cleans out any
+   * Unicode replacement character (U+FFFD) from it.
+   * 
+   * @param utf8
+   *          character sequence to convert, may be null
+   * @return the cleaned String, or null if <code>utf8</code> is null
+   */
+  public static String toString(CharSequence utf8) {
+    if (utf8 == null) {
+      return null;
+    }
+    return StringUtil.cleanField(utf8.toString());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
new file mode 100644
index 0000000..c4af356
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Helpers for rendering elapsed times as human readable strings.
+ */
+public class TimingUtil {
+
+  private static final long SECONDS_PER_MINUTE = 60L;
+  private static final long MINUTES_PER_HOUR = 60L;
+  private static final long SECONDS_PER_HOUR = 3600L;
+  private static final long SECONDS_PER_DAY = 86400L;
+
+  /**
+   * Calculate the elapsed time between two times specified in milliseconds.
+   * 
+   * @param start
+   *          The start of the time period
+   * @param end
+   *          The end of the time period
+   * @return a string of the form "hh:mm:ss" (see {@link #secondsToHMS(long)})
+   *         for the whole seconds elapsed, or null if start &gt; end.
+   */
+  public static String elapsedTime(long start, long end) {
+    if (end < start) {
+      return null;
+    }
+    long elapsedSeconds = (end - start) / 1000;
+    return secondsToHMS(elapsedSeconds);
+  }
+
+  /**
+   * Show time in seconds as hours, minutes and seconds (hh:mm:ss)
+   * 
+   * @param seconds
+   *          (elapsed) time in seconds
+   * @return human readable time string "hh:mm:ss"; the hours field grows
+   *         beyond two digits when needed
+   */
+  public static String secondsToHMS(long seconds) {
+    long hh = seconds / SECONDS_PER_HOUR;
+    long mm = (seconds / SECONDS_PER_MINUTE) % MINUTES_PER_HOUR;
+    long ss = seconds % SECONDS_PER_MINUTE;
+    return String.format("%02d:%02d:%02d", hh, mm, ss);
+  }
+
+  /**
+   * Show time in seconds as days, hours, minutes and seconds (d days, hh:mm:ss)
+   * 
+   * @param seconds
+   *          (elapsed) time in seconds
+   * @return human readable time string "d days, hh:mm:ss", or just "hh:mm:ss"
+   *         when less than one full day has elapsed
+   */
+  public static String secondsToDaysHMS(long seconds) {
+    long days = seconds / SECONDS_PER_DAY;
+    if (days == 0) {
+      return secondsToHMS(seconds);
+    }
+    long remainder = seconds % SECONDS_PER_DAY;
+    return String.format("%d days, %s", days, secondsToHMS(remainder));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
new file mode 100644
index 0000000..95f06ad
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
@@ -0,0 +1,202 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.ListIterator;
+
+/**
+ * TrieStringMatcher is a base class for simple tree-based string matching.
+ * Subclasses add patterns with {@link #addPatternForward(String)} or
+ * {@link #addPatternBackward(String)} and implement the query methods by
+ * walking the trie from {@link #root}.
+ */
+public abstract class TrieStringMatcher {
+  protected TrieNode root;
+
+  protected TrieStringMatcher() {
+    this.root = new TrieNode('\000', false);
+  }
+
+  /**
+   * Node class for the character tree. The children of a node are held either
+   * in a sorted list (while patterns are being added) or in a sorted array
+   * (once lookups start); exactly one of the two is non-null at any time and
+   * both adding and looking up may switch the representation.
+   */
+  protected class TrieNode implements Comparable<TrieNode> {
+    protected TrieNode[] children;
+    protected LinkedList<TrieNode> childrenList;
+    protected char nodeChar;
+    protected boolean terminal;
+
+    /**
+     * Creates a new TrieNode, which contains the given <code>nodeChar</code>.
+     * If <code>isTerminal</code> is <code>true</code>, the new node is a
+     * <em>terminal</em> node in the trie.
+     */
+    TrieNode(char nodeChar, boolean isTerminal) {
+      this.childrenList = new LinkedList<TrieNode>();
+      this.nodeChar = nodeChar;
+      this.terminal = isTerminal;
+    }
+
+    /**
+     * Returns <code>true</code> if this node is a <em>terminal</em> node in the
+     * trie.
+     */
+    boolean isTerminal() {
+      return terminal;
+    }
+
+    /**
+     * Returns the child node of this node whose node-character is
+     * <code>nextChar</code>. If no such node exists, one is added. If
+     * <em>isTerminal</em> is <code>true</code>, the node will be a terminal
+     * node in the trie; an existing terminal node stays terminal.
+     */
+    TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) {
+      if (childrenList == null) {
+        // a lookup converted the children to an array; go back to the list
+        childrenList = new LinkedList<TrieNode>(Arrays.asList(children));
+        children = null;
+      }
+
+      // the list is kept sorted by nodeChar: stop at the match, or just in
+      // front of the first larger character, or at the end of the list
+      ListIterator<TrieNode> cursor = childrenList.listIterator();
+      while (cursor.hasNext()) {
+        TrieNode candidate = cursor.next();
+        if (candidate.nodeChar == nextChar) {
+          candidate.terminal = candidate.terminal | isTerminal;
+          return candidate;
+        }
+        if (candidate.nodeChar > nextChar) {
+          cursor.previous();
+          break;
+        }
+      }
+
+      TrieNode created = new TrieNode(nextChar, isTerminal);
+      cursor.add(created);
+      return created;
+    }
+
+    /**
+     * Returns the child node of this node whose node-character is
+     * <code>nextChar</code>. If no such node exists, <code>null</code> is
+     * returned.
+     */
+    TrieNode getChild(char nextChar) {
+      if (children == null) {
+        // first lookup since the last insertion: switch to a sorted array
+        children = childrenList.toArray(new TrieNode[childrenList.size()]);
+        childrenList = null;
+        Arrays.sort(children);
+      }
+
+      // binary search over the sorted array
+      int low = 0;
+      int high = children.length - 1;
+      while (low <= high) {
+        int mid = (low + high) / 2;
+        TrieNode probe = children[mid];
+        if (probe.nodeChar == nextChar) {
+          return probe;
+        }
+        if (probe.nodeChar < nextChar) {
+          low = mid + 1;
+        } else {
+          high = mid - 1;
+        }
+      }
+      return null;
+    }
+
+    /**
+     * Orders nodes by their node-character.
+     */
+    public int compareTo(TrieNode other) {
+      if (this.nodeChar == other.nodeChar) {
+        return 0;
+      }
+      return (this.nodeChar < other.nodeChar) ? -1 : 1;
+    }
+  }
+
+  /**
+   * Returns the next {@link TrieNode} visited, given that you are at
+   * <code>node</code>, and the next character in the input is the
+   * <code>idx</code>'th character of <code>s</code>.
+   */
+  protected final TrieNode matchChar(TrieNode node, String s, int idx) {
+    char next = s.charAt(idx);
+    return node.getChild(next);
+  }
+
+  /**
+   * Adds any necessary nodes to the trie so that the given <code>String</code>
+   * can be decoded and the last character is represented by a terminal node.
+   * Zero-length <code>Strings</code> are ignored.
+   */
+  protected final void addPatternForward(String s) {
+    final int last = s.length() - 1;
+    TrieNode node = root;
+    for (int i = 0; i <= last; i++) {
+      // only the node of the final character is marked terminal
+      node = node.getChildAddIfNotPresent(s.charAt(i), i == last);
+    }
+  }
+
+  /**
+   * Adds any necessary nodes to the trie so that the given <code>String</code>
+   * can be decoded <em>in reverse</em> and the first character is represented
+   * by a terminal node. Zero-length <code>Strings</code> are ignored.
+   */
+  protected final void addPatternBackward(String s) {
+    TrieNode node = root;
+    for (int i = s.length() - 1; i >= 0; i--) {
+      // walking backwards, the node of the first character ends the pattern
+      node = node.getChildAddIfNotPresent(s.charAt(i), i == 0);
+    }
+  }
+
+  /**
+   * Returns true if the given <code>String</code> is matched by a pattern in
+   * the trie
+   */
+  public abstract boolean matches(String input);
+
+  /**
+   * Returns the shortest substring of <code>input</code> that is
+   * matched by a pattern in the trie, or <code>null</code> if no match
+   * exists.
+   */
+  public abstract String shortestMatch(String input);
+
+  /**
+   * Returns the longest substring of <code>input</code> that is
+   * matched by a pattern in the trie, or <code>null</code> if no match
+   * exists.
+   */
+  public abstract String longestMatch(String input);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
new file mode 100644
index 0000000..3e696cb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
@@ -0,0 +1,533 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.MalformedURLException;
+import java.net.*;
+import java.util.regex.Pattern;
+
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+/** Utility class for URL analysis */
+public class URLUtil {
+
+  /**
+   * Resolves a (possibly relative) target against a base url, working around
+   * the way java.net.URL handles targets that consist of a query only.
+   * 
+   * @param base
+   *          base url
+   * @param target
+   *          target url (may be relative); surrounding whitespace is trimmed
+   * @return resolved absolute url.
+   * @throws MalformedURLException
+   *           if the combination cannot be parsed as a url
+   */
+  public static URL resolveURL(URL base, String target)
+      throws MalformedURLException {
+    final String trimmed = target.trim();
+
+    // A target may be a pure query. Take the page
+    // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
+    // which contains links of the form href="?co=0&sk=0&pg=1". By default
+    // URL builds the base+target combination as
+    // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly
+    // dropping the Search.aspx part.
+    //
+    // Browsers handle these just fine, so they must have a special case
+    // similar to this one.
+    return trimmed.startsWith("?") ? fixPureQueryTargets(base, trimmed)
+        : new URL(base, trimmed);
+  }
+
+  /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
+  static URL fixPureQueryTargets(URL base, String target)
+      throws MalformedURLException {
+    if (!target.startsWith("?"))
+      return new URL(base, target);
+
+    String basePath = base.getPath();
+    String baseRightMost = "";
+    int baseRightMostIdx = basePath.lastIndexOf("/");
+    if (baseRightMostIdx != -1) {
+      baseRightMost = basePath.substring(baseRightMostIdx + 1);
+    }
+
+    if (target.startsWith("?"))
+      target = baseRightMost + target;
+
+    return new URL(base, target);
+  }
+
+  private static Pattern IP_PATTERN = Pattern
+      .compile("(\\d{1,3}\\.){3}(\\d{1,3})");
+
+  /**
+   * Returns the domain name of the url. The domain name of a url is the
+   * substring of the url's hostname, w/o subdomain names. As an example <br>
+   * <code>
+   *  getDomainName(new URL("http://lucene.apache.org/"))
+   *  </code><br>
+   * will return <br>
+   * <code> apache.org</code>
+   * <p>
+   * A single trailing "." on the hostname is removed first. A hostname that
+   * looks like an IPv4 address is returned as is. If no part of the hostname
+   * is a known domain suffix, the last "."-separated label is returned.
+   * </p>
+   */
+  public static String getDomainName(URL url) {
+    DomainSuffixes tlds = DomainSuffixes.getInstance();
+
+    String host = url.getHost();
+    // it seems that java returns hostnames ending with .
+    if (host.endsWith(".")) {
+      host = host.substring(0, host.length() - 1);
+    }
+    if (IP_PATTERN.matcher(host).matches()) {
+      return host;
+    }
+
+    // Drop one leading label per round until what follows the first "." is
+    // a known domain suffix.
+    String candidate = host;
+    while (true) {
+      final int dot = candidate.indexOf('.');
+      final String remainder = candidate.substring(dot + 1);
+      if (tlds.isDomainSuffix(remainder)) {
+        return candidate;
+      }
+      candidate = remainder;
+      if (dot < 0) {
+        // no "." left: nothing more to strip
+        return candidate;
+      }
+    }
+  }
+
+  /**
+   * Returns the domain name of the url given as a <code>String</code>. The
+   * domain name of a url is the substring of the url's hostname, w/o
+   * subdomain names. As an example <br>
+   * <code>
+   *  getDomainName("http://lucene.apache.org/")
+   *  </code><br>
+   * will return <br>
+   * <code> apache.org</code>
+   * 
+   * @throws MalformedURLException
+   *           if the string cannot be parsed as a url
+   */
+  public static String getDomainName(String url) throws MalformedURLException {
+    final URL parsed = new URL(url);
+    return getDomainName(parsed);
+  }
+
+  /**
+   * Returns the top level domain name of the url, i.e. the part of the url's
+   * domain suffix that follows its last ".". As an example <br>
+   * <code>
+   *  getTopLevelDomainName(new URL("http://lucene.apache.org/"))
+   *  </code><br>
+   * will return <br>
+   * <code> org</code>
+   * 
+   * @param url
+   *          the url to inspect
+   * @return the top level domain name, or <code>null</code> if
+   *         {@link #getDomainSuffix(URL)} finds no suffix for the url (for
+   *         example when the host is an IPv4 address)
+   * @throws MalformedURLException
+   *           declared for compatibility with existing callers
+   */
+  public static String getTopLevelDomainName(URL url)
+      throws MalformedURLException {
+    DomainSuffix domainSuffix = getDomainSuffix(url);
+    if (domainSuffix == null) {
+      // getDomainSuffix returns null for IPv4 hosts and unknown suffixes;
+      // report "no TLD" instead of failing with a NullPointerException
+      return null;
+    }
+
+    String suffix = domainSuffix.toString();
+    int idx = suffix.lastIndexOf(".");
+    if (idx != -1) {
+      return suffix.substring(idx + 1);
+    } else {
+      return suffix;
+    }
+  }
+
+  /**
+   * Returns the top level domain name of the url given as a
+   * <code>String</code>. As an example <br>
+   * <code>
+   *  getTopLevelDomainName("http://lucene.apache.org/")
+   *  </code><br>
+   * will return <br>
+   * <code> org</code>
+   * 
+   * @throws MalformedURLException
+   *           if the string cannot be parsed as a url
+   */
+  public static String getTopLevelDomainName(String url)
+      throws MalformedURLException {
+    final URL parsed = new URL(url);
+    return getTopLevelDomainName(parsed);
+  }
+
+  /**
+   * Returns whether the given urls have the same domain name, ignoring case.
+   * As an example, <br>
+   * <code> isSameDomainName(new URL("http://lucene.apache.org")
+   * , new URL("http://people.apache.org/"))
+   * </code> <br>
+   * will return true.
+   * 
+   * @return true if the domain names are equal
+   */
+  public static boolean isSameDomainName(URL url1, URL url2) {
+    final String firstDomain = getDomainName(url1);
+    final String secondDomain = getDomainName(url2);
+    return firstDomain.equalsIgnoreCase(secondDomain);
+  }
+
+  /**
+   * Returns whether the given urls, passed as <code>Strings</code>, have the
+   * same domain name. As an example, <br>
+   * <code> isSameDomainName("http://lucene.apache.org"
+   * ,"http://people.apache.org/")
+   * </code> <br>
+   * will return true.
+   * 
+   * @return true if the domain names are equal
+   * @throws MalformedURLException
+   *           if either string cannot be parsed as a url
+   */
+  public static boolean isSameDomainName(String url1, String url2)
+      throws MalformedURLException {
+    final URL first = new URL(url1);
+    final URL second = new URL(url2);
+    return isSameDomainName(first, second);
+  }
+
+  /**
+   * Returns the {@link DomainSuffix} corresponding to the last public part of
+   * the hostname.
+   * <p>
+   * A single trailing "." on the hostname is removed first, as
+   * {@link #getDomainName(URL)} does, so that <code>apache.org.</code> and
+   * <code>apache.org</code> resolve to the same suffix.
+   * </p>
+   * 
+   * @param url
+   *          the url whose hostname is inspected
+   * @return the first suffix found when stripping leading labels from the
+   *         hostname, or <code>null</code> if the hostname looks like an IPv4
+   *         address or no known suffix is found
+   */
+  public static DomainSuffix getDomainSuffix(URL url) {
+    DomainSuffixes tlds = DomainSuffixes.getInstance();
+    String host = url.getHost();
+    // it seems that java returns hostnames ending with .
+    if (host.endsWith("."))
+      host = host.substring(0, host.length() - 1);
+    if (IP_PATTERN.matcher(host).matches())
+      return null;
+
+    int index = 0;
+    String candidate = host;
+    // the loop runs one final round with index == -1, in which the remaining
+    // label itself is looked up
+    for (; index >= 0;) {
+      index = candidate.indexOf('.');
+      String subCandidate = candidate.substring(index + 1);
+      DomainSuffix d = tlds.get(subCandidate);
+      if (d != null) {
+        return d;
+      }
+      candidate = subCandidate;
+    }
+    return null;
+  }
+
+  /**
+   * Returns the {@link DomainSuffix} corresponding to the last public part of
+   * the hostname of the url given as a <code>String</code>.
+   * 
+   * @throws MalformedURLException
+   *           if the string cannot be parsed as a url
+   */
+  public static DomainSuffix getDomainSuffix(String url)
+      throws MalformedURLException {
+    final URL parsed = new URL(url);
+    return getDomainSuffix(parsed);
+  }
+
+  /**
+   * Partitions of the hostname of the url by ".". A hostname that looks like
+   * an IPv4 address is not split but returned as a single segment.
+   */
+  public static String[] getHostSegments(URL url) {
+    final String hostName = url.getHost();
+    // TODO : handle ipv6
+    final boolean looksLikeIpv4 = IP_PATTERN.matcher(hostName).matches();
+    return looksLikeIpv4 ? new String[] { hostName } : hostName.split("\\.");
+  }
+
+  /**
+   * Partitions of the hostname of the url, given as a <code>String</code>, by
+   * "."
+   * 
+   * @throws MalformedURLException
+   *           if the string cannot be parsed as a url
+   */
+  public static String[] getHostSegments(String url)
+      throws MalformedURLException {
+    final URL parsed = new URL(url);
+    return getHostSegments(parsed);
+  }
+
+  /**
+   * <p>
+   * Given two urls, a src and a destination of a redirect, it returns the
+   * representative url.
+   * </p>
+   * 
+   * <p>
+   * This method implements an extended version of the algorithm used by the
+   * Yahoo! Slurp crawler described here:<br>
+   * <a href=
+   * "http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> How
+   * does the Yahoo! webcrawler handle redirects?</a>
+   * </p>
+   * 
+   * <p>
+   * In the examples below the url that is kept is marked with
+   * <code>*</code>. Domain names and host names are compared ignoring case.
+   * </p>
+   * <ol>
+   * <li>Choose target url if either url is malformed.</li>
+   * <li>If different domains then keep the destination whether or not the
+   * redirect is temp or perm
+   * <ul>
+   * <li>a.com -&gt; b.com*</li>
+   * </ul>
+   * </li>
+   * <li>If the redirect is permanent and the source is root, keep the source.
+   * <ul>
+   * <li>*a.com -&gt; a.com?y=1 || *a.com -&gt; a.com/xyz/index.html</li>
+   * </ul>
+   * </li>
+   * <li>If the redirect is permanent and the source is not root and the
+   * destination is root, keep the destination
+   * <ul>
+   * <li>a.com/xyz/index.html -&gt; a.com*</li>
+   * </ul>
+   * </li>
+   * <li>If the redirect is permanent and neither the source nor the destination
+   * is root, then keep the destination
+   * <ul>
+   * <li>a.com/xyz/index.html -&gt; a.com/abc/page.html*</li>
+   * </ul>
+   * </li>
+   * <li>If the redirect is temporary and source is root and destination is not
+   * root, then keep the source
+   * <ul>
+   * <li>*a.com -&gt; a.com/xyz/index.html</li>
+   * </ul>
+   * </li>
+   * <li>If the redirect is temporary and source is not root and destination is
+   * root, then keep the destination
+   * <ul>
+   * <li>a.com/xyz/index.html -&gt; a.com*</li>
+   * </ul>
+   * </li>
+   * <li>If the redirect is temporary and neither the source nor the
+   * destination is root, then keep the shortest url. If the hosts are equal
+   * the paths are compared, first by the number of "/"-separated segments and
+   * then by length; otherwise the host with fewer "."-separated labels wins.
+   * On a tie the source is kept.
+   * <ul>
+   * <li>a.com/xyz/index.html -&gt; a.com/abc/page.html*</li>
+   * <li>*www.a.com/xyz/index.html -&gt; www.news.a.com/xyz/index.html</li>
+   * </ul>
+   * </li>
+   * <li>If the redirect is temporary and both the source and the destination
+   * are root, then keep the shortest sub-domain
+   * <ul>
+   * <li>*www.a.com -&gt; www.news.a.com</li>
+   * </ul>
+   * </li>
+   * </ol>
+   * <p>
+   * While not in this logic there is a further piece of representative url
+   * logic that occurs during indexing and after scoring. During creation of the
+   * basic fields before indexing, if a url has a representative url stored we
+   * check both the url and its representative url (which should never be the
+   * same) against their linkrank scores and the highest scoring one is kept as
+   * the url and the lower scoring one is held as the orig url inside of the
+   * index.
+   * </p>
+   * 
+   * @param src
+   *          The source url.
+   * @param dst
+   *          The destination url.
+   * @param temp
+   *          Is the redirect a temporary redirect.
+   * 
+   * @return String The representative url.
+   */
+  public static String chooseRepr(String src, String dst, boolean temp) {
+
+    // validate both are well formed urls
+    URL srcUrl;
+    URL dstUrl;
+    try {
+      srcUrl = new URL(src);
+      dstUrl = new URL(dst);
+    } catch (MalformedURLException e) {
+      return dst;
+    }
+
+    // if we are dealing with a redirect from one domain to another keep the
+    // destination. Host names are case-insensitive, so the domains are
+    // compared the same way isSameDomainName compares them.
+    if (!isSameDomainName(srcUrl, dstUrl)) {
+      return dst;
+    }
+
+    // get the source and destination host and page
+    String srcHost = srcUrl.getHost();
+    String dstHost = dstUrl.getHost();
+    String srcFile = srcUrl.getFile();
+    String dstFile = dstUrl.getFile();
+
+    // are the source and destination the root path url.com/ or url.com
+    // (getFile includes the query, so url.com/?y=1 is not root)
+    boolean srcRoot = (srcFile.equals("/") || srcFile.length() == 0);
+    boolean destRoot = (dstFile.equals("/") || dstFile.length() == 0);
+
+    // permanent redirect: if source is root return source, otherwise
+    // destination
+    if (!temp) {
+      return srcRoot ? src : dst;
+    }
+
+    // temporary redirect from here on
+
+    // source root and destination not root
+    if (srcRoot && !destRoot) {
+      return src;
+    }
+
+    // destination root and source not
+    if (!srcRoot && destRoot) {
+      return dst;
+    }
+
+    if (!srcRoot && !destRoot && srcHost.equalsIgnoreCase(dstHost)) {
+
+      // source and destination hosts are the same: fewer path segments wins,
+      // then the shorter path; the source is kept on a tie
+      int numSrcPaths = srcFile.split("/").length;
+      int numDstPaths = dstFile.split("/").length;
+      if (numSrcPaths != numDstPaths) {
+        return (numDstPaths < numSrcPaths ? dst : src);
+      }
+      int srcPathLength = srcFile.length();
+      int dstPathLength = dstFile.length();
+      return (dstPathLength < srcPathLength ? dst : src);
+    }
+
+    // both root, or neither root with different host names: take the host
+    // with the fewest labels, keeping the source on a tie
+    int numSrcSubs = srcHost.split("\\.").length;
+    int numDstSubs = dstHost.split("\\.").length;
+    return (numDstSubs < numSrcSubs ? dst : src);
+  }
+
+  /**
+   * Returns the lowercased hostname for the url or null if the url is not well
+   * formed. Lowercasing does not depend on the default locale of the JVM.
+   * 
+   * @param url
+   *          The url to check.
+   * @return String The hostname for the url, or null if the url cannot be
+   *         parsed.
+   */
+  public static String getHost(String url) {
+    try {
+      // Locale.ROOT: with e.g. a Turkish default locale a plain toLowerCase()
+      // would turn "I" into a dotless i and corrupt the hostname
+      return new URL(url).getHost().toLowerCase(java.util.Locale.ROOT);
+    } catch (MalformedURLException e) {
+      return null;
+    }
+  }
+
+  /**
+   * Returns the page for the url: the url in lower case with its query string
+   * (and the leading "?") removed. Lowercasing does not depend on the default
+   * locale of the JVM.
+   * <p>
+   * NOTE(review): the whole url is lowercased, including the path. Earlier
+   * documentation of this method said the path is left untouched; confirm
+   * which behavior callers expect before changing the code.
+   * </p>
+   * 
+   * @param url
+   *          The url to check.
+   * @return String The page for the url, or null if the url is not well
+   *         formed.
+   */
+  public static String getPage(String url) {
+    try {
+      // Locale.ROOT: a plain toLowerCase() follows the default locale and
+      // would e.g. map "I" to a dotless i under a Turkish locale
+      url = url.toLowerCase(java.util.Locale.ROOT);
+      // get the query of the lowercased url and replace it with an empty
+      // string
+      String queryStr = new URL(url).getQuery();
+      return (queryStr != null) ? url.replace("?" + queryStr, "") : url;
+    } catch (MalformedURLException e) {
+      return null;
+    }
+  }
+
+  /**
+   * Returns the protocol of the url given as a <code>String</code>.
+   * 
+   * @param url
+   *          The url to check.
+   * @return String The protocol of the url, or null if the url cannot be
+   *         parsed or any other exception occurs.
+   */
+  public static String getProtocol(String url) {
+    try {
+      final URL parsed = new URL(url);
+      return getProtocol(parsed);
+    } catch (Exception e) {
+      return null;
+    }
+  }
+
+  /**
+   * Returns the protocol of the given url.
+   * 
+   * @param url
+   *          The url to check.
+   * @return String The protocol as reported by {@link URL#getProtocol()}.
+   */
+  public static String getProtocol(URL url) {
+    final String protocol = url.getProtocol();
+    return protocol;
+  }
+
+  /**
+   * Returns the url with its hostname converted by
+   * {@link IDN#toASCII(String)}. A url without a hostname is returned
+   * unchanged.
+   * 
+   * @param url
+   *          The url to convert.
+   * @return String The converted url, or null if any exception occurs while
+   *         parsing or rebuilding it.
+   */
+  public static String toASCII(String url) {
+    try {
+      final URL parsed = new URL(url);
+      final String hostName = parsed.getHost();
+      final boolean hasHost = hostName != null && !hostName.isEmpty();
+      if (!hasHost) {
+        // no host name => no punycoded domain name
+        // also do not add additional slashes for file: URLs (NUTCH-1880)
+        return url;
+      }
+
+      final String asciiHost = IDN.toASCII(hostName);
+      final URI rebuilt = new URI(parsed.getProtocol(), parsed.getUserInfo(),
+          asciiHost, parsed.getPort(), parsed.getPath(), parsed.getQuery(),
+          parsed.getRef());
+      return rebuilt.toString();
+    } catch (Exception e) {
+      return null;
+    }
+  }
+
+  /**
+   * Returns the url with its hostname converted by
+   * {@link IDN#toUnicode(String)}. A url without a hostname is returned
+   * unchanged.
+   * 
+   * @param url
+   *          The url to convert.
+   * @return String The converted url, or null if any exception occurs.
+   */
+  public static String toUNICODE(String url) {
+    try {
+      final URL parsed = new URL(url);
+      final String hostName = parsed.getHost();
+      if (hostName == null || hostName.isEmpty()) {
+        // no host name => no punycoded domain name
+        // also do not add additional slashes for file: URLs (NUTCH-1880)
+        return url;
+      }
+
+      final StringBuilder out = new StringBuilder();
+      out.append(parsed.getProtocol()).append("://");
+
+      final String userInfo = parsed.getUserInfo();
+      if (userInfo != null) {
+        out.append(userInfo).append('@');
+      }
+
+      out.append(IDN.toUnicode(hostName));
+
+      final int port = parsed.getPort();
+      if (port != -1) {
+        out.append(':').append(port);
+      }
+
+      // getFile() is the path plus the query
+      out.append(parsed.getFile());
+
+      final String fragment = parsed.getRef();
+      if (fragment != null) {
+        out.append('#').append(fragment);
+      }
+
+      return out.toString();
+    } catch (Exception e) {
+      return null;
+    }
+  }
+
+  /**
+   * For testing: prints the domain name of the single url given on the
+   * command line, or a usage message if the argument count is wrong.
+   */
+  public static void main(String[] args) {
+
+    if (args.length == 1) {
+      final String url = args[0];
+      try {
+        final String domainName = URLUtil.getDomainName(new URL(url));
+        System.out.println(domainName);
+      } catch (MalformedURLException ex) {
+        ex.printStackTrace();
+      }
+    } else {
+      System.err.println("Usage : URLUtil <url>");
+    }
+  }
+}