You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:49 UTC
[33/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
new file mode 100644
index 0000000..a73187b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
@@ -0,0 +1,279 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+// JDK imports
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Tika imports
+import org.apache.tika.Tika;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
+
+// Slf4j logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// imported for Javadoc
+import org.apache.nutch.protocol.ProtocolOutput;
+
+/**
+ * @author mattmann
+ * @since NUTCH-608
+ *
+ * <p>
+ * This is a facade class to insulate Nutch from its underlying Mime Type
+ * substrate library, <a href="http://incubator.apache.org/tika/">Apache
+ * Tika</a>. Any mime handling code should be placed in this utility
+ * class, and hidden from the Nutch classes that rely on it.
+ * </p>
+ */
+public final class MimeUtil {
+
+ private static final String SEPARATOR = ";";
+
+ /* our Tika mime type registry */
+ private MimeTypes mimeTypes;
+
+ /* the tika detectors */
+ private Tika tika;
+
+ /* whether or not magic should be employed or not */
+ private boolean mimeMagic;
+
+ /* our log stream */
+ private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class
+ .getName());
+
+ public MimeUtil(Configuration conf) {
+ tika = new Tika();
+ ObjectCache objectCache = ObjectCache.get(conf);
+ MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
+ .getName());
+ if (mimeTypez == null) {
+ try {
+ String customMimeTypeFile = conf.get("mime.types.file");
+ if (customMimeTypeFile != null
+ && customMimeTypeFile.equals("") == false) {
+ try {
+ LOG.info("Using custom mime.types.file: {}", customMimeTypeFile);
+ mimeTypez = MimeTypesFactory.create(conf
+ .getConfResourceAsInputStream(customMimeTypeFile));
+ } catch (Exception e) {
+ LOG.error("Can't load mime.types.file : " + customMimeTypeFile
+ + " using Tika's default");
+ }
+ }
+ if (mimeTypez == null)
+ mimeTypez = MimeTypes.getDefaultMimeTypes();
+ } catch (Exception e) {
+ LOG.error("Exception in MimeUtil " + e.getMessage());
+ throw new RuntimeException(e);
+ }
+ objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
+ }
+
+ this.mimeTypes = mimeTypez;
+ this.mimeMagic = conf.getBoolean("mime.type.magic", true);
+ }
+
+ /**
+ * Cleans a {@link MimeType} name by removing out the actual {@link MimeType},
+ * from a string of the form:
+ *
+ * <pre>
+ * <primary type>/<sub type> ; < optional params
+ * </pre>
+ *
+ * @param origType
+ * The original mime type string to be cleaned.
+ * @return The primary type, and subtype, concatenated, e.g., the actual mime
+ * type.
+ */
+ public static String cleanMimeType(String origType) {
+ if (origType == null)
+ return null;
+
+ // take the origType and split it on ';'
+ String[] tokenizedMimeType = origType.split(SEPARATOR);
+ if (tokenizedMimeType.length > 1) {
+ // there was a ';' in there, take the first value
+ return tokenizedMimeType[0];
+ } else {
+ // there wasn't a ';', so just return the orig type
+ return origType;
+ }
+ }
+
+ /**
+ * A facade interface to trying all the possible mime type resolution
+ * strategies available within Tika. First, the mime type provided in
+ * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. Then
+ * the cleaned mime type is looked up in the underlying Tika {@link MimeTypes}
+ * registry, by its cleaned name. If the {@link MimeType} is found, then that
+ * mime type is used, otherwise URL resolution is used to try and determine
+ * the mime type. However, if <code>mime.type.magic</code> is enabled in
+ * {@link NutchConfiguration}, then mime type magic resolution is used to try
+ * and obtain a better-than-the-default approximation of the {@link MimeType}.
+ *
+ * @param typeName
+ * The original mime type, returned from a {@link ProtocolOutput}.
+ * @param url
+ * The given @see url, that Nutch was trying to crawl.
+ * @param data
+ * The byte data, returned from the crawl, if any.
+ * @return The correctly, automatically guessed {@link MimeType} name.
+ */
+ public String autoResolveContentType(String typeName, String url, byte[] data) {
+ String retType = null;
+ MimeType type = null;
+ String cleanedMimeType = null;
+
+ cleanedMimeType = MimeUtil.cleanMimeType(typeName);
+ // first try to get the type from the cleaned type name
+ if (cleanedMimeType != null) {
+ try {
+ type = mimeTypes.forName(cleanedMimeType);
+ cleanedMimeType = type.getName();
+ } catch (MimeTypeException mte) {
+ // Seems to be a malformed mime type name...
+ cleanedMimeType = null;
+ }
+ }
+
+ // if returned null, or if it's the default type then try url resolution
+ if (type == null
+ || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
+ // If no mime-type header, or cannot find a corresponding registered
+ // mime-type, then guess a mime-type from the url pattern
+ try {
+ retType = tika.detect(url) != null ? tika.detect(url) : null;
+ } catch (Exception e) {
+ String message = "Problem loading default Tika configuration";
+ LOG.error(message, e);
+ throw new RuntimeException(e);
+ }
+ } else {
+ retType = type.getName();
+ }
+
+ // if magic is enabled use mime magic to guess if the mime type returned
+ // from the magic guess is different than the one that's already set so far
+ // if it is, and it's not the default mime type, then go with the mime type
+ // returned by the magic
+ if (this.mimeMagic) {
+ String magicType = null;
+ // pass URL (file name) and (cleansed) content type from protocol to Tika
+ Metadata tikaMeta = new Metadata();
+ tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url);
+ tikaMeta.add(Metadata.CONTENT_TYPE,
+ (cleanedMimeType != null ? cleanedMimeType : typeName));
+ try {
+ InputStream stream = TikaInputStream.get(data);
+ try {
+ magicType = mimeTypes.detect(stream, tikaMeta).toString();
+ } finally {
+ stream.close();
+ }
+ } catch (IOException ignore) {
+ }
+
+ if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
+ && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null
+ && !retType.equals(magicType)) {
+
+ // If magic enabled and the current mime type differs from that of the
+ // one returned from the magic, take the magic mimeType
+ retType = magicType;
+ }
+
+ // if type is STILL null after all the resolution strategies, go for the
+ // default type
+ if (retType == null) {
+ try {
+ retType = MimeTypes.OCTET_STREAM;
+ } catch (Exception ignore) {
+ }
+ }
+ }
+
+ return retType;
+ }
+
+ /**
+ * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
+ * method.
+ *
+ * @param url
+ * A string representation of the document {@link URL} to sense the
+ * {@link MimeType} for.
+ * @return An appropriate {@link MimeType}, identified from the given Document
+ * url in string form.
+ */
+ public String getMimeType(String url) {
+ return tika.detect(url);
+ }
+
+ /**
+ * A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
+ * method.
+ *
+ * @param name
+ * The name of a valid {@link MimeType} in the Tika mime registry.
+ * @return The object representation of the {@link MimeType}, if it exists, or
+ * null otherwise.
+ */
+ public String forName(String name) {
+ try {
+ return this.mimeTypes.forName(name).toString();
+ } catch (MimeTypeException e) {
+ LOG.error("Exception getting mime type by name: [" + name
+ + "]: Message: " + e.getMessage());
+ return null;
+ }
+ }
+
+ /**
+ * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
+ * method.
+ *
+ * @param f
+ * The {@link File} to sense the {@link MimeType} for.
+ * @return The {@link MimeType} of the given {@link File}, or null if it
+ * cannot be determined.
+ */
+ public String getMimeType(File f) {
+ try {
+ return tika.detect(f);
+ } catch (Exception e) {
+ LOG.error("Exception getting mime type for file: [" + f.getPath()
+ + "]: Message: " + e.getMessage());
+ return null;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java b/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
new file mode 100644
index 0000000..c99bae0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.Stack;
+
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * <p>
+ * A utility class that allows the walking of any DOM tree using a stack instead
+ * of recursion. As the node tree is walked the next node is popped off of the
+ * stack and all of its children are automatically added to the stack to be
+ * called in tree order.
+ * </p>
+ *
+ * <p>
+ * Currently this class is not thread safe. It is assumed that only one thread
+ * will be accessing the <code>NodeWalker</code> at any given time.
+ * </p>
+ */
+public class NodeWalker {
+
+  // the node most recently returned by nextNode(), and its child list
+  private Node currentNode;
+  private NodeList currentChildren;
+  // the stack holding the nodes that still have to be visited
+  private Stack<Node> nodes;
+
+  /**
+   * Starts the <code>Node</code> tree from the root node.
+   *
+   * @param rootNode
+   *          the node the walk starts from; it is the first node returned by
+   *          {@link #nextNode()}
+   */
+  public NodeWalker(Node rootNode) {
+
+    nodes = new Stack<Node>();
+    nodes.add(rootNode);
+  }
+
+  /**
+   * <p>
+   * Returns the next <code>Node</code> on the stack and pushes all of its
+   * children onto the stack, allowing us to walk the node tree without the use
+   * of recursion. If there are no more nodes on the stack then null is
+   * returned.
+   * </p>
+   *
+   * @return Node The next <code>Node</code> on the stack or null if there isn't
+   *         a next node.
+   */
+  public Node nextNode() {
+
+    // if no next node return null
+    if (!hasNext()) {
+      return null;
+    }
+
+    // pop the next node off of the stack and push all of its children onto
+    // the stack
+    currentNode = nodes.pop();
+    currentChildren = currentNode.getChildNodes();
+    int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
+
+    // push the children last to first, so that the first child ends up on
+    // top of the stack and is visited first
+    for (int i = childLen - 1; i >= 0; i--) {
+      nodes.add(currentChildren.item(i));
+    }
+
+    return currentNode;
+  }
+
+  /**
+   * <p>
+   * Skips over and removes from the node stack the children of the last node.
+   * When getting a next node from the walker, that node's children are
+   * automatically added to the stack. You can call this method to remove those
+   * children from the stack.
+   * </p>
+   *
+   * <p>
+   * This is useful when you don't want to process deeper into the current path
+   * of the node tree but you want to continue processing sibling nodes.
+   * </p>
+   *
+   * <p>
+   * Calling this method again, or after the children have already been
+   * consumed, is harmless: only children that are still on top of the stack
+   * are removed.
+   * </p>
+   */
+  public void skipChildren() {
+
+    int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
+
+    for (int i = 0; i < childLen; i++) {
+      // The stack can already be drained (e.g. on a repeated call); peek()
+      // would then throw an EmptyStackException.
+      if (nodes.isEmpty()) {
+        return;
+      }
+      Node child = nodes.peek();
+      if (child.equals(currentChildren.item(i))) {
+        nodes.pop();
+      }
+    }
+  }
+
+  /**
+   * Return the current node.
+   *
+   * @return the node last returned by {@link #nextNode()}, or null if
+   *         {@link #nextNode()} has not returned a node yet
+   */
+  public Node getCurrentNode() {
+    return currentNode;
+  }
+
+  /**
+   * @return returns true if there are more nodes on the current stack.
+   *
+   */
+  public boolean hasNext() {
+    return (nodes.size() > 0);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
new file mode 100644
index 0000000..ac71a93
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Map.Entry;
+import java.util.Properties;
+import java.util.UUID;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Factory for Hadoop {@link Configuration} objects that carry the
+ * Nutch-specific resources.
+ */
+public class NutchConfiguration {
+  public static final String UUID_KEY = "nutch.conf.uuid";
+
+  // not instantiable: the class only offers static helpers
+  private NutchConfiguration() {
+  }
+
+  /*
+   * Tags the configuration with a random UUID under UUID_KEY. Rationale given
+   * by the original author: Configuration.hashCode() does not identify a
+   * unique set of parameters, so the UUID is what lets us track the
+   * Configuration instances created by Nutch.
+   */
+  private static void setUUID(Configuration conf) {
+    conf.set(UUID_KEY, UUID.randomUUID().toString());
+  }
+
+  /**
+   * Looks up the Nutch UUID stored in a configuration object.
+   *
+   * @param conf
+   *          configuration instance
+   * @return the value stored under {@link #UUID_KEY}, or null if there is
+   *         none (i.e. the configuration was created elsewhere)
+   */
+  public static String getUUID(Configuration conf) {
+    return conf.get(UUID_KEY);
+  }
+
+  /**
+   * Creates a {@link Configuration} for Nutch: a fresh configuration that is
+   * tagged with a UUID and has <code>nutch-default.xml</code> and
+   * <code>nutch-site.xml</code> added as resources.
+   */
+  public static Configuration create() {
+    Configuration nutchConf = new Configuration();
+    setUUID(nutchConf);
+    return addNutchResources(nutchConf);
+  }
+
+  /**
+   * Creates a {@link Configuration} from the supplied properties.
+   *
+   * @param addNutchResources
+   *          if true, <code>nutch-default.xml</code> and then
+   *          <code>nutch-site.xml</code> are added before the properties are
+   *          applied; otherwise these resources are not used
+   * @param nutchProperties
+   *          the properties to define (or override); keys and values are
+   *          converted with <code>toString()</code>
+   */
+  public static Configuration create(boolean addNutchResources,
+      Properties nutchProperties) {
+    Configuration nutchConf = new Configuration();
+    setUUID(nutchConf);
+    if (addNutchResources) {
+      addNutchResources(nutchConf);
+    }
+    for (Entry<Object, Object> property : nutchProperties.entrySet()) {
+      String name = property.getKey().toString();
+      String value = property.getValue().toString();
+      nutchConf.set(name, value);
+    }
+    return nutchConf;
+  }
+
+  /**
+   * Adds the standard Nutch resources to a {@link Configuration}.
+   *
+   * @param conf
+   *          Configuration object to which the resources are added
+   * @return the same <code>conf</code> instance
+   */
+  private static Configuration addNutchResources(Configuration conf) {
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site.xml");
+    return conf;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
new file mode 100644
index 0000000..8b4f8e0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+
+/**
+ * A {@link JobConf} for Nutch jobs. The class adds no state or behavior of
+ * its own; it only fixes the class argument handed to the {@link JobConf}
+ * constructor.
+ */
+public class NutchJob extends JobConf {
+
+ /**
+ * Creates the job configuration from an existing configuration.
+ *
+ * @param conf
+ * the configuration passed on to the {@link JobConf} constructor,
+ * together with <code>NutchJob.class</code> (presumably so that
+ * Hadoop can locate the jar containing the Nutch classes - confirm
+ * against the JobConf documentation)
+ */
+ public NutchJob(Configuration conf) {
+ super(conf, NutchJob.class);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
new file mode 100644
index 0000000..8e75177
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.nutch.metadata.Nutch;
+
+/**
+ * Base class for Nutch tools that run one or more Hadoop jobs. It keeps the
+ * job currently being run, a result map and a synchronized status map, and
+ * offers progress reporting plus stop/kill hooks on top of them.
+ */
+public abstract class NutchTool extends Configured {
+
+  protected HashMap<String, Object> results = new HashMap<String, Object>();
+  protected Map<String, Object> status = Collections
+      .synchronizedMap(new HashMap<String, Object>());
+  protected Job currentJob;
+  protected int numJobs;
+  protected int currentJobNum;
+
+  /**
+   * Runs the tool, using a map of arguments. May return results, or null.
+   */
+  public abstract Map<String, Object> run(Map<String, Object> args, String crawlId)
+      throws Exception;
+
+  public NutchTool(Configuration conf) {
+    super(conf);
+  }
+
+  public NutchTool() {
+    super(null);
+  }
+
+  /**
+   * Returns relative progress of the tool, a float in range [0,1]. The value
+   * is also stored in the status map under {@link Nutch#STAT_PROGRESS}.
+   */
+  public float getProgress() {
+    float progress = currentJobProgress();
+    // take into account multiple jobs
+    if (numJobs > 1) {
+      progress = (currentJobNum + progress) / (float) numJobs;
+    }
+    status.put(Nutch.STAT_PROGRESS, progress);
+    return progress;
+  }
+
+  /**
+   * Mean of the map and reduce progress of the current job; 0 when there is
+   * no current job or when the job cannot be queried.
+   */
+  private float currentJobProgress() {
+    if (currentJob == null) {
+      return 0;
+    }
+    try {
+      return (currentJob.mapProgress() + currentJob.reduceProgress()) / 2.0f;
+    } catch (IOException e) {
+      e.printStackTrace();
+    } catch (IllegalStateException ile) {
+      ile.printStackTrace();
+    }
+    return 0;
+  }
+
+  /** Returns current status of the running tool. */
+  public Map<String, Object> getStatus() {
+    return status;
+  }
+
+  /**
+   * Stop the job with the possibility to resume. Subclasses should override
+   * this, since by default it calls {@link #killJob()}.
+   *
+   * @return true if succeeded, false otherwise
+   */
+  public boolean stopJob() throws Exception {
+    return killJob();
+  }
+
+  /**
+   * Kill the job immediately. Clients should assume that any results that the
+   * job produced so far are in inconsistent state or missing.
+   *
+   * @return true if the current job was killed; false if there is no current
+   *         job, the job is already complete, or killing it failed
+   * @throws Exception
+   *           if the completion state of the job cannot be determined
+   */
+  public boolean killJob() throws Exception {
+    if (currentJob == null || currentJob.isComplete()) {
+      return false;
+    }
+    try {
+      currentJob.killJob();
+      return true;
+    } catch (Exception e) {
+      e.printStackTrace();
+      return false;
+    }
+  }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java b/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
new file mode 100644
index 0000000..0277ee6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.HashMap;
+import java.util.WeakHashMap;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * A string-keyed object cache attached to a {@link Configuration} instance.
+ * The caches are held in a static {@link WeakHashMap} keyed by the
+ * configuration; all accessors are synchronized.
+ */
+public class ObjectCache {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ObjectCache.class);
+
+  private static final WeakHashMap<Configuration, ObjectCache> CACHE = new WeakHashMap<Configuration, ObjectCache>();
+
+  private final HashMap<String, Object> objectMap;
+
+  private ObjectCache() {
+    objectMap = new HashMap<String, Object>();
+  }
+
+  /**
+   * Returns the cache registered for the given configuration, creating and
+   * registering a new, empty one if there is none yet.
+   *
+   * @param conf
+   *          the configuration the cache belongs to
+   * @return the cache for <code>conf</code>, never null
+   */
+  public static synchronized ObjectCache get(Configuration conf) {
+    ObjectCache objectCache = CACHE.get(conf);
+    if (objectCache == null) {
+      // parameterized, so conf.toString() only runs when debug is enabled
+      LOG.debug(
+          "No object cache found for conf={}, instantiating a new object cache",
+          conf);
+      objectCache = new ObjectCache();
+      CACHE.put(conf, objectCache);
+    }
+    return objectCache;
+  }
+
+  /**
+   * @param key
+   *          the key the object was stored under
+   * @return the object stored under <code>key</code>, or null if there is none
+   */
+  public synchronized Object getObject(String key) {
+    return objectMap.get(key);
+  }
+
+  /**
+   * Stores an object, replacing any object previously stored under the key.
+   *
+   * @param key
+   *          the key to store the object under
+   * @param value
+   *          the object to store
+   */
+  public synchronized void setObject(String key, Object value) {
+    objectMap.put(key, value);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
new file mode 100644
index 0000000..e323b67
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+/**
+ * A class for efficiently matching <code>String</code>s against a set of
+ * prefixes.
+ */
+public class PrefixStringMatcher extends TrieStringMatcher {
+
+ /**
+ * Creates a new <code>PrefixStringMatcher</code> which will match
+ * <code>String</code>s with any prefix in the supplied array. Zero-length
+ * <code>Strings</code> are ignored.
+ */
+ public PrefixStringMatcher(String[] prefixes) {
+ super();
+ for (int i = 0; i < prefixes.length; i++)
+ addPatternForward(prefixes[i]);
+ }
+
+ /**
+ * Creates a new <code>PrefixStringMatcher</code> which will match
+ * <code>String</code>s with any prefix in the supplied
+ * <code>Collection</code>.
+ *
+ * @throws ClassCastException
+ * if any <code>Object</code>s in the collection are not
+ * <code>String</code>s
+ */
+ public PrefixStringMatcher(Collection<String> prefixes) {
+ super();
+ Iterator<String> iter = prefixes.iterator();
+ while (iter.hasNext())
+ addPatternForward(iter.next());
+ }
+
+ /**
+ * Returns true if the given <code>String</code> is matched by a prefix in the
+ * trie
+ */
+ public boolean matches(String input) {
+ TrieNode node = root;
+ for (int i = 0; i < input.length(); i++) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
+ return false;
+ if (node.isTerminal())
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Returns the shortest prefix of <code>input<code> that is matched,
+ * or <code>null<code> if no match exists.
+ */
+ public String shortestMatch(String input) {
+ TrieNode node = root;
+ for (int i = 0; i < input.length(); i++) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
+ return null;
+ if (node.isTerminal())
+ return input.substring(0, i + 1);
+ }
+ return null;
+ }
+
+ /**
+ * Returns the longest prefix of <code>input<code> that is matched,
+ * or <code>null<code> if no match exists.
+ */
+ public String longestMatch(String input) {
+ TrieNode node = root;
+ String result = null;
+ for (int i = 0; i < input.length(); i++) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
+ break;
+ if (node.isTerminal())
+ result = input.substring(0, i + 1);
+ }
+ return result;
+ }
+
+ public static final void main(String[] argv) {
+ PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] {
+ "abcd", "abc", "aac", "baz", "foo", "foobar" });
+
+ String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
+ "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
+
+ for (int i = 0; i < tests.length; i++) {
+ System.out.println("testing: " + tests[i]);
+ System.out.println(" matches: " + matcher.matches(tests[i]));
+ System.out.println(" shortest: " + matcher.shortestMatch(tests[i]));
+ System.out.println(" longest: " + matcher.longestMatch(tests[i]));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
new file mode 100644
index 0000000..d26cbfc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.metadata.Nutch;
+
+/**
+ * Extracts protocol status code information from the crawl database.
+ *
+ * ProtocolStatusStatistics will give you information on the count
+ * of all status codes encountered on your crawl. This can be useful
+ * for checking a number of things.
+ *
+ * An example output run showing the number of encountered status
+ * codes such as 200, 300, and a count of un-fetched record.
+ *
+ * 38 200
+ * 19 301
+ * 2 302
+ * 665 UNFETCHED
+ *
+ */
+public class ProtocolStatusStatistics extends Configured implements Tool {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(ProtocolStatusStatistics.class);
+
+ private static final Text UNFETCHED_TEXT = new Text("UNFETCHED");
+
+ public static Configuration conf;
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 2) {
+ System.err.println("Usage: ProtocolStatistics inputDirs outDir [numOfReducer]");
+
+ System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
+ System.err.println("\t\t\tE.g.: crawl/crawldb/");
+
+ System.err.println("\toutDir\t\tOutput directory where results should be dumped");
+
+ System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+ return 1;
+ }
+ String inputDir = args[0];
+ String outputDir = args[1];
+
+ int numOfReducers = 1;
+
+ if (args.length > 3) {
+ numOfReducers = Integer.parseInt(args[3]);
+ }
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("ProtocolStatistics: starting at " + sdf.format(start));
+
+ String jobName = "ProtocolStatistics";
+
+ conf = getConf();
+ conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+ Job job = Job.getInstance(conf, jobName);
+ job.setJarByClass(ProtocolStatusStatistics.class);
+
+ String[] inputDirsSpecs = inputDir.split(",");
+ for (int i = 0; i < inputDirsSpecs.length; i++) {
+ File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
+ FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+ }
+
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ FileOutputFormat.setOutputPath(job, new Path(outputDir));
+ job.setOutputFormatClass(TextOutputFormat.class);
+
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(LongWritable.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(LongWritable.class);
+
+ job.setMapperClass(ProtocolStatusStatisticsMapper.class);
+ job.setReducerClass(ProtocolStatusStatisticsReducer.class);
+ job.setCombinerClass(ProtocolStatusStatisticsCombiner.class);
+ job.setNumReduceTasks(numOfReducers);
+
+ try {
+ job.waitForCompletion(true);
+ } catch (Exception e) {
+ throw e;
+ }
+
+ long end = System.currentTimeMillis();
+ LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
+ return 0;
+ }
+
+ static class ProtocolStatusStatisticsMapper extends
+ Mapper<Text, CrawlDatum, Text, LongWritable> {
+
+ public void map(Text urlText, CrawlDatum datum, Context context)
+ throws IOException, InterruptedException {
+ if (datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+ context.write((Text) datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY), new LongWritable(1));
+ } else {
+ context.write(UNFETCHED_TEXT, new LongWritable(1));
+ }
+ }
+ }
+
+ static class ProtocolStatusStatisticsReducer extends
+ Reducer<Text, LongWritable, LongWritable, Text> {
+ public void reduce(Text key, Iterable<LongWritable> values, Context context)
+ throws IOException, InterruptedException {
+ long total = 0;
+
+ for (LongWritable val : values) {
+ total += val.get();
+ }
+
+ context.write(new LongWritable(total), key);
+ }
+ }
+
+ public static class ProtocolStatusStatisticsCombiner extends
+ Reducer<Text, LongWritable, Text, LongWritable> {
+ public void reduce(Text key, Iterable<LongWritable> values, Context context)
+ throws IOException, InterruptedException {
+ long total = 0;
+
+ for (LongWritable val : values) {
+ total += val.get();
+ }
+ context.write(key, new LongWritable(total));
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(NutchConfiguration.create(), new ProtocolStatusStatistics(), args);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
new file mode 100644
index 0000000..149269f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+/**
+ * A collection of String processing utility methods.
+ */
+public class StringUtil {
+
+ /**
+ * Returns a copy of <code>s</code> padded with trailing spaces so that it's
+ * length is <code>length</code>. Strings already <code>length</code>
+ * characters long or longer are not altered.
+ */
+ public static String rightPad(String s, int length) {
+ StringBuffer sb = new StringBuffer(s);
+ for (int i = length - s.length(); i > 0; i--)
+ sb.append(" ");
+ return sb.toString();
+ }
+
+ /**
+ * Returns a copy of <code>s</code> padded with leading spaces so that it's
+ * length is <code>length</code>. Strings already <code>length</code>
+ * characters long or longer are not altered.
+ */
+ public static String leftPad(String s, int length) {
+ StringBuffer sb = new StringBuffer();
+ for (int i = length - s.length(); i > 0; i--)
+ sb.append(" ");
+ sb.append(s);
+ return sb.toString();
+ }
+
+ private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6',
+ '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
+
+ /**
+ * Convenience call for {@link #toHexString(byte[], String, int)}, where
+ * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+ *
+ * @param buf
+ */
+ public static String toHexString(byte[] buf) {
+ return toHexString(buf, null, Integer.MAX_VALUE);
+ }
+
+ /**
+ * Get a text representation of a byte[] as hexadecimal String, where each
+ * pair of hexadecimal digits corresponds to consecutive bytes in the array.
+ *
+ * @param buf
+ * input data
+ * @param sep
+ * separate every pair of hexadecimal digits with this separator, or
+ * null if no separation is needed.
+ * @param lineLen
+ * break the output String into lines containing output for lineLen
+ * bytes.
+ */
+ public static String toHexString(byte[] buf, String sep, int lineLen) {
+ if (buf == null)
+ return null;
+ if (lineLen <= 0)
+ lineLen = Integer.MAX_VALUE;
+ StringBuffer res = new StringBuffer(buf.length * 2);
+ for (int i = 0; i < buf.length; i++) {
+ int b = buf[i];
+ res.append(HEX_DIGITS[(b >> 4) & 0xf]);
+ res.append(HEX_DIGITS[b & 0xf]);
+ if (i > 0 && (i % lineLen) == 0)
+ res.append('\n');
+ else if (sep != null && i < lineLen - 1)
+ res.append(sep);
+ }
+ return res.toString();
+ }
+
+ /**
+ * Convert a String containing consecutive (no inside whitespace) hexadecimal
+ * digits into a corresponding byte array. If the number of digits is not
+ * even, a '0' will be appended in the front of the String prior to
+ * conversion. Leading and trailing whitespace is ignored.
+ *
+ * @param text
+ * input text
+ * @return converted byte array, or null if unable to convert
+ */
+ public static byte[] fromHexString(String text) {
+ text = text.trim();
+ if (text.length() % 2 != 0)
+ text = "0" + text;
+ int resLen = text.length() / 2;
+ int loNibble, hiNibble;
+ byte[] res = new byte[resLen];
+ for (int i = 0; i < resLen; i++) {
+ int j = i << 1;
+ hiNibble = charToNibble(text.charAt(j));
+ loNibble = charToNibble(text.charAt(j + 1));
+ if (loNibble == -1 || hiNibble == -1)
+ return null;
+ res[i] = (byte) (hiNibble << 4 | loNibble);
+ }
+ return res;
+ }
+
+ private static final int charToNibble(char c) {
+ if (c >= '0' && c <= '9') {
+ return c - '0';
+ } else if (c >= 'a' && c <= 'f') {
+ return 0xa + (c - 'a');
+ } else if (c >= 'A' && c <= 'F') {
+ return 0xA + (c - 'A');
+ } else {
+ return -1;
+ }
+ }
+
+ /**
+ * Checks if a string is empty (ie is null or empty).
+ */
+ public static boolean isEmpty(String str) {
+ return (str == null) || (str.equals(""));
+ }
+
+ /**
+ * Simple character substitution which cleans all \ufffd chars from a given String.
+ */
+ public static String cleanField(String value) {
+ return value.replaceAll("\ufffd", "");
+ }
+
+ public static void main(String[] args) {
+ if (args.length != 1)
+ System.out.println("Usage: StringUtil <encoding name>");
+ else
+ System.out.println(args[0] + " is resolved to "
+ + EncodingDetector.resolveEncodingAlias(args[0]));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
new file mode 100644
index 0000000..a967c01
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+/**
+ * A class for efficiently matching <code>String</code>s against a set of
+ * suffixes. Zero-length <code>Strings</code> are ignored.
+ */
+public class SuffixStringMatcher extends TrieStringMatcher {
+
+ /**
+ * Creates a new <code>PrefixStringMatcher</code> which will match
+ * <code>String</code>s with any suffix in the supplied array.
+ */
+ public SuffixStringMatcher(String[] suffixes) {
+ super();
+ for (int i = 0; i < suffixes.length; i++)
+ addPatternBackward(suffixes[i]);
+ }
+
+ /**
+ * Creates a new <code>PrefixStringMatcher</code> which will match
+ * <code>String</code>s with any suffix in the supplied
+ * <code>Collection</code>
+ */
+ public SuffixStringMatcher(Collection<String> suffixes) {
+ super();
+ Iterator<String> iter = suffixes.iterator();
+ while (iter.hasNext())
+ addPatternBackward(iter.next());
+ }
+
+ /**
+ * Returns true if the given <code>String</code> is matched by a suffix in the
+ * trie
+ */
+ public boolean matches(String input) {
+ TrieNode node = root;
+ for (int i = input.length() - 1; i >= 0; i--) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
+ return false;
+ if (node.isTerminal())
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Returns the shortest suffix of <code>input<code> that is matched,
+ * or <code>null<code> if no match exists.
+ */
+ public String shortestMatch(String input) {
+ TrieNode node = root;
+ for (int i = input.length() - 1; i >= 0; i--) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
+ return null;
+ if (node.isTerminal())
+ return input.substring(i);
+ }
+ return null;
+ }
+
+ /**
+ * Returns the longest suffix of <code>input<code> that is matched,
+ * or <code>null<code> if no match exists.
+ */
+ public String longestMatch(String input) {
+ TrieNode node = root;
+ String result = null;
+ for (int i = input.length() - 1; i >= 0; i--) {
+ node = node.getChild(input.charAt(i));
+ if (node == null)
+ break;
+ if (node.isTerminal())
+ result = input.substring(i);
+ }
+ return result;
+ }
+
+ public static final void main(String[] argv) {
+ SuffixStringMatcher matcher = new SuffixStringMatcher(new String[] { "a",
+ "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar" });
+
+ String[] tests = { "a", "ac", "abcd", "abcdefg", "apple", "aa", "aac",
+ "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
+
+ for (int i = 0; i < tests.length; i++) {
+ System.out.println("testing: " + tests[i]);
+ System.out.println(" matches: " + matcher.matches(tests[i]));
+ System.out.println(" shortest: " + matcher.shortestMatch(tests[i]));
+ System.out.println(" longest: " + matcher.longestMatch(tests[i]));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
new file mode 100644
index 0000000..68ded69
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
@@ -0,0 +1,161 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.util;
+
+import org.apache.commons.lang.StringUtils;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+
+ /**
+  * Helpers for converting URLs and host names to and from the "reversed" form
+  * in which the dot-separated parts of the host appear in reverse order.
+  */
+ public class TableUtil {
+
+   public static final ByteBuffer YES_VAL = ByteBuffer.wrap(new byte[] { 'y' });
+
+   /**
+    * Reverses a url's domain. This form is better for storing in hbase. Because
+    * scans within the same domain are faster.
+    * <p>
+    * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
+    * "com.foo.bar:http:8983/to/index.html?a=b".
+    *
+    * @param urlString
+    *          url to be reversed
+    * @return Reversed url
+    * @throws MalformedURLException
+    *           if <code>urlString</code> cannot be parsed as a {@link URL}
+    */
+   public static String reverseUrl(String urlString)
+       throws MalformedURLException {
+     URL parsed = new URL(urlString);
+     return reverseUrl(parsed);
+   }
+
+   /**
+    * Reverses a url's domain. This form is better for storing in hbase. Because
+    * scans within the same domain are faster.
+    * <p>
+    * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
+    * "com.foo.bar:http:8983/to/index.html?a=b".
+    *
+    * @param url
+    *          url to be reversed
+    * @return Reversed url
+    */
+   public static String reverseUrl(URL url) {
+     StringBuilder reversed = new StringBuilder();
+
+     // <reversed host>:<protocol>
+     reverseAppendSplits(url.getHost(), reversed);
+     reversed.append(':').append(url.getProtocol());
+
+     // :<port>, only when the url names one explicitly
+     int port = url.getPort();
+     if (port != -1) {
+       reversed.append(':').append(port);
+     }
+
+     // path and query; a '/' is inserted when the file part lacks a leading one
+     String file = url.getFile();
+     if (!file.isEmpty() && file.charAt(0) != '/') {
+       reversed.append('/');
+     }
+     return reversed.append(file).toString();
+   }
+
+   /**
+    * Turns a reversed url back into its regular form, e.g.
+    * "com.foo.bar:http:8983/to/index.html?a=b" becomes
+    * "http://bar.foo.com:8983/to/index.html?a=b".
+    *
+    * @param reversedUrl
+    *          url in the form produced by {@link #reverseUrl(URL)}
+    * @return the regular url
+    */
+   public static String unreverseUrl(String reversedUrl) {
+     StringBuilder buf = new StringBuilder(reversedUrl.length() + 2);
+
+     // Everything before the first '/' is "<reversed host>:<protocol>[:<port>]".
+     int slash = reversedUrl.indexOf('/');
+     int pathBegin = (slash == -1) ? reversedUrl.length() : slash;
+     String[] parts = StringUtils.splitPreserveAllTokens(
+         reversedUrl.substring(0, pathBegin), ':');
+
+     buf.append(parts[1]).append("://"); // protocol
+     reverseAppendSplits(parts[0], buf); // reversed host, put back in order
+     if (parts.length == 3) { // has a port
+       buf.append(':').append(parts[2]);
+     }
+     buf.append(reversedUrl.substring(pathBegin));
+     return buf.toString();
+   }
+
+   /**
+    * Given a reversed url, returns the reversed host E.g
+    * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar"
+    *
+    * @param reversedUrl
+    *          Reversed url
+    * @return Reversed host
+    */
+   public static String getReversedHost(String reversedUrl) {
+     int firstColon = reversedUrl.indexOf(':');
+     return reversedUrl.substring(0, firstColon);
+   }
+
+   /**
+    * Appends the '.'-separated parts of <code>string</code> to
+    * <code>buf</code> in reverse order. When splitting yields no parts,
+    * <code>string</code> is appended unchanged.
+    */
+   private static void reverseAppendSplits(String string, StringBuilder buf) {
+     String[] parts = StringUtils.split(string, '.');
+     if (parts.length == 0) {
+       buf.append(string);
+       return;
+     }
+     for (int i = parts.length - 1; i >= 0; i--) {
+       buf.append(parts[i]);
+       if (i > 0) {
+         buf.append('.');
+       }
+     }
+   }
+
+   /**
+    * Reverses the order of the dot-separated parts of a host name, e.g.
+    * "bar.foo.com" becomes "com.foo.bar".
+    *
+    * @param hostName
+    *          host name to be reversed
+    * @return Reversed host name
+    */
+   public static String reverseHost(String hostName) {
+     StringBuilder reversed = new StringBuilder();
+     reverseAppendSplits(hostName, reversed);
+     return reversed.toString();
+   }
+
+   /**
+    * Restores a host name reversed by {@link #reverseHost(String)}.
+    *
+    * @param reversedHostName
+    *          reversed host name
+    * @return the host name in its regular order
+    */
+   public static String unreverseHost(String reversedHostName) {
+     // Reversing the parts a second time restores the original order.
+     return reverseHost(reversedHostName);
+   }
+
+   /**
+    * Converts the given character sequence to a String and cleans out any
+    * offending "\ufffd" from the String.
+    *
+    * @param utf8
+    *          character sequence to convert
+    * @return the cleaned string, or null if <code>utf8</code> is null
+    */
+   public static String toString(CharSequence utf8) {
+     if (utf8 == null) {
+       return null;
+     }
+     return StringUtil.cleanField(utf8.toString());
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
new file mode 100644
index 0000000..c4af356
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.concurrent.TimeUnit;
+
+ /**
+  * Helpers for rendering elapsed times as human readable strings.
+  */
+ public class TimingUtil {
+
+   private static final long SECONDS_PER_MINUTE = 60L;
+   private static final long MINUTES_PER_HOUR = 60L;
+   private static final long SECONDS_PER_HOUR = 3600L;
+   private static final long SECONDS_PER_DAY = 86400L;
+
+   /**
+    * Calculate the elapsed time between two times specified in milliseconds.
+    *
+    * @param start
+    *          The start of the time period
+    * @param end
+    *          The end of the time period
+    * @return the elapsed time formatted as "hh:mm:ss" (fractions of a second
+    *         are dropped), or null if start &gt; end.
+    */
+   public static String elapsedTime(long start, long end) {
+     if (start > end) {
+       return null;
+     }
+     long elapsedSeconds = (end - start) / 1000;
+     return secondsToHMS(elapsedSeconds);
+   }
+
+   /**
+    * Show time in seconds as hours, minutes and seconds (hh:mm:ss)
+    *
+    * @param seconds
+    *          (elapsed) time in seconds
+    * @return human readable time string "hh:mm:ss"; the hours field is not
+    *         capped and grows beyond two digits when needed
+    */
+   public static String secondsToHMS(long seconds) {
+     long hours = seconds / SECONDS_PER_HOUR;
+     long minutes = (seconds / SECONDS_PER_MINUTE) % MINUTES_PER_HOUR;
+     long secs = seconds % SECONDS_PER_MINUTE;
+     return String.format("%02d:%02d:%02d", hours, minutes, secs);
+   }
+
+   /**
+    * Show time in seconds as days, hours, minutes and seconds (d days, hh:mm:ss)
+    *
+    * @param seconds
+    *          (elapsed) time in seconds
+    * @return human readable time string "d days, hh:mm:ss", or just "hh:mm:ss"
+    *         when the time is shorter than one day
+    */
+   public static String secondsToDaysHMS(long seconds) {
+     long days = seconds / SECONDS_PER_DAY;
+     if (days != 0) {
+       String hhmmss = secondsToHMS(seconds % SECONDS_PER_DAY);
+       return String.format("%d days, %s", days, hhmmss);
+     }
+     return secondsToHMS(seconds);
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
new file mode 100644
index 0000000..95f06ad
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
@@ -0,0 +1,202 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.ListIterator;
+
+/**
+ * TrieStringMatcher is a base class for simple tree-based string matching.
+ *
+ */
+public abstract class TrieStringMatcher {
+ protected TrieNode root;
+
+ protected TrieStringMatcher() {
+ this.root = new TrieNode('\000', false);
+ }
+
+ /**
+ * Node class for the character tree.
+ */
+ protected class TrieNode implements Comparable<TrieNode> {
+ protected TrieNode[] children;
+ protected LinkedList<TrieNode> childrenList;
+ protected char nodeChar;
+ protected boolean terminal;
+
+ /**
+ * Creates a new TrieNode, which contains the given <code>nodeChar</code>.
+ * If <code>isTerminal</code> is <code>true</code>, the new node is a
+ * <em>terminal</em> node in the trie.
+ */
+ TrieNode(char nodeChar, boolean isTerminal) {
+ this.nodeChar = nodeChar;
+ this.terminal = isTerminal;
+ this.childrenList = new LinkedList<TrieNode>();
+ }
+
+ /**
+ * Returns <code>true</code> if this node is a <em>terminal</em> node in the
+ * trie.
+ */
+ boolean isTerminal() {
+ return terminal;
+ }
+
+ /**
+ * Returns the child node of this node whose node-character is
+ * <code>nextChar</code>. If no such node exists, one will be is added. If
+ * <em>isTerminal</em> is <code>true</code>, the node will be a terminal
+ * node in the trie.
+ */
+ TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) {
+ if (childrenList == null) {
+ childrenList = new LinkedList<TrieNode>();
+ childrenList.addAll(Arrays.asList(children));
+ children = null;
+ }
+
+ if (childrenList.size() == 0) {
+ TrieNode newNode = new TrieNode(nextChar, isTerminal);
+ childrenList.add(newNode);
+ return newNode;
+ }
+
+ ListIterator<TrieNode> iter = childrenList.listIterator();
+ TrieNode node = iter.next();
+ while ((node.nodeChar < nextChar) && iter.hasNext())
+ node = iter.next();
+
+ if (node.nodeChar == nextChar) {
+ node.terminal = node.terminal | isTerminal;
+ return node;
+ }
+
+ if (node.nodeChar > nextChar)
+ iter.previous();
+
+ TrieNode newNode = new TrieNode(nextChar, isTerminal);
+ iter.add(newNode);
+ return newNode;
+ }
+
+ /**
+ * Returns the child node of this node whose node-character is
+ * <code>nextChar</code>. If no such node exists, <code>null</code> is
+ * returned.
+ */
+ TrieNode getChild(char nextChar) {
+ if (children == null) {
+ children = childrenList.toArray(new TrieNode[childrenList.size()]);
+ childrenList = null;
+ Arrays.sort(children);
+ }
+
+ int min = 0;
+ int max = children.length - 1;
+ int mid = 0;
+ while (min < max) {
+ mid = (min + max) / 2;
+ if (children[mid].nodeChar == nextChar)
+ return children[mid];
+ if (children[mid].nodeChar < nextChar)
+ min = mid + 1;
+ else
+ // if (children[mid].nodeChar > nextChar)
+ max = mid - 1;
+ }
+
+ if (min == max)
+ if (children[min].nodeChar == nextChar)
+ return children[min];
+
+ return null;
+ }
+
+ public int compareTo(TrieNode other) {
+ if (this.nodeChar < other.nodeChar)
+ return -1;
+ if (this.nodeChar == other.nodeChar)
+ return 0;
+ // if (this.nodeChar > other.nodeChar)
+ return 1;
+ }
+ }
+
+ /**
+ * Returns the next {@link TrieNode} visited, given that you are at
+ * <code>node</code>, and the the next character in the input is the
+ * <code>idx</code>'th character of <code>s</code>.
+ */
+ protected final TrieNode matchChar(TrieNode node, String s, int idx) {
+ return node.getChild(s.charAt(idx));
+ }
+
+ /**
+ * Adds any necessary nodes to the trie so that the given <code>String</code>
+ * can be decoded and the last character is represented by a terminal node.
+ * Zero-length <code>Strings</code> are ignored.
+ */
+ protected final void addPatternForward(String s) {
+ TrieNode node = root;
+ int stop = s.length() - 1;
+ int i;
+ if (s.length() > 0) {
+ for (i = 0; i < stop; i++)
+ node = node.getChildAddIfNotPresent(s.charAt(i), false);
+ node = node.getChildAddIfNotPresent(s.charAt(i), true);
+ }
+ }
+
+ /**
+ * Adds any necessary nodes to the trie so that the given <code>String</code>
+ * can be decoded <em>in reverse</em> and the first character is represented
+ * by a terminal node. Zero-length <code>Strings</code> are ignored.
+ */
+ protected final void addPatternBackward(String s) {
+ TrieNode node = root;
+ if (s.length() > 0) {
+ for (int i = s.length() - 1; i > 0; i--)
+ node = node.getChildAddIfNotPresent(s.charAt(i), false);
+ node = node.getChildAddIfNotPresent(s.charAt(0), true);
+ }
+ }
+
+ /**
+ * Returns true if the given <code>String</code> is matched by a pattern in
+ * the trie
+ */
+ public abstract boolean matches(String input);
+
+ /**
+ * Returns the shortest substring of <code>input<code> that is
+ * matched by a pattern in the trie, or <code>null<code> if no match
+ * exists.
+ */
+ public abstract String shortestMatch(String input);
+
+ /**
+ * Returns the longest substring of <code>input<code> that is
+ * matched by a pattern in the trie, or <code>null<code> if no match
+ * exists.
+ */
+ public abstract String longestMatch(String input);
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
new file mode 100644
index 0000000..3e696cb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
@@ -0,0 +1,533 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.MalformedURLException;
+import java.net.*;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+/** Utility class for URL analysis */
+public class URLUtil {
+
+ /**
+  * Resolves a (possibly relative) target against a base URL, working around
+  * the way java.net.URL handles targets that consist of a query only.
+  *
+  * <p>
+  * Example: the page
+  * <code>http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&amp;sk=0</code>
+  * contains links of the form <code>href="?co=0&amp;sk=0&amp;pg=1"</code>.
+  * By default URL combines base and target to
+  * <code>http://careers3.accenture.com/Careers/ASPX/?co=0&amp;sk=0&amp;pg=1</code>,
+  * incorrectly dropping <code>Search.aspx</code>. Browsers handle these links
+  * fine, so pure query targets get the same special treatment here.
+  * </p>
+  *
+  * @param base
+  *          base url
+  * @param target
+  *          target url (may be relative); leading and trailing whitespace is
+  *          trimmed before resolving
+  * @return resolved absolute url.
+  * @throws MalformedURLException
+  *           if base and target cannot be combined into a valid URL
+  */
+ public static URL resolveURL(URL base, String target)
+     throws MalformedURLException {
+   final String trimmed = target.trim();
+
+   // A target that is only a query string needs the last path segment of
+   // the base put back in front of it; everything else resolves normally.
+   return trimmed.startsWith("?")
+       ? fixPureQueryTargets(base, trimmed)
+       : new URL(base, trimmed);
+ }
+
+ /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
+ static URL fixPureQueryTargets(URL base, String target)
+ throws MalformedURLException {
+ if (!target.startsWith("?"))
+ return new URL(base, target);
+
+ String basePath = base.getPath();
+ String baseRightMost = "";
+ int baseRightMostIdx = basePath.lastIndexOf("/");
+ if (baseRightMostIdx != -1) {
+ baseRightMost = basePath.substring(baseRightMostIdx + 1);
+ }
+
+ if (target.startsWith("?"))
+ target = baseRightMost + target;
+
+ return new URL(base, target);
+ }
+
+ private static Pattern IP_PATTERN = Pattern
+ .compile("(\\d{1,3}\\.){3}(\\d{1,3})");
+
+ /**
+  * Returns the domain name of the url. The domain name of a url is the
+  * substring of the url's hostname, w/o subdomain names. As an example <br>
+  * <code>
+  * getDomainName(new URL("http://lucene.apache.org/"))
+  * </code><br>
+  * will return <br>
+  * <code> apache.org</code>
+  *
+  * <p>
+  * A single trailing dot is removed from the host first. A host in dotted
+  * IPv4 notation is returned as is. If no part of the host is reported as a
+  * domain suffix by {@link DomainSuffixes}, the last label of the host is
+  * returned.
+  * </p>
+  *
+  * @param url
+  *          the url to analyze
+  * @return the domain name of the url's host
+  */
+ public static String getDomainName(URL url) {
+   final DomainSuffixes suffixes = DomainSuffixes.getInstance();
+
+   // it seems that java returns hostnames ending with .
+   String hostName = url.getHost();
+   if (hostName.endsWith(".")) {
+     hostName = hostName.substring(0, hostName.length() - 1);
+   }
+
+   if (IP_PATTERN.matcher(hostName).matches()) {
+     return hostName;
+   }
+
+   // Drop one leading label per round; stop at the first candidate whose
+   // remainder (everything after its first dot) is a known suffix.
+   String current = hostName;
+   int dot;
+   do {
+     dot = current.indexOf('.');
+     final String remainder = current.substring(dot + 1);
+     if (suffixes.isDomainSuffix(remainder)) {
+       return current;
+     }
+     current = remainder;
+   } while (dot >= 0);
+
+   return current;
+ }
+
+ /**
+  * Returns the domain name of the url given as a string. The domain name of
+  * a url is the substring of the url's hostname, w/o subdomain names. As an
+  * example <br>
+  * <code>
+  * getDomainName("http://lucene.apache.org/")
+  * </code><br>
+  * will return <br>
+  * <code> apache.org</code>
+  *
+  * @param url
+  *          the url to analyze
+  * @return the domain name of the url's host
+  * @throws MalformedURLException
+  *           if the string cannot be parsed as a URL
+  */
+ public static String getDomainName(String url) throws MalformedURLException {
+   final URL parsed = new URL(url);
+   return getDomainName(parsed);
+ }
+
+ /**
+  * Returns the top level domain name of the url, i.e. the part of the url's
+  * domain suffix after its last dot (or the whole suffix if it contains no
+  * dot). As an example <br>
+  * <code>
+  * getTopLevelDomainName(new URL("http://lucene.apache.org/"))
+  * </code><br>
+  * will return <br>
+  * <code> org</code>
+  *
+  * @param url
+  *          the url to analyze
+  * @return the top level domain name, or <code>null</code> if
+  *         {@link #getDomainSuffix(URL)} finds no suffix for the host (for
+  *         example when the host is an IPv4 address)
+  * @throws MalformedURLException
+  *           declared for compatibility with existing callers
+  */
+ public static String getTopLevelDomainName(URL url)
+     throws MalformedURLException {
+   // getDomainSuffix returns null for IP hosts and unknown suffixes;
+   // dereferencing it unconditionally used to throw NullPointerException.
+   DomainSuffix domainSuffix = getDomainSuffix(url);
+   if (domainSuffix == null) {
+     return null;
+   }
+
+   String suffix = domainSuffix.toString();
+   int idx = suffix.lastIndexOf('.');
+   return (idx != -1) ? suffix.substring(idx + 1) : suffix;
+ }
+
+ /**
+  * Returns the top level domain name of the url given as a string. As an
+  * example <br>
+  * <code>
+  * getTopLevelDomainName("http://lucene.apache.org/")
+  * </code><br>
+  * will return <br>
+  * <code> org</code>
+  *
+  * @param url
+  *          the url to analyze
+  * @return the top level domain name of the url's host
+  * @throws MalformedURLException
+  *           if the string cannot be parsed as a URL
+  */
+ public static String getTopLevelDomainName(String url)
+     throws MalformedURLException {
+   final URL parsed = new URL(url);
+   return getTopLevelDomainName(parsed);
+ }
+
+ /**
+  * Returns whether the given urls have the same domain name, ignoring case.
+  * As an example, <br>
+  * <code> isSameDomainName(new URL("http://lucene.apache.org"),
+  * new URL("http://people.apache.org/"))</code> <br>
+  * will return true.
+  *
+  * @param url1
+  *          the first url
+  * @param url2
+  *          the second url
+  * @return true if the domain names are equal
+  */
+ public static boolean isSameDomainName(URL url1, URL url2) {
+   final String firstDomain = getDomainName(url1);
+   final String secondDomain = getDomainName(url2);
+   return firstDomain.equalsIgnoreCase(secondDomain);
+ }
+
+ /**
+  * Returns whether the given urls, passed as strings, have the same domain
+  * name, ignoring case. As an example, <br>
+  * <code> isSameDomainName("http://lucene.apache.org",
+  * "http://people.apache.org/")</code> <br>
+  * will return true.
+  *
+  * @param url1
+  *          the first url
+  * @param url2
+  *          the second url
+  * @return true if the domain names are equal
+  * @throws MalformedURLException
+  *           if either string cannot be parsed as a URL
+  */
+ public static boolean isSameDomainName(String url1, String url2)
+     throws MalformedURLException {
+   final URL first = new URL(url1);
+   final URL second = new URL(url2);
+   return isSameDomainName(first, second);
+ }
+
+ /**
+  * Returns the {@link DomainSuffix} corresponding to the last public part of
+  * the hostname.
+  *
+  * <p>
+  * A single trailing dot is removed from the host first (as
+  * {@link #getDomainName(URL)} does), so that a fully qualified host such as
+  * <code>apache.org.</code> resolves to the same suffix as
+  * <code>apache.org</code>.
+  * </p>
+  *
+  * @param url
+  *          the url to analyze
+  * @return the first suffix found when dropping one leading host label at a
+  *         time, or <code>null</code> if the host is in dotted IPv4 notation
+  *         or no suffix is known to {@link DomainSuffixes}
+  */
+ public static DomainSuffix getDomainSuffix(URL url) {
+   DomainSuffixes tlds = DomainSuffixes.getInstance();
+   String host = url.getHost();
+   // it seems that java returns hostnames ending with . ; without this the
+   // lookups below would be done on "org." instead of "org"
+   if (host.endsWith(".")) {
+     host = host.substring(0, host.length() - 1);
+   }
+   if (IP_PATTERN.matcher(host).matches()) {
+     return null;
+   }
+
+   // Drop one leading label per round and look up what remains after the
+   // first dot; the first hit wins.
+   String candidate = host;
+   int index;
+   do {
+     index = candidate.indexOf('.');
+     String subCandidate = candidate.substring(index + 1);
+     DomainSuffix d = tlds.get(subCandidate);
+     if (d != null) {
+       return d;
+     }
+     candidate = subCandidate;
+   } while (index >= 0);
+
+   return null;
+ }
+
+ /**
+  * Returns the {@link DomainSuffix} corresponding to the last public part of
+  * the hostname of the url given as a string.
+  *
+  * @param url
+  *          the url to analyze
+  * @return the domain suffix as returned by {@link #getDomainSuffix(URL)}
+  * @throws MalformedURLException
+  *           if the string cannot be parsed as a URL
+  */
+ public static DomainSuffix getDomainSuffix(String url)
+     throws MalformedURLException {
+   final URL parsed = new URL(url);
+   return getDomainSuffix(parsed);
+ }
+
+ /**
+  * Partitions of the hostname of the url by ".". A host in dotted IPv4
+  * notation is not split but returned as a single segment.
+  *
+  * @param url
+  *          the url whose host is split
+  * @return the segments of the host
+  */
+ public static String[] getHostSegments(URL url) {
+   final String hostName = url.getHost();
+
+   // return whole hostname, if it is an ipv4
+   // TODO : handle ipv6
+   final boolean isIpv4 = IP_PATTERN.matcher(hostName).matches();
+   return isIpv4 ? new String[] { hostName } : hostName.split("\\.");
+ }
+
+ /**
+  * Partitions of the hostname of the url, given as a string, by ".".
+  *
+  * @param url
+  *          the url whose host is split
+  * @return the segments of the host
+  * @throws MalformedURLException
+  *           if the string cannot be parsed as a URL
+  */
+ public static String[] getHostSegments(String url)
+     throws MalformedURLException {
+   final URL parsed = new URL(url);
+   return getHostSegments(parsed);
+ }
+
+ /**
+  * <p>
+  * Given two urls, a src and a destination of a redirect, it returns the
+  * representative url.
+  * </p>
+  *
+  * <p>
+  * This method implements an extended version of the algorithm used by the
+  * Yahoo! Slurp crawler described here:<br>
+  * <a href=
+  * "http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html">How
+  * does the Yahoo! webcrawler handle redirects?</a>
+  * </p>
+  *
+  * <p>
+  * In the examples the url that is kept is marked with <code>*</code>. A url
+  * is "root" if its file part (path plus query) is empty or <code>/</code>.
+  * Domain names and host names are compared ignoring case.
+  * </p>
+  * <ol>
+  * <li>Choose the destination url if either url is malformed.</li>
+  * <li>If the domains differ then keep the destination whether or not the
+  * redirect is temp or perm
+  * <ul>
+  * <li>a.com -&gt; b.com*</li>
+  * </ul>
+  * </li>
+  * <li>If the redirect is permanent and the source is root, keep the source.
+  * <ul>
+  * <li>*a.com -&gt; a.com?y=1 || *a.com -&gt; a.com/xyz/index.html</li>
+  * </ul>
+  * </li>
+  * <li>If the redirect is permanent and the source is not root and the
+  * destination is root, keep the destination
+  * <ul>
+  * <li>a.com/xyz/index.html -&gt; a.com*</li>
+  * </ul>
+  * </li>
+  * <li>If the redirect is permanent and neither the source nor the
+  * destination is root, then keep the destination
+  * <ul>
+  * <li>a.com/xyz/index.html -&gt; a.com/abc/page.html*</li>
+  * </ul>
+  * </li>
+  * <li>If the redirect is temporary and source is root and destination is not
+  * root, then keep the source
+  * <ul>
+  * <li>*a.com -&gt; a.com/xyz/index.html</li>
+  * </ul>
+  * </li>
+  * <li>If the redirect is temporary and source is not root and destination is
+  * root, then keep the destination
+  * <ul>
+  * <li>a.com/xyz/index.html -&gt; a.com*</li>
+  * </ul>
+  * </li>
+  * <li>If the redirect is temporary and neither the source nor the
+  * destination is root, then keep the shortest url. If the hosts are equal
+  * the file parts are compared, first by the number of "/"-separated segments
+  * and, if that is equal, by length. If the hosts differ the host with fewer
+  * "."-separated segments wins. The destination is only kept when it is
+  * strictly shorter.
+  * <ul>
+  * <li>a.com/xyz/index.html -&gt; a.com/abc/page.html*</li>
+  * <li>*www.a.com/xyz/index.html -&gt; www.news.a.com/xyz/index.html</li>
+  * </ul>
+  * </li>
+  * <li>If the redirect is temporary and both the source and the destination
+  * are root, then keep the shortest sub-domain
+  * <ul>
+  * <li>*www.a.com -&gt; www.news.a.com</li>
+  * </ul>
+  * </li>
+  * </ol>
+  *
+  * <p>
+  * While not in this logic there is a further piece of representative url
+  * logic that occurs during indexing and after scoring. During creation of the
+  * basic fields before indexing, if a url has a representative url stored we
+  * check both the url and its representative url (which should never be the
+  * same) against their linkrank scores and the highest scoring one is kept as
+  * the url and the lower scoring one is held as the orig url inside of the
+  * index.
+  * </p>
+  *
+  * @param src
+  *          The source url.
+  * @param dst
+  *          The destination url.
+  * @param temp
+  *          Is the redirect a temporary redirect.
+  *
+  * @return String The representative url.
+  */
+ public static String chooseRepr(String src, String dst, boolean temp) {
+
+   // validate both are well formed urls
+   URL srcUrl;
+   URL dstUrl;
+   try {
+     srcUrl = new URL(src);
+     dstUrl = new URL(dst);
+   } catch (MalformedURLException e) {
+     return dst;
+   }
+
+   // get the source and destination domain, host, and page
+   String srcDomain = URLUtil.getDomainName(srcUrl);
+   String dstDomain = URLUtil.getDomainName(dstUrl);
+   String srcHost = srcUrl.getHost();
+   String dstHost = dstUrl.getHost();
+   String srcFile = srcUrl.getFile();
+   String dstFile = dstUrl.getFile();
+
+   // are the source and destination the root path url.com/ or url.com
+   boolean srcRoot = (srcFile.equals("/") || srcFile.length() == 0);
+   boolean destRoot = (dstFile.equals("/") || dstFile.length() == 0);
+
+   // if we are dealing with a redirect from one domain to another keep the
+   // destination. Host names are case-insensitive and URL.getHost() keeps
+   // the case of the input, so the comparison must ignore case; otherwise
+   // A.com -> a.com/x would be treated as a cross-domain redirect.
+   if (!srcDomain.equalsIgnoreCase(dstDomain)) {
+     return dst;
+   }
+
+   // permanent redirect: if source is root return source, otherwise
+   // destination
+   if (!temp) {
+     return srcRoot ? src : dst;
+   }
+
+   // temporary redirect from here on
+
+   // source root and destination not root
+   if (srcRoot && !destRoot) {
+     return src;
+   }
+
+   // destination root and source not
+   if (!srcRoot && destRoot) {
+     return dst;
+   }
+
+   if (!srcRoot && !destRoot && srcHost.equalsIgnoreCase(dstHost)) {
+
+     // same host and neither is root: compare the file parts, first by
+     // number of "/"-separated segments, then by length
+     int numSrcPaths = srcFile.split("/").length;
+     int numDstPaths = dstFile.split("/").length;
+     if (numSrcPaths != numDstPaths) {
+       return (numDstPaths < numSrcPaths ? dst : src);
+     }
+     int srcPathLength = srcFile.length();
+     int dstPathLength = dstFile.length();
+     return (dstPathLength < srcPathLength ? dst : src);
+   }
+
+   // both root, or neither root with different hosts: take the host with
+   // the fewest "."-separated segments
+   int numSrcSubs = srcHost.split("\\.").length;
+   int numDstSubs = dstHost.split("\\.").length;
+   return (numDstSubs < numSrcSubs ? dst : src);
+ }
+
+ /**
+  * Returns the lowercased hostname for the url or null if the url is not well
+  * formed. Lowercasing is done with {@link Locale#ROOT} so the result does
+  * not depend on the default locale of the JVM (with a Turkish default
+  * locale "I" would otherwise be turned into a dotless "i").
+  *
+  * @param url
+  *          The url to check.
+  * @return String The hostname for the url.
+  */
+ public static String getHost(String url) {
+   try {
+     return new URL(url).getHost().toLowerCase(Locale.ROOT);
+   } catch (MalformedURLException e) {
+     // not a well formed url: signalled to the caller as null
+     return null;
+   }
+ }
+
+ /**
+  * Returns the page for the url: the complete url, lowercased, with the query
+  * string removed. Note that the whole url is lowercased, including the
+  * path, and that a fragment, if present, is kept. Lowercasing is done with
+  * {@link Locale#ROOT} so the result does not depend on the default locale of
+  * the JVM.
+  *
+  * @param url
+  *          The url to check.
+  * @return String The page for the url, or null if the url is not well
+  *         formed.
+  */
+ public static String getPage(String url) {
+   try {
+     // get the full url, and replace the query string with an empty string
+     url = url.toLowerCase(Locale.ROOT);
+     String queryStr = new URL(url).getQuery();
+     return (queryStr != null) ? url.replace("?" + queryStr, "") : url;
+   } catch (MalformedURLException e) {
+     // not a well formed url: signalled to the caller as null
+     return null;
+   }
+ }
+
+ public static String getProtocol(String url) {
+ try {
+ return getProtocol(new URL(url));
+ } catch (Exception e) {
+ return null;
+ }
+ }
+
+ /**
+  * Returns the protocol of the url.
+  *
+  * @param url
+  *          The url to check.
+  * @return String The protocol as reported by {@link URL#getProtocol()}.
+  */
+ public static String getProtocol(URL url) {
+   final String protocol = url.getProtocol();
+   return protocol;
+ }
+
+ /**
+  * Converts the host name of the url to its ASCII form using
+  * {@link IDN#toASCII(String)} and rebuilds the url from its components.
+  *
+  * @param url
+  *          The url to convert.
+  * @return String The url with an ASCII host name; the unchanged input if the
+  *         url has no host; null if the url cannot be parsed or converted.
+  */
+ public static String toASCII(String url) {
+   try {
+     final URL parsed = new URL(url);
+     final String hostName = parsed.getHost();
+     final boolean hasHost = hostName != null && !hostName.isEmpty();
+
+     if (!hasHost) {
+       // no host name => no punycoded domain name
+       // also do not add additional slashes for file: URLs (NUTCH-1880)
+       return url;
+     }
+
+     final URI converted = new URI(parsed.getProtocol(),
+         parsed.getUserInfo(), IDN.toASCII(hostName), parsed.getPort(),
+         parsed.getPath(), parsed.getQuery(), parsed.getRef());
+     return converted.toString();
+   } catch (Exception e) {
+     // any failure is reported to the caller as null
+     return null;
+   }
+ }
+
+ /**
+  * Converts the host name of the url to its Unicode form using
+  * {@link IDN#toUnicode(String)} and rebuilds the url as
+  * <code>protocol://[userinfo@]host[:port]file[#ref]</code>.
+  *
+  * @param url
+  *          The url to convert.
+  * @return String The url with a Unicode host name; the unchanged input if
+  *         the url has no host; null if the url cannot be parsed or
+  *         converted.
+  */
+ public static String toUNICODE(String url) {
+   try {
+     final URL parsed = new URL(url);
+     final String hostName = parsed.getHost();
+
+     if (hostName == null || hostName.isEmpty()) {
+       // no host name => no punycoded domain name
+       // also do not add additional slashes for file: URLs (NUTCH-1880)
+       return url;
+     }
+
+     final StringBuilder out = new StringBuilder();
+     out.append(parsed.getProtocol()).append("://");
+
+     final String userInfo = parsed.getUserInfo();
+     if (userInfo != null) {
+       out.append(userInfo).append('@');
+     }
+
+     out.append(IDN.toUnicode(hostName));
+
+     final int port = parsed.getPort();
+     if (port != -1) {
+       out.append(':').append(port);
+     }
+
+     out.append(parsed.getFile()); // includes query
+
+     final String ref = parsed.getRef();
+     if (ref != null) {
+       out.append('#').append(ref);
+     }
+
+     return out.toString();
+   } catch (Exception e) {
+     // any failure is reported to the caller as null
+     return null;
+   }
+ }
+
+ /**
+  * For testing: prints the domain name of the url given as the single
+  * command line argument, or a usage message if the number of arguments is
+  * not exactly one.
+  *
+  * @param args
+  *          command line arguments; expected to hold exactly one url
+  */
+ public static void main(String[] args) {
+
+   if (args.length == 1) {
+     final String url = args[0];
+     try {
+       final URL parsed = new URL(url);
+       System.out.println(URLUtil.getDomainName(parsed));
+     } catch (MalformedURLException ex) {
+       ex.printStackTrace();
+     }
+   } else {
+     System.err.println("Usage : URLUtil <url>");
+   }
+ }
+}