You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:43 UTC

[27/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
new file mode 100644
index 0000000..b4771d0
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
@@ -0,0 +1,49 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mortbay.jetty.HttpConnection;
+import org.mortbay.jetty.Request;
+import org.mortbay.jetty.handler.AbstractHandler;
+
+/**
+ * Common base of the proxy testbed handlers. It adapts Jetty's generic
+ * handler callback to one that receives the Jetty {@link Request} and records
+ * the simple class name of the handler in a response header.
+ */
+public abstract class AbstractTestbedHandler extends AbstractHandler {
+  protected boolean debug = false;
+
+  /**
+   * Obtains the Jetty request, adds an <code>X-TestbedHandlers</code> header
+   * carrying this handler's simple class name, and delegates to
+   * {@link #handle(Request, HttpServletResponse, String, int)}.
+   */
+  @Override
+  public void handle(String target, HttpServletRequest req,
+      HttpServletResponse res, int dispatch) throws IOException,
+      ServletException {
+    Request jettyRequest;
+    if (req instanceof Request) {
+      jettyRequest = (Request) req;
+    } else {
+      // not a Jetty Request: use the request of the current connection
+      jettyRequest = HttpConnection.getCurrentConnection().getRequest();
+    }
+    String handlerName = this.getClass().getSimpleName();
+    res.addHeader("X-TestbedHandlers", handlerName);
+    handle(jettyRequest, res, target, dispatch);
+  }
+
+  /**
+   * Handler-specific processing, invoked with the Jetty request.
+   *
+   * @param req
+   *          the Jetty request
+   * @param res
+   *          the response being built
+   * @param target
+   *          the target passed to the generic handler callback
+   * @param dispatch
+   *          the dispatch mode passed to the generic handler callback
+   */
+  public abstract void handle(Request req, HttpServletResponse res,
+      String target, int dispatch) throws IOException, ServletException;
+
+  /**
+   * Adds a response header named
+   * <code>X-&lt;simple class name&gt;-&lt;name&gt;</code>.
+   *
+   * @param res
+   *          the response to add the header to
+   * @param name
+   *          suffix of the header name
+   * @param value
+   *          header value
+   */
+  public void addMyHeader(HttpServletResponse res, String name, String value) {
+    String qualifiedName = "X-" + this.getClass().getSimpleName() + "-" + name;
+    res.addHeader(qualifiedName, value);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/DelayHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/DelayHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/DelayHandler.java
new file mode 100644
index 0000000..58f1f43
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/DelayHandler.java
@@ -0,0 +1,56 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Random;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mortbay.jetty.Request;
+
+/**
+ * Handler that pauses the handling thread before the request is passed on.
+ * The delay is given in milliseconds (it is handed to
+ * {@link Thread#sleep(long)}). A negative value selects a random delay between
+ * 0 (inclusive) and its absolute value (exclusive), drawn from a fixed-seed
+ * generator.
+ */
+public class DelayHandler extends AbstractTestbedHandler {
+
+  public static final long DEFAULT_DELAY = 2000;
+
+  private int delay;
+  private boolean random;
+  private Random r;
+
+  /**
+   * @param delay
+   *          delay in milliseconds; if negative, a random delay of up to
+   *          <code>-delay</code> milliseconds is used for every request
+   */
+  public DelayHandler(int delay) {
+    if (delay < 0) {
+      // -Integer.MIN_VALUE overflows back to a negative number, which
+      // Random.nextInt(int) rejects; clamp to the largest valid bound instead
+      delay = (delay == Integer.MIN_VALUE) ? Integer.MAX_VALUE : -delay;
+      random = true;
+      r = new Random(1234567890L); // repeatable random
+    }
+    this.delay = delay;
+  }
+
+  /**
+   * Sleeps for the configured (or randomly drawn) delay and reports the value
+   * used in the <code>Delay</code> header of this handler. If the sleep is
+   * interrupted, the header is not added and the interrupt status of the
+   * thread is restored.
+   */
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    int del = random ? r.nextInt(delay) : delay;
+    try {
+      Thread.sleep(del);
+    } catch (InterruptedException e) {
+      // do not swallow the interruption: let the owner of the thread see it
+      Thread.currentThread().interrupt();
+      return;
+    }
+    addMyHeader(res, "Delay", String.valueOf(del));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/FakeHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/FakeHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/FakeHandler.java
new file mode 100644
index 0000000..a40b199
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/FakeHandler.java
@@ -0,0 +1,102 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Random;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mortbay.jetty.HttpURI;
+import org.mortbay.jetty.Request;
+
+public class FakeHandler extends AbstractTestbedHandler {
+  Random r = new Random(1234567890L); // predictable
+
+  private static final String testA = "<html><body><h1>Internet Weather Forecast Accuracy</h1>\n"
+      + "<p>Weather forecasting is a secure and popular online presence, which is understandable. The weather affects most everyone's life, and the Internet can provide information on just about any location at any hour of the day or night. But how accurate is this information? How much can we trust it? Perhaps it is just my skeptical nature (or maybe the seeming unpredictability of nature), but I've never put much weight into weather forecasts - especially those made more than three days in advance. That skepticism progressed to a new high in the Summer of 2004, but I have only now done the research necessary to test the accuracy of online weather forecasts. First the story, then the data.</p>"
+      + "<h2>An Internet Weather Forecast Gone Terribly Awry</h2>"
+      + "<p>It was the Summer of 2004 and my wife and I were gearing up for a trip with another couple to Schlitterbahn in New Braunfels - one of the (if not the) best waterparks ever created. As a matter of course when embarking on a 2.5-hour drive to spend the day in a swimsuit, and given the tendency of the area for natural disasters, we checked the weather. The temperatures looked ideal and, most importantly, the chance of rain was a nice round goose egg.</p>";
+  private static final String testB = "<p>A couple of hours into our Schlitterbahn experience, we got on a bus to leave the 'old section' for the 'new section.' Along the way, clouds gathered and multiple claps of thunder sounded. 'So much for the 0% chance of rain,' I commented. By the time we got to our destination, lightning sightings had led to the slides and pools being evacuated and soon the rain began coming down in torrents - accompanied by voluminous lightning flashes. After at least a half an hour the downpour had subsided, but the lightning showed no sign of letting up, so we began heading back to our vehicles. A hundred yards into the parking lot, we passing a tree that had apparently been split in two during the storm (whether by lightning or wind, I'm not sure). Not but a few yards later, there was a distinct thud and the husband of the couple accompanying us cried out as a near racquetball sized hunk of ice rebounded off of his head and onto the concrete. Soon, simila
 rly sized hail was falling all around us as everyone scampered for cover. Some cowered under overturned trashcans while others were more fortunate and made it indoors.</p>"
+      + "<p>The hail, rain and lightning eventually subsided, but the most alarming news was waiting on cell phone voicemail. A friend who lived in the area had called frantically, knowing we were at the park, as the local news was reporting multiple people had been by struck by lightning at Schlitterbahn during the storm.</p>"
+      + "<p>'So much for the 0% chance of rain,' I repeated.</p></body></html>";
+
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    HttpURI u = req.getUri();
+    String uri = u.toString();
+    // System.err.println("-faking " + uri.toString());
+    addMyHeader(res, "URI", uri);
+    // don't pass it down the chain
+    req.setHandled(true);
+    res.addHeader("X-Handled-By", getClass().getSimpleName());
+    if (uri.endsWith("/robots.txt")) {
+      return;
+    }
+    res.setContentType("text/html");
+    try {
+      OutputStream os = res.getOutputStream();
+      byte[] bytes = testA.getBytes("UTF-8");
+      os.write(bytes);
+      // record URI
+      String p = "<p>URI: " + uri + "</p>\r\n";
+      os.write(p.getBytes());
+      // fake some links
+      String base;
+      if (u.getPath().length() > 5) {
+        base = u.getPath().substring(0, u.getPath().length() - 5);
+      } else {
+        base = u.getPath();
+      }
+      String prefix = u.getScheme() + "://" + u.getHost();
+      if (u.getPort() != 80 && u.getPort() != -1)
+        base += ":" + u.getPort();
+      if (!base.startsWith("/"))
+        prefix += "/";
+      prefix = prefix + base;
+      for (int i = 0; i < 10; i++) {
+        String link = "<p><a href='" + prefix;
+        if (!prefix.endsWith("/")) {
+          link += "/";
+        }
+        link += i + ".html'>outlink " + i + "</a></p>\r\n";
+        os.write(link.getBytes());
+      }
+      // fake a few links to random nonexistent hosts
+      for (int i = 0; i < 5; i++) {
+        int h = r.nextInt(1000000); // 1 mln hosts
+        String link = "<p><a href='http://www.fake-" + h + ".com/'>fake host "
+            + h + "</a></p>\r\n";
+        os.write(link.getBytes());
+      }
+      // fake a link to the root URL
+      String link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
+      if (u.getPort() != 80 && u.getPort() != -1)
+        link += ":" + u.getPort();
+      link += "/'>site " + u.getHost() + "</a></p>\r\n";
+      os.write(link.getBytes());
+      os.write(testB.getBytes());
+      res.flushBuffer();
+    } catch (IOException ioe) {
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/LogDebugHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/LogDebugHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/LogDebugHandler.java
new file mode 100644
index 0000000..2682f6d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/LogDebugHandler.java
@@ -0,0 +1,64 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import javax.servlet.Filter;
+import javax.servlet.FilterChain;
+import javax.servlet.FilterConfig;
+import javax.servlet.ServletException;
+import javax.servlet.ServletRequest;
+import javax.servlet.ServletResponse;
+import javax.servlet.http.HttpServletResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.mortbay.jetty.Request;
+
+/**
+ * Logs every request it sees. It can be used as a testbed handler in the
+ * handler chain or as a servlet {@link Filter}.
+ */
+public class LogDebugHandler extends AbstractTestbedHandler implements Filter {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(LogDebugHandler.class);
+
+  /**
+   * Logs the request method, the request URI and the request fields of the
+   * connection at INFO level.
+   */
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    StringBuilder message = new StringBuilder("-- ");
+    message.append(req.getMethod());
+    message.append(" ").append(req.getUri().toString());
+    message.append("\n").append(req.getConnection().getRequestFields());
+    LOG.info(message.toString());
+  }
+
+  /**
+   * Adds the <code>X-Handled-By</code> and <code>X-TestbedHandlers</code>
+   * headers, then continues the filter chain. Anything thrown by the chain is
+   * reported to the client as a 400 (Bad Request) error.
+   */
+  @Override
+  public void doFilter(ServletRequest req, ServletResponse res,
+      FilterChain chain) throws IOException, ServletException {
+    HttpServletResponse httpRes = (HttpServletResponse) res;
+    httpRes.addHeader("X-Handled-By", "AsyncProxyHandler");
+    httpRes.addHeader("X-TestbedHandlers", "AsyncProxyHandler");
+    try {
+      chain.doFilter(req, res);
+    } catch (Throwable e) {
+      httpRes.sendError(HttpServletResponse.SC_BAD_REQUEST, e.toString());
+    }
+  }
+
+  /** Does nothing; the filter takes no configuration. */
+  @Override
+  public void init(FilterConfig arg0) throws ServletException {
+    // nothing to initialize
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/NotFoundHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/NotFoundHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/NotFoundHandler.java
new file mode 100644
index 0000000..ef439a6
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/NotFoundHandler.java
@@ -0,0 +1,40 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mortbay.jetty.Request;
+
+/**
+ * Handler that answers every request with a 404 (Not Found) error and marks
+ * the request as handled.
+ */
+public class NotFoundHandler extends AbstractTestbedHandler {
+
+  /**
+   * Marks the request as handled, adds the <code>X-Handled-By</code> and URI
+   * headers and sends a 404 error whose message names the requested URI.
+   */
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    // stop the chain here: no later handler gets to see this request
+    req.setHandled(true);
+    res.addHeader("X-Handled-By", getClass().getSimpleName());
+    String requestedUri = req.getUri().toString();
+    addMyHeader(res, "URI", requestedUri);
+    res.sendError(HttpServletResponse.SC_NOT_FOUND, "Not found: "
+        + requestedUri);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/ProxyTestbed.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/ProxyTestbed.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/ProxyTestbed.java
new file mode 100644
index 0000000..a7e6aeb
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/ProxyTestbed.java
@@ -0,0 +1,156 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.mortbay.jetty.Handler;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.bio.SocketConnector;
+import org.mortbay.jetty.handler.HandlerList;
+import org.mortbay.jetty.servlet.ServletHandler;
+import org.mortbay.proxy.AsyncProxyServlet;
+
+/**
+ * Command-line tool that starts an HTTP proxy assembled from the testbed
+ * handlers of this package: optional request logging and delay, one handler
+ * per segment, optional forwarding to the original servers, optional fake
+ * content, and a final handler that answers 404.
+ */
+public class ProxyTestbed {
+  private static final Logger LOG = LoggerFactory.getLogger(ProxyTestbed.class);
+
+  /**
+   * Parses the command line, builds the handler chain and runs the server
+   * until it terminates. The JVM exits with status -1 when no argument is
+   * given, when an argument is unknown or when an option lacks its value.
+   *
+   * @param args
+   *          command-line options, see the usage message
+   * @throws Exception
+   *           if the segment directory cannot be listed or the server fails
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length == 0) {
+      printUsage();
+      System.exit(-1);
+    }
+
+    Configuration conf = NutchConfiguration.create();
+    int port = conf.getInt("segment.proxy.port", 8181);
+    boolean forward = false;
+    boolean fake = false;
+    boolean delay = false;
+    boolean debug = false;
+    int delayVal = 0;
+
+    HashSet<Path> segs = new HashSet<Path>();
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-segdir")) {
+        Path segDir = new Path(optionValue(args, ++i));
+        // resolve the file system from the path itself, so that a fully
+        // qualified path on a non-default file system works as well
+        FileSystem fs = segDir.getFileSystem(conf);
+        FileStatus[] fstats = fs.listStatus(segDir);
+        Path[] paths = HadoopFSUtil.getPaths(fstats);
+        segs.addAll(Arrays.asList(paths));
+      } else if (args[i].equals("-port")) {
+        port = Integer.parseInt(optionValue(args, ++i));
+      } else if (args[i].equals("-forward")) {
+        forward = true;
+      } else if (args[i].equals("-delay")) {
+        delay = true;
+        delayVal = Integer.parseInt(optionValue(args, ++i));
+      } else if (args[i].equals("-fake")) {
+        fake = true;
+      } else if (args[i].equals("-debug")) {
+        debug = true;
+      } else if (args[i].equals("-seg")) {
+        segs.add(new Path(optionValue(args, ++i)));
+      } else {
+        LOG.error("Unknown argument: " + args[i]);
+        System.exit(-1);
+      }
+    }
+
+    // Create the server
+    Server server = new Server();
+    SocketConnector connector = new SocketConnector();
+    connector.setPort(port);
+    connector.setResolveNames(false);
+    server.addConnector(connector);
+
+    // create a list of handlers
+    HandlerList list = new HandlerList();
+    server.addHandler(list);
+
+    if (debug) {
+      LOG.info("* Added debug handler.");
+      list.addHandler(new LogDebugHandler());
+    }
+
+    if (delay) {
+      LOG.info("* Added delay handler: "
+          + (delayVal < 0 ? "random delay up to " + (-delayVal)
+              : "constant delay of " + delayVal));
+      list.addHandler(new DelayHandler(delayVal));
+    }
+
+    // XXX alternatively, we can add the DispatchHandler as the first one,
+    // XXX to activate handler plugins and redirect requests to appropriate
+    // XXX handlers ... Here we always load these handlers
+
+    for (Path p : segs) {
+      try {
+        SegmentHandler segment = new SegmentHandler(conf, p);
+        list.addHandler(segment);
+        LOG.info("* Added segment handler for: " + p);
+      } catch (Exception e) {
+        // a broken segment must not prevent the proxy from starting
+        LOG.warn("Skipping segment '" + p + "': "
+            + StringUtils.stringifyException(e));
+      }
+    }
+    if (forward) {
+      LOG.info("* Adding forwarding proxy for all unknown urls ...");
+      ServletHandler servlets = new ServletHandler();
+      servlets.addServletWithMapping(AsyncProxyServlet.class, "/*");
+      servlets.addFilterWithMapping(LogDebugHandler.class, "/*", Handler.ALL);
+      list.addHandler(servlets);
+    }
+    if (fake) {
+      LOG.info("* Added fake handler for remaining URLs.");
+      list.addHandler(new FakeHandler());
+    }
+    // last in the chain: whatever was not handled so far yields a 404
+    list.addHandler(new NotFoundHandler());
+    // Start the http server
+    server.start();
+    server.join();
+  }
+
+  /** Prints the command-line synopsis and option descriptions to stderr. */
+  private static void printUsage() {
+    System.err
+        .println("ProxyTestbed [-seg <segment_name> | -segdir <segments>] [-port <nnn>] [-forward] [-fake] [-delay nnn] [-debug]");
+    System.err
+        .println("-seg <segment_name>\tpath to a single segment (can be specified multiple times)");
+    System.err
+        .println("-segdir <segments>\tpath to a parent directory of multiple segments (as above)");
+    System.err
+        .println("-port <nnn>\trun the proxy on port <nnn> (special permissions may be needed for ports < 1024)");
+    System.err
+        .println("-forward\tif specified, requests to all unknown urls will be passed to");
+    System.err
+        .println("\t\toriginal servers. If false (default) unknown urls generate 404 Not Found.");
+    System.err
+        .println("-delay\tdelay every response by nnn milliseconds. If delay is negative use a random value up to nnn");
+    System.err
+        .println("-fake\tif specified, requests to all unknown urls will succeed with fake content");
+    System.err
+        .println("-debug\tif specified, log the method, url and headers of every request");
+  }
+
+  /**
+   * Returns the value following an option, or terminates the JVM with an
+   * error message when the option is the last argument.
+   *
+   * @param args
+   *          the command line
+   * @param i
+   *          index at which the value is expected
+   * @return the argument at index <code>i</code>
+   */
+  private static String optionValue(String[] args, int i) {
+    if (i >= args.length) {
+      LOG.error("Missing value for argument: " + args[i - 1]);
+      System.exit(-1);
+    }
+    return args[i];
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/SegmentHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/SegmentHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/SegmentHandler.java
new file mode 100644
index 0000000..5d198b4
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/SegmentHandler.java
@@ -0,0 +1,255 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Partitioner;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.mortbay.jetty.Request;
+
+/**
+ * XXX should turn this into a plugin?
+ */
+/**
+ * Handler that serves requests from the content stored in a segment. The
+ * fetch entry of the requested URL determines the HTTP status of the
+ * response, the stored content provides headers and body. URLs without a
+ * fetch entry are left unhandled.
+ * <p>
+ * XXX should turn this into a plugin?
+ */
+public class SegmentHandler extends AbstractTestbedHandler {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SegmentHandler.class);
+  private Segment seg;
+
+  /** Maps protocol status codes to the HTTP status code sent to the client. */
+  private static final HashMap<Integer, Integer> protoCodes = new HashMap<Integer, Integer>();
+
+  static {
+    protoCodes.put(ProtocolStatus.ACCESS_DENIED,
+        HttpServletResponse.SC_UNAUTHORIZED);
+    protoCodes.put(ProtocolStatus.BLOCKED,
+        HttpServletResponse.SC_SERVICE_UNAVAILABLE);
+    protoCodes.put(ProtocolStatus.EXCEPTION,
+        HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
+    protoCodes.put(ProtocolStatus.FAILED, HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.GONE, HttpServletResponse.SC_GONE);
+    protoCodes.put(ProtocolStatus.MOVED,
+        HttpServletResponse.SC_MOVED_PERMANENTLY);
+    protoCodes.put(ProtocolStatus.NOTFETCHING,
+        HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.NOTFOUND, HttpServletResponse.SC_NOT_FOUND);
+    protoCodes.put(ProtocolStatus.NOTMODIFIED,
+        HttpServletResponse.SC_NOT_MODIFIED);
+    protoCodes.put(ProtocolStatus.PROTO_NOT_FOUND,
+        HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.REDIR_EXCEEDED,
+        HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.RETRY, HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.ROBOTS_DENIED,
+        HttpServletResponse.SC_FORBIDDEN);
+    protoCodes.put(ProtocolStatus.SUCCESS, HttpServletResponse.SC_OK);
+    protoCodes.put(ProtocolStatus.TEMP_MOVED,
+        HttpServletResponse.SC_MOVED_TEMPORARILY);
+    protoCodes.put(ProtocolStatus.WOULDBLOCK,
+        HttpServletResponse.SC_BAD_REQUEST);
+  }
+
+  /** Accepts only paths whose last component starts with "part-". */
+  private static class SegmentPathFilter implements PathFilter {
+    public static final SegmentPathFilter INSTANCE = new SegmentPathFilter();
+
+    @Override
+    public boolean accept(Path p) {
+      return p.getName().startsWith("part-");
+    }
+
+  }
+
+  /**
+   * Read access to the fetch and content parts of one segment directory. The
+   * readers of a part are opened on first use.
+   */
+  private static class Segment implements Closeable {
+
+    private static final Partitioner<Text, Writable> PARTITIONER = new HashPartitioner<Text, Writable>();
+
+    private Path segmentDir;
+
+    private Object cLock = new Object();
+    private Object crawlLock = new Object();
+    private MapFile.Reader[] content;
+    // parseText and parseData are never opened by this class; close() only
+    // checks them
+    private MapFile.Reader[] parseText;
+    private MapFile.Reader[] parseData;
+    private MapFile.Reader[] crawl;
+    private Configuration conf;
+
+    /**
+     * @param fs
+     *          not used; the file system is derived from the segment path
+     * @param segmentDir
+     *          directory of the segment
+     * @param conf
+     *          configuration used to open the readers
+     */
+    public Segment(FileSystem fs, Path segmentDir, Configuration conf)
+        throws IOException {
+      this.segmentDir = segmentDir;
+      this.conf = conf;
+    }
+
+    /** Looks up the fetch entry of a URL, opening the readers on first use. */
+    public CrawlDatum getCrawlDatum(Text url) throws IOException {
+      synchronized (crawlLock) {
+        if (crawl == null)
+          crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
+      }
+      return (CrawlDatum) getEntry(crawl, url, new CrawlDatum());
+    }
+
+    /** Looks up the content of a URL, opening the readers on first use. */
+    public Content getContent(Text url) throws IOException {
+      synchronized (cLock) {
+        if (content == null)
+          content = getReaders(Content.DIR_NAME);
+      }
+      return (Content) getEntry(content, url, new Content());
+    }
+
+    /** Open the output generated by this format. */
+    private MapFile.Reader[] getReaders(String subDir) throws IOException {
+      Path dir = new Path(segmentDir, subDir);
+      FileSystem fs = dir.getFileSystem(conf);
+      Path[] names = FileUtil.stat2Paths(fs.listStatus(dir,
+          SegmentPathFilter.INSTANCE));
+
+      // sort names, so that hash partitioning works
+      Arrays.sort(names);
+
+      MapFile.Reader[] parts = new MapFile.Reader[names.length];
+      for (int i = 0; i < names.length; i++) {
+        parts[i] = new MapFile.Reader(names[i], conf);
+      }
+      return parts;
+    }
+
+    /** Reads the entry stored for the URL from the given readers. */
+    private Writable getEntry(MapFile.Reader[] readers, Text url, Writable entry)
+        throws IOException {
+      return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
+    }
+
+    /** Closes all readers that have been opened. */
+    public void close() throws IOException {
+      if (content != null) {
+        closeReaders(content);
+      }
+      if (parseText != null) {
+        closeReaders(parseText);
+      }
+      if (parseData != null) {
+        closeReaders(parseData);
+      }
+      if (crawl != null) {
+        closeReaders(crawl);
+      }
+    }
+
+    private void closeReaders(MapFile.Reader[] readers) throws IOException {
+      for (int i = 0; i < readers.length; i++) {
+        readers[i].close();
+      }
+    }
+
+  }
+
+  /**
+   * @param conf
+   *          configuration used to access the segment
+   * @param name
+   *          path of the segment directory
+   */
+  public SegmentHandler(Configuration conf, Path name) throws Exception {
+    seg = new Segment(FileSystem.get(conf), name, conf);
+  }
+
+  /**
+   * Serves the request from the segment if it contains a fetch entry for the
+   * requested URI. The response status is derived from the stored protocol
+   * status (200 if there is none or it has no mapping). Metadata of the
+   * stored content is copied into response headers and the stored bytes are
+   * written as the body. If no fetch entry exists, or an exception occurs,
+   * the request is not marked as handled.
+   */
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    try {
+      String uri = req.getUri().toString();
+      LOG.info("URI: " + uri);
+      addMyHeader(res, "URI", uri);
+      Text url = new Text(uri);
+      CrawlDatum cd = seg.getCrawlDatum(url);
+      if (cd != null) {
+        addMyHeader(res, "Res", "found");
+        LOG.info("-got " + cd.toString());
+        ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(
+            Nutch.WRITABLE_PROTO_STATUS_KEY);
+        if (ps != null) {
+          Integer httpCode = protoCodes.get(ps.getCode());
+          if (httpCode != null) {
+            res.setStatus(httpCode.intValue());
+          } else {
+            res.setStatus(HttpServletResponse.SC_OK);
+          }
+          addMyHeader(res, "ProtocolStatus", ps.toString());
+        } else {
+          res.setStatus(HttpServletResponse.SC_OK);
+        }
+        Content c = seg.getContent(url);
+        if (c == null) { // missing content
+          req.setHandled(true);
+          res.addHeader("X-Handled-By", getClass().getSimpleName());
+          return;
+        }
+        byte[] data = c.getContent();
+        LOG.debug("-data len=" + data.length);
+        Metadata meta = c.getMetadata();
+        String[] names = meta.names();
+        LOG.debug("- " + names.length + " meta");
+        for (int i = 0; i < names.length; i++) {
+          boolean my = true;
+          char ch = names[i].charAt(0);
+          if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
+            // pretty good chance it's a standard header
+            my = false;
+          }
+          String[] values = meta.getValues(names[i]);
+          for (int k = 0; k < values.length; k++) {
+            if (my) {
+              addMyHeader(res, names[i], values[k]);
+            } else {
+              res.addHeader(names[i], values[k]);
+            }
+          }
+        }
+        req.setHandled(true);
+        res.addHeader("X-Handled-By", getClass().getSimpleName());
+        res.setContentType(meta.get(Metadata.CONTENT_TYPE));
+        res.setContentLength(data.length);
+        OutputStream os = res.getOutputStream();
+        os.write(data, 0, data.length);
+        res.flushBuffer();
+      } else {
+        addMyHeader(res, "Res", "not found");
+        LOG.info(" -not found " + url);
+      }
+    } catch (Exception e) {
+      // the full stack trace goes to the log only
+      LOG.warn(StringUtils.stringifyException(e));
+      // a header value must stay on a single line, so only the one-line
+      // summary of the exception is reported to the client
+      addMyHeader(res, "Res", "Exception: "
+          + e.toString().replaceAll("[\\r\\n]+", " "));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/package-info.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/package-info.java
new file mode 100644
index 0000000..cc820a7
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Proxy to {@link org.apache.nutch.tools.Benchmark benchmark} the crawler.
+ */
+package org.apache.nutch.tools.proxy;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/DumpFileUtilTest.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/DumpFileUtilTest.java b/nutch-core/src/test/java/org/apache/nutch/util/DumpFileUtilTest.java
new file mode 100644
index 0000000..03caa48
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/DumpFileUtilTest.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+/** Unit tests for the static helper methods of DumpFileUtil. */
+public class DumpFileUtilTest {
+
+    /** The MD5 of a fixed URL must equal the known hex digest. */
+    @Test
+    public void testGetUrlMD5() throws Exception {
+        String digest = DumpFileUtil.getUrlMD5("http://apache.org");
+
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7", digest);
+    }
+
+    /**
+     * Expects a two-level directory path below the base path "/tmp" and
+     * null for a base path that does not exist.
+     */
+    @Test
+    public void testCreateTwoLevelsDirectory() throws Exception {
+        String url = "http://apache.org";
+
+        String created = DumpFileUtil.createTwoLevelsDirectory("/tmp", DumpFileUtil.getUrlMD5(url));
+        assertEquals("/tmp/96/ea", created);
+
+        String missingBase = "/this/path/is/not/existed/just/for/testing";
+        String notCreated = DumpFileUtil.createTwoLevelsDirectory(missingBase, DumpFileUtil.getUrlMD5(url));
+        assertNull(notCreated);
+    }
+
+    /**
+     * Expects file names of the form "md5_base.ext", with an over-long base
+     * name cut to 32 characters and an over-long extension cut to 5.
+     */
+    @Test
+    public void testCreateFileName() throws Exception {
+        String url = "http://apache.org";
+        String shortBase = "test";
+        String shortExt = "html";
+        // used both as an over-long base name and as an over-long extension
+        String overlong = "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
+
+        String plainName = DumpFileUtil.createFileName(DumpFileUtil.getUrlMD5(url), shortBase, shortExt);
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7_test.html", plainName);
+
+        String cutBaseName = DumpFileUtil.createFileName(DumpFileUtil.getUrlMD5(url), overlong, shortExt);
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7_testtesttesttesttesttesttesttest.html", cutBaseName);
+
+        String cutExtension = DumpFileUtil.createFileName(DumpFileUtil.getUrlMD5(url), shortBase, overlong);
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7_test.testt", cutExtension);
+    }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestEncodingDetector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestEncodingDetector.java b/nutch-core/src/test/java/org/apache/nutch/util/TestEncodingDetector.java
new file mode 100644
index 0000000..8697a62
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestEncodingDetector.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.UnsupportedEncodingException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit test for the encoding guessing done by EncodingDetector. */
+public class TestEncodingDetector {
+  private static Configuration conf = NutchConfiguration.create();
+
+  /** UTF-8 encoded sample text used as page content in every scenario. */
+  private static byte[] contentInOctets;
+
+  static {
+    try {
+      // Spelled with Unicode escapes only, so the sample no longer depends on
+      // the encoding this source file is stored in. The first five characters
+      // had degraded to U+FFFD replacement characters.
+      // NOTE(review): the five accented Latin letters are a reconstruction -
+      // confirm against the upstream test data.
+      contentInOctets = "\u00e7\u00f1\u00f4\u00f6\u00f8\u0414\u041b\u0436\u04b6"
+          .getBytes("utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // UTF-8 is a mandatory charset, so this is not expected to happen; fail
+      // loudly instead of leaving contentInOctets null.
+      throw new AssertionError(e);
+    }
+  }
+
+  /**
+   * Runs guessEncoding() with "windows-1252" as the default in four
+   * scenarios: no clue at all, a charset in the Content-Type header, an
+   * explicitly added clue, and finally with auto detection switched on.
+   */
+  @Test
+  public void testGuessing() {
+    // first disable auto detection
+    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);
+
+    Metadata metadata = new Metadata();
+    EncodingDetector detector;
+    Content content;
+    String encoding;
+
+    content = new Content("http://www.example.com", "http://www.example.com/",
+        contentInOctets, "text/plain", metadata, conf);
+    detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    encoding = detector.guessEncoding(content, "windows-1252");
+    // no information is available, so it should return default encoding
+    Assert.assertEquals("windows-1252", encoding.toLowerCase());
+
+    // a charset given in the Content-Type header is expected to win
+    metadata.clear();
+    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
+    content = new Content("http://www.example.com", "http://www.example.com/",
+        contentInOctets, "text/plain", metadata, conf);
+    detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    encoding = detector.guessEncoding(content, "windows-1252");
+    Assert.assertEquals("utf-16", encoding.toLowerCase());
+
+    // without a header, a clue added by hand is expected to be returned
+    metadata.clear();
+    content = new Content("http://www.example.com", "http://www.example.com/",
+        contentInOctets, "text/plain", metadata, conf);
+    detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    detector.addClue("windows-1254", "sniffed");
+    encoding = detector.guessEncoding(content, "windows-1252");
+    Assert.assertEquals("windows-1254", encoding.toLowerCase());
+
+    // enable autodetection: utf-8 is expected although the header says
+    // UTF-16 and the added clue says utf-32
+    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
+    metadata.clear();
+    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
+    content = new Content("http://www.example.com", "http://www.example.com/",
+        contentInOctets, "text/plain", metadata, conf);
+    detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    detector.addClue("utf-32", "sniffed");
+    encoding = detector.guessEncoding(content, "windows-1252");
+    Assert.assertEquals("utf-8", encoding.toLowerCase());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestGZIPUtils.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestGZIPUtils.java b/nutch-core/src/test/java/org/apache/nutch/util/TestGZIPUtils.java
new file mode 100644
index 0000000..a3d4610
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestGZIPUtils.java
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for GZIPUtils methods. */
+public class TestGZIPUtils {
+
+  /* a short, highly compressible, string */
+  String SHORT_TEST_STRING = "aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbcccccccccccccccc";
+
+  /* a longer, highly compressible, string: twelve copies of the short one */
+  String LONGER_TEST_STRING = SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING;
+
+  /* a snapshot of the nutch webpage */
+  String WEBPAGE = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
+      + "<html>\n"
+      + "<head>\n"
+      + "  <meta http-equiv=\"content-type\"\n"
+      + " content=\"text/html; charset=ISO-8859-1\">\n"
+      + "  <title>Nutch</title>\n"
+      + "</head>\n"
+      + "<body>\n"
+      + "<h1\n"
+      + " style=\"font-family: helvetica,arial,sans-serif; text-align: center; color: rgb(255, 153, 0);\"><a\n"
+      + " href=\"http://www.nutch.org/\"><font style=\"color: rgb(255, 153, 0);\">Nutch</font></a><br>\n"
+      + "<small>an open source web-search engine</small></h1>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
+      + "<table\n"
+      + " style=\"width: 100%; text-align: left; margin-left: auto; margin-right: auto;\"\n"
+      + " border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n"
+      + "  <tbody>\n"
+      + "    <tr>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/project/showfiles.php?group_id=59548\">Download</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"tutorial.html\">Tutorial</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/nutch/nutch/\">CVS</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"api/index.html\">Javadoc</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/tracker/?atid=491356&amp;group_id=59548&amp;func=browse\">Bugs</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/mail/?group_id=59548\">Lists</a></td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"policies.html\">Policies</a><br>\n"
+      + "      </td>\n"
+      + "    </tr>\n"
+      + "  </tbody>\n"
+      + "</table>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
+      + "<h2>Introduction</h2>\n"
+      + "Nutch is a nascent effort to implement an open-source web search\n"
+      + "engine. Web search is a basic requirement for internet navigation, yet\n"
+      + "the number of web search engines is decreasing. Today's oligopoly could\n"
+      + "soon be a monopoly, with a single company controlling nearly all web\n"
+      + "search for its commercial gain. &nbsp;That would not be good for the\n"
+      + "users of internet. &nbsp;Nutch aims to enable anyone to easily and\n"
+      + "cost-effectively deploy a world-class web search engine.<br>\n"
+      + "<br>\n"
+      + "To succeed, the Nutch software must be able to:<br>\n"
+      + "<ul>\n"
+      + "  <li> crawl several billion pages per month</li>\n"
+      + "  <li>maintain an index of these pages</li>\n"
+      + "  <li>search that index up to 1000 times per second</li>\n"
+      + "  <li>provide very high quality search results</li>\n"
+      + "  <li>operate at minimal cost</li>\n"
+      + "</ul>\n"
+      + "<h2>Status</h2>\n"
+      + "Currently we're just a handful of developers working part-time to put\n"
+      + "together a demo. &nbsp;The demo is coded entirely in Java. &nbsp;However\n"
+      + "persistent data is written in well-documented formats so that modules\n"
+      + "may eventually be re-written in other languages (e.g., Perl, C++) as the\n"
+      + "project progresses.<br>\n"
+      + "<br>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\"> <a\n"
+      + " href=\"http://sourceforge.net\"> </a>\n"
+      + "<div style=\"text-align: center;\"><a href=\"http://sourceforge.net\"><img\n"
+      + " src=\"http://sourceforge.net/sflogo.php?group_id=59548&amp;type=1\"\n"
+      + " style=\"border: 0px solid ; width: 88px; height: 31px;\"\n"
+      + " alt=\"SourceForge.net Logo\" title=\"\"></a></div>\n"
+      + "</body>\n"
+      + "</html>\n";
+
+  /** Round-trips all three samples through zip() and unzip(). */
+  @Test
+  public void testZipUnzip() {
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
+    testZipUnzip(testBytes);
+    testBytes = LONGER_TEST_STRING.getBytes();
+    testZipUnzip(testBytes);
+    testBytes = WEBPAGE.getBytes();
+    testZipUnzip(testBytes);
+  }
+
+  /** Round-trips all three samples through zip() and unzipBestEffort(). */
+  @Test
+  public void testZipUnzipBestEffort() {
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
+    testZipUnzipBestEffort(testBytes);
+    testBytes = LONGER_TEST_STRING.getBytes();
+    testZipUnzipBestEffort(testBytes);
+    testBytes = WEBPAGE.getBytes();
+    testZipUnzipBestEffort(testBytes);
+  }
+
+  /**
+   * Unzips every truncated prefix of the compressed samples. The method
+   * carried no {@code @Test} annotation and was therefore never executed by
+   * the JUnit 4 runner.
+   */
+  @Test
+  public void testTruncation() {
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
+    testTruncation(testBytes);
+    testBytes = LONGER_TEST_STRING.getBytes();
+    testTruncation(testBytes);
+    testBytes = WEBPAGE.getBytes();
+    testTruncation(testBytes);
+  }
+
+  /** Unzips all three samples with every size limit below their length. */
+  @Test
+  public void testLimit() {
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
+    testLimit(testBytes);
+    testBytes = LONGER_TEST_STRING.getBytes();
+    testLimit(testBytes);
+    testBytes = WEBPAGE.getBytes();
+    testLimit(testBytes);
+  }
+
+  // helpers
+
+  /**
+   * Compresses origBytes, checks that the result is smaller and that unzip()
+   * restores the original bytes.
+   */
+  public void testZipUnzip(byte[] origBytes) {
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
+
+    Assert.assertTrue("compressed array is not smaller!",
+        compressedBytes.length < origBytes.length);
+
+    byte[] uncompressedBytes = null;
+    try {
+      uncompressedBytes = GZIPUtils.unzip(compressedBytes);
+    } catch (IOException e) {
+      e.printStackTrace();
+      Assert.fail("caught exception '" + e + "' during unzip()");
+    }
+    Assert.assertEquals("uncompressedBytes is wrong size", origBytes.length,
+        uncompressedBytes.length);
+    Assert.assertArrayEquals("uncompressedBytes does not match origBytes",
+        origBytes, uncompressedBytes);
+  }
+
+  /**
+   * Compresses origBytes, checks that the result is smaller and that
+   * unzipBestEffort() restores the original bytes.
+   */
+  public void testZipUnzipBestEffort(byte[] origBytes) {
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
+
+    Assert.assertTrue("compressed array is not smaller!",
+        compressedBytes.length < origBytes.length);
+
+    byte[] uncompressedBytes = GZIPUtils.unzipBestEffort(compressedBytes);
+    Assert.assertEquals("uncompressedBytes is wrong size", origBytes.length,
+        uncompressedBytes.length);
+    Assert.assertArrayEquals("uncompressedBytes does not match origBytes",
+        origBytes, uncompressedBytes);
+  }
+
+  /**
+   * Cuts the compressed form of origBytes down to every length from the full
+   * size to zero and checks that whatever unzipBestEffort() still returns is
+   * a prefix of origBytes. A null result is accepted and only logged.
+   */
+  public void testTruncation(byte[] origBytes) {
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
+
+    System.out.println("original data has len " + origBytes.length);
+    System.out.println("compressed data has len " + compressedBytes.length);
+
+    for (int i = compressedBytes.length; i >= 0; i--) {
+
+      byte[] truncCompressed = new byte[i];
+      System.arraycopy(compressedBytes, 0, truncCompressed, 0, i);
+
+      byte[] trunc = GZIPUtils.unzipBestEffort(truncCompressed);
+
+      if (trunc == null) {
+        System.out.println("truncated to len " + i + ", trunc is null");
+      } else {
+        System.out.println("truncated to len " + i + ", trunc.length=  "
+            + trunc.length);
+
+        for (int j = 0; j < trunc.length; j++) {
+          if (trunc[j] != origBytes[j]) {
+            Assert.fail("truncated/uncompressed array differs at pos " + j
+                + " (compressed data had been truncated to len " + i + ")");
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * For every limit i below the original length, checks that
+   * unzipBestEffort(compressed, i) returns exactly the first i bytes of
+   * origBytes.
+   */
+  public void testLimit(byte[] origBytes) {
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
+
+    Assert.assertTrue("compressed array is not smaller!",
+        compressedBytes.length < origBytes.length);
+
+    for (int i = 0; i < origBytes.length; i++) {
+
+      byte[] uncompressedBytes = GZIPUtils.unzipBestEffort(compressedBytes, i);
+
+      Assert.assertEquals("uncompressedBytes is wrong size", i,
+          uncompressedBytes.length);
+
+      for (int j = 0; j < i; j++) {
+        if (origBytes[j] != uncompressedBytes[j]) {
+          Assert.fail("uncompressedBytes does not match origBytes");
+        }
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
new file mode 100644
index 0000000..d812110
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.io.Files;
+
+import junit.framework.TestCase;
+import org.apache.nutch.test.TestUtils;
+
+/** Unit tests for the MIME type resolution of MimeUtil. */
+public class TestMimeUtil extends TestCase {
+
+  public static String urlPrefix = "http://localhost/";
+
+  private static Charset defaultCharset = Charset.forName("UTF-8");
+
+  /** directory holding the binary sample files, resolved per test instance */
+  private File sampleDir;
+  {
+    try {
+      sampleDir = TestUtils.getFile(this, "test-mime-util");
+    } catch (FileNotFoundException e){
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * test data, every element on "test page":
+   * <ol>
+   * <li>MIME type</li>
+   * <li>file name (last URL path element)</li>
+   * <li>Content-Type (HTTP header)</li>
+   * <li>content: if empty, do not test MIME magic</li>
+   * </ol>
+   */
+  public static String[][] textBasedFormats = {
+      {
+          "text/html",
+          "test.html",
+          "text/html; charset=utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />\n"
+              + "</head>\n<body>Hello, World!</body></html>" },
+      {
+          "text/html",
+          "test.html",
+          "", // no Content-Type in HTTP header => test URL pattern
+          "<!DOCTYPE html>\n<html>\n<head>\n"
+              + "</head>\n<body>Hello, World!</body></html>" },
+      {
+          "application/xhtml+xml",
+          "test.html",
+          "application/xhtml+xml; charset=utf-8",
+          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+              + "</head>\n<body>Hello, World!</body></html>" } };
+
+  /**
+   * binary test data, every element: expected MIME type, file name inside
+   * the sample directory, Content-Type (HTTP header)
+   */
+  public static String[][] binaryFiles = { {
+      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+      "test.xlsx", "" } };
+
+  /** Resolves the MIME type for the content of the given file. */
+  private String getMimeType(String url, File file, String contentType,
+      boolean useMagic) throws IOException {
+    return getMimeType(url, Files.toByteArray(file), contentType, useMagic);
+  }
+
+  /**
+   * Resolves the MIME type with a fresh MimeUtil whose "mime.type.magic"
+   * setting is taken from useMagic.
+   */
+  private String getMimeType(String url, byte[] bytes, String contentType,
+      boolean useMagic) {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("mime.type.magic", useMagic);
+    MimeUtil mimeUtil = new MimeUtil(conf);
+    return mimeUtil.autoResolveContentType(contentType, url, bytes);
+  }
+
+  /**
+   * use HTTP Content-Type and MIME magic; the URL passed is only the bare
+   * prefix without a file name
+   */
+  public void testWithMimeMagic() {
+    for (String[] testPage : textBasedFormats) {
+      String mimeType = getMimeType(urlPrefix,
+          testPage[3].getBytes(defaultCharset), testPage[2], true);
+      assertEquals("MIME type (with magic) for Content-Type '" + testPage[2]
+          + "'", testPage[0], mimeType);
+    }
+  }
+
+  /** use only HTTP Content-Type (if given) and URL pattern */
+  public void testWithoutMimeMagic() {
+    for (String[] testPage : textBasedFormats) {
+      String mimeType = getMimeType(urlPrefix + testPage[1],
+          testPage[3].getBytes(defaultCharset), testPage[2], false);
+      assertEquals("MIME type (without magic) for " + testPage[1]
+          + " and Content-Type '" + testPage[2] + "'", testPage[0], mimeType);
+    }
+  }
+
+  /** use only MIME magic (detection from content bytes) */
+  public void testOnlyMimeMagic() {
+    for (String[] testPage : textBasedFormats) {
+      String mimeType = getMimeType(urlPrefix,
+          testPage[3].getBytes(defaultCharset), "", true);
+      assertEquals("MIME type (magic only) for sample declared as '"
+          + testPage[2] + "'", testPage[0], mimeType);
+    }
+  }
+
+  /** test binary file formats (real files) */
+  public void testBinaryFiles() throws IOException {
+    for (String[] testPage : binaryFiles) {
+      File dataFile = new File(sampleDir, testPage[1]);
+      String mimeType = getMimeType(urlPrefix + testPage[1], dataFile,
+          testPage[2], false);
+      assertEquals("MIME type for binary file " + testPage[1], testPage[0],
+          mimeType);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestNodeWalker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestNodeWalker.java b/nutch-core/src/test/java/org/apache/nutch/util/TestNodeWalker.java
new file mode 100644
index 0000000..8edf5ab
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestNodeWalker.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.ByteArrayInputStream;
+
+import org.apache.xerces.parsers.DOMParser;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.w3c.dom.Node;
+import org.xml.sax.InputSource;
+
+/** Unit tests for NodeWalker methods. */
+public class TestNodeWalker {
+
+  /* a small XHTML page whose body holds a single list */
+  private final static String WEBPAGE = "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\"><head><title>Nutch</title></head>"
+      + "<body>"
+      + "<ul>"
+      + "<li>crawl several billion pages per month</li>"
+      + "<li>maintain an index of these pages</li>"
+      + "<li>search that index up to 1000 times per second</li>"
+      + "<li>provide very high quality search results</li>"
+      + "<li>operate at minimal cost</li>" + "</ul>" + "</body>" + "</html>";
+
+  /* text of four of the list items in WEBPAGE, filled in by setUp() */
+  private final static String[] ULCONTENT = new String[4];
+
+  @Before
+  public void setUp() throws Exception {
+    ULCONTENT[0] = "crawl several billion pages per month";
+    ULCONTENT[1] = "maintain an index of these pages";
+    ULCONTENT[2] = "search that index up to 1000 times per second";
+    ULCONTENT[3] = "operate at minimal cost";
+  }
+
+  /**
+   * Walks the parsed page twice: a plain walk must come across list item
+   * text, while a walk that calls skipChildren() on every "ul" node must not.
+   */
+  @Test
+  public void testSkipChildren() {
+    DOMParser parser = new DOMParser();
+
+    try {
+      parser.setFeature("http://xml.org/sax/features/validation", false);
+      parser.setFeature(
+          "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+          false);
+      parser
+          .parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes())));
+    } catch (Exception e) {
+      // Report the real cause right away instead of printing it and letting
+      // the walk below run into an unrelated follow-up failure.
+      throw new AssertionError("could not parse the test page", e);
+    }
+
+    Node document = parser.getDocument();
+    Assert.assertTrue("UL Content can NOT be found in the node",
+        findSomeUlContent(collectText(document, false)));
+    Assert.assertFalse("UL Content can be found in the node",
+        findSomeUlContent(collectText(document, true)));
+  }
+
+  /**
+   * Walks the tree below root and concatenates the value of every text node,
+   * with each run of whitespace replaced by a single blank.
+   *
+   * @param root
+   *          node the walk starts from
+   * @param skipUlChildren
+   *          if true, skipChildren() is called on every node named "ul"
+   * @return the concatenated text
+   */
+  private static String collectText(Node root, boolean skipUlChildren) {
+    StringBuilder collected = new StringBuilder();
+    NodeWalker walker = new NodeWalker(root);
+    while (walker.hasNext()) {
+      Node currentNode = walker.nextNode();
+      if (skipUlChildren && "ul".equalsIgnoreCase(currentNode.getNodeName())) {
+        walker.skipChildren();
+      }
+      if (currentNode.getNodeType() == Node.TEXT_NODE) {
+        collected.append(currentNode.getNodeValue().replaceAll("\\s+", " "));
+      }
+    }
+    return collected.toString();
+  }
+
+  /** Returns true if str contains at least one of the ULCONTENT entries. */
+  public boolean findSomeUlContent(String str) {
+    for (int i = 0; i < ULCONTENT.length; i++) {
+      if (str.contains(ULCONTENT[i]))
+        return true;
+    }
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestPrefixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestPrefixStringMatcher.java b/nutch-core/src/test/java/org/apache/nutch/util/TestPrefixStringMatcher.java
new file mode 100644
index 0000000..9d8b07b
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestPrefixStringMatcher.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for PrefixStringMatcher. */
+public class TestPrefixStringMatcher {
+
+  private final static int NUM_TEST_ROUNDS = 20;
+  private final static int MAX_TEST_PREFIXES = 100;
+  private final static int MAX_PREFIX_LEN = 10;
+  private final static int NUM_TEST_INPUTS_PER_ROUND = 100;
+  private final static int MAX_INPUT_LEN = 20;
+
+  // NOTE(review): only four letters are active; presumably the alphabet is
+  // kept this small so that random inputs share prefixes often - confirm.
+  private final static char[] alphabet = new char[] { 'a', 'b', 'c', 'd',
+  // 'e', 'f', 'g', 'h', 'i', 'j',
+  // 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+  // 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
+  // '5', '6', '7', '8', '9', '0'
+  };
+
+  /**
+   * Builds a random string over the alphabet. For maxLen greater than minLen
+   * the length is at least minLen and always below maxLen.
+   */
+  private String makeRandString(int minLen, int maxLen) {
+    int len = minLen + (int) (Math.random() * (maxLen - minLen));
+    char[] chars = new char[len];
+
+    for (int pos = 0; pos < len; pos++) {
+      chars[pos] = alphabet[(int) (Math.random() * alphabet.length)];
+    }
+
+    return new String(chars);
+  }
+
+  /**
+   * Compares PrefixStringMatcher against a brute-force check with
+   * String.startsWith() on random prefixes and random inputs. Empty prefixes
+   * are not counted as matches. Every assertion names the input it failed
+   * on, because the random data is not reproducible.
+   */
+  @Test
+  public void testPrefixMatcher() {
+    int numMatches = 0;
+    int numInputsTested = 0;
+
+    for (int round = 0; round < NUM_TEST_ROUNDS; round++) {
+
+      // build list of prefixes
+      int numPrefixes = (int) (Math.random() * MAX_TEST_PREFIXES);
+      String[] prefixes = new String[numPrefixes];
+      for (int i = 0; i < numPrefixes; i++) {
+        prefixes[i] = makeRandString(0, MAX_PREFIX_LEN);
+      }
+
+      PrefixStringMatcher prematcher = new PrefixStringMatcher(prefixes);
+
+      // test random strings for prefix matches
+      for (int i = 0; i < NUM_TEST_INPUTS_PER_ROUND; i++) {
+        String input = makeRandString(0, MAX_INPUT_LEN);
+        boolean matches = false;
+        int longestMatch = -1;
+        int shortestMatch = -1;
+
+        // brute-force reference: lengths of the shortest and longest
+        // non-empty prefix the input starts with
+        for (int j = 0; j < prefixes.length; j++) {
+
+          if ((prefixes[j].length() > 0) && input.startsWith(prefixes[j])) {
+
+            matches = true;
+            int matchSize = prefixes[j].length();
+
+            if (matchSize > longestMatch)
+              longestMatch = matchSize;
+
+            if ((matchSize < shortestMatch) || (shortestMatch == -1))
+              shortestMatch = matchSize;
+          }
+
+        }
+
+        if (matches)
+          numMatches++;
+
+        numInputsTested++;
+
+        Assert.assertEquals("'" + input + "' should " + (matches ? "" : "not ")
+            + "match!", matches, prematcher.matches(input));
+        if (matches) {
+          // comparing the strings covers both their length and content
+          Assert.assertEquals("wrong shortest match for '" + input + "'",
+              input.substring(0, shortestMatch),
+              prematcher.shortestMatch(input));
+
+          Assert.assertEquals("wrong longest match for '" + input + "'",
+              input.substring(0, longestMatch),
+              prematcher.longestMatch(input));
+
+        }
+      }
+    }
+
+    System.out.println("got " + numMatches + " matches out of "
+        + numInputsTested + " tests");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestStringUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestStringUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestStringUtil.java
new file mode 100644
index 0000000..df021f0
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestStringUtil.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for StringUtil methods. */
+public class TestStringUtil {
+
+  /**
+   * Checks that {@code StringUtil.rightPad} returns the input unchanged when
+   * the requested length does not exceed the input length, and otherwise
+   * appends spaces up to the requested length.
+   */
+  @Test
+  public void testRightPad() {
+    String s = "my string";
+
+    // requested lengths up to s.length() (9): nothing to pad
+    Assert.assertEquals(s, StringUtil.rightPad(s, 0));
+    Assert.assertEquals(s, StringUtil.rightPad(s, 9));
+
+    // longer requests: the difference is filled with trailing spaces
+    Assert.assertEquals(s + " ", StringUtil.rightPad(s, 10));
+    Assert.assertEquals(s + "      ", StringUtil.rightPad(s, 15));
+  }
+
+  /**
+   * Checks that {@code StringUtil.leftPad} returns the input unchanged when
+   * the requested length does not exceed the input length, and otherwise
+   * prepends spaces up to the requested length.
+   */
+  @Test
+  public void testLeftPad() {
+    String s = "my string";
+
+    // requested lengths up to s.length() (9): nothing to pad
+    Assert.assertEquals(s, StringUtil.leftPad(s, 0));
+    Assert.assertEquals(s, StringUtil.leftPad(s, 9));
+
+    // longer requests: the difference is filled with leading spaces
+    Assert.assertEquals(" " + s, StringUtil.leftPad(s, 10));
+    Assert.assertEquals("      " + s, StringUtil.leftPad(s, 15));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestSuffixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestSuffixStringMatcher.java b/nutch-core/src/test/java/org/apache/nutch/util/TestSuffixStringMatcher.java
new file mode 100644
index 0000000..f2e8a5c
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestSuffixStringMatcher.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for SuffixStringMatcher. */
+public class TestSuffixStringMatcher {
+
+  private final static int NUM_TEST_ROUNDS = 20;
+  private final static int MAX_TEST_SUFFIXES = 100;
+  private final static int MAX_SUFFIX_LEN = 10;
+  private final static int NUM_TEST_INPUTS_PER_ROUND = 100;
+  private final static int MAX_INPUT_LEN = 20;
+
+  // Characters the random strings are drawn from. Only four letters are
+  // enabled; the rest are left commented out (presumably so that random
+  // inputs hit random suffixes often enough -- TODO confirm).
+  private final static char[] alphabet = new char[] { 'a', 'b', 'c', 'd',
+  // 'e', 'f', 'g', 'h', 'i', 'j',
+  // 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+  // 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
+  // '5', '6', '7', '8', '9', '0'
+  };
+
+  /**
+   * Builds a random string over {@link #alphabet}.
+   *
+   * @param minLen
+   *          smallest length that can be produced
+   * @param maxLen
+   *          upper bound of the length; it is exclusive whenever it is
+   *          greater than minLen because Math.random() is always below 1
+   * @return the generated string, possibly empty
+   */
+  private String makeRandString(int minLen, int maxLen) {
+    int len = minLen + (int) (Math.random() * (maxLen - minLen));
+    char[] chars = new char[len];
+
+    for (int pos = 0; pos < len; pos++) {
+      chars[pos] = alphabet[(int) (Math.random() * alphabet.length)];
+    }
+
+    return new String(chars);
+  }
+
+  /**
+   * Compares SuffixStringMatcher with a brute-force scan over the suffix
+   * list: for random suffix sets and random inputs, matches(),
+   * shortestMatch() and longestMatch() must agree with the scan. Empty
+   * suffixes are ignored by the reference scan.
+   */
+  @Test
+  public void testSuffixMatcher() {
+    int numMatches = 0;
+    int numInputsTested = 0;
+
+    for (int round = 0; round < NUM_TEST_ROUNDS; round++) {
+
+      // build list of suffixes
+      int numSuffixes = (int) (Math.random() * MAX_TEST_SUFFIXES);
+      String[] suffixes = new String[numSuffixes];
+      for (int i = 0; i < numSuffixes; i++) {
+        suffixes[i] = makeRandString(0, MAX_SUFFIX_LEN);
+      }
+
+      SuffixStringMatcher sufmatcher = new SuffixStringMatcher(suffixes);
+
+      // test random strings for suffix matches
+      for (int i = 0; i < NUM_TEST_INPUTS_PER_ROUND; i++) {
+        String input = makeRandString(0, MAX_INPUT_LEN);
+
+        // reference result: lengths of the shortest and the longest
+        // non-empty suffix the input ends with, -1 while none was found
+        int longestMatch = -1;
+        int shortestMatch = -1;
+        for (String suffix : suffixes) {
+          int matchSize = suffix.length();
+          if (matchSize > 0 && input.endsWith(suffix)) {
+            longestMatch = Math.max(longestMatch, matchSize);
+            shortestMatch = (shortestMatch == -1) ? matchSize
+                : Math.min(shortestMatch, matchSize);
+          }
+        }
+        // every counted suffix has length >= 1, so -1 means "no match"
+        boolean matches = longestMatch != -1;
+
+        if (matches)
+          numMatches++;
+
+        numInputsTested++;
+
+        Assert.assertTrue("'" + input + "' should " + (matches ? "" : "not ")
+            + "match!", matches == sufmatcher.matches(input));
+        if (matches) {
+          // comparing the strings also covers their lengths; the message
+          // names the input because it is random and not reproducible
+          String where = " match for '" + input + "'";
+          Assert.assertEquals("shortest" + where,
+              input.substring(input.length() - shortestMatch),
+              sufmatcher.shortestMatch(input));
+          Assert.assertEquals("longest" + where,
+              input.substring(input.length() - longestMatch),
+              sufmatcher.longestMatch(input));
+        }
+      }
+    }
+
+    System.out.println("got " + numMatches + " matches out of "
+        + numInputsTested + " tests");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestTableUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestTableUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestTableUtil.java
new file mode 100644
index 0000000..fb07556
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestTableUtil.java
@@ -0,0 +1,75 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.util;
+
+import org.apache.nutch.util.TableUtil;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class TestTableUtil {
+
+  // Sample URLs. urlString5rev is the form the tests expect back when
+  // reversedUrlString5 is unreversed: urlString5 with a "/" before the query.
+  String urlString1 = "http://foo.com/";
+  String urlString2 = "http://foo.com:8900/";
+  String urlString3 = "ftp://bar.baz.com/";
+  String urlString4 = "http://bar.baz.com:8983/to/index.html?a=b&c=d";
+  String urlString5 = "http://foo.com?a=/a/b&c=0";
+  String urlString5rev = "http://foo.com/?a=/a/b&c=0";
+  String urlString6 = "http://foo.com";
+  String urlString7 = "file:///var/www/index.html";
+
+  // Expected TableUtil.reverseUrl results for the sample URLs above.
+  String reversedUrlString1 = "com.foo:http/";
+  String reversedUrlString2 = "com.foo:http:8900/";
+  String reversedUrlString3 = "com.baz.bar:ftp/";
+  String reversedUrlString4 = "com.baz.bar:http:8983/to/index.html?a=b&c=d";
+  String reversedUrlString5 = "com.foo:http/?a=/a/b&c=0";
+  String reversedUrlString6 = "com.foo:http";
+  String reversedUrlString7 = ":file/var/www/index.html";
+
+  /**
+   * Checks that every sample URL, including both spellings of URL 5, is
+   * reversed to its expected form.
+   */
+  @Test
+  public void testReverseUrl() throws Exception {
+    assertReverse(urlString1, reversedUrlString1);
+    assertReverse(urlString2, reversedUrlString2);
+    assertReverse(urlString3, reversedUrlString3);
+    assertReverse(urlString4, reversedUrlString4);
+    assertReverse(urlString5, reversedUrlString5);
+    // the spelling with an explicit "/" must reverse to the same string
+    assertReverse(urlString5rev, reversedUrlString5);
+    assertReverse(urlString6, reversedUrlString6);
+    assertReverse(urlString7, reversedUrlString7);
+  }
+
+  /**
+   * Checks that every reversed sample is turned back into a URL; for
+   * sample 5 the expected result is urlString5rev, not urlString5.
+   */
+  @Test
+  public void testUnreverseUrl() throws Exception {
+    assertUnreverse(reversedUrlString1, urlString1);
+    assertUnreverse(reversedUrlString2, urlString2);
+    assertUnreverse(reversedUrlString3, urlString3);
+    assertUnreverse(reversedUrlString4, urlString4);
+    assertUnreverse(reversedUrlString5, urlString5rev);
+    assertUnreverse(reversedUrlString6, urlString6);
+    assertUnreverse(reversedUrlString7, urlString7);
+  }
+
+  /** Asserts that TableUtil.reverseUrl(url) equals expectedReversedUrl. */
+  private static void assertReverse(String url, String expectedReversedUrl)
+      throws Exception {
+    String reversed = TableUtil.reverseUrl(url);
+    assertEquals(expectedReversedUrl, reversed);
+  }
+
+  /** Asserts that TableUtil.unreverseUrl(reversedUrl) equals expectedUrl. */
+  private static void assertUnreverse(String reversedUrl, String expectedUrl) {
+    String unreversed = TableUtil.unreverseUrl(reversedUrl);
+    assertEquals(expectedUrl, unreversed);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestURLUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestURLUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestURLUtil.java
new file mode 100644
index 0000000..b1fdd5b
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestURLUtil.java
@@ -0,0 +1,281 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.URL;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Test class for URLUtil */
+public class TestURLUtil {
+
+  @Test
+  public void testGetDomainName() throws Exception {
+
+    URL url = null;
+
+    url = new URL("http://lucene.apache.org/nutch");
+    Assert.assertEquals("apache.org", URLUtil.getDomainName(url));
+
+    url = new URL("http://en.wikipedia.org/wiki/Java_coffee");
+    Assert.assertEquals("wikipedia.org", URLUtil.getDomainName(url));
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    Assert.assertEquals("140.211.11.130", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.co.uk:8080/index.html");
+    Assert.assertEquals("example.co.uk", URLUtil.getDomainName(url));
+
+    url = new URL("http://com");
+    Assert.assertEquals("com", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.co.uk.com");
+    Assert.assertEquals("uk.com", URLUtil.getDomainName(url));
+
+    // "nn" is not a tld
+    url = new URL("http://example.com.nn");
+    Assert.assertEquals("nn", URLUtil.getDomainName(url));
+
+    url = new URL("http://");
+    Assert.assertEquals("", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.edu.tr.xyz");
+    Assert.assertEquals("xyz", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.c.se");
+    Assert.assertEquals("example.c.se", URLUtil.getDomainName(url));
+
+    // plc.co.im is listed as a domain suffix
+    url = new URL("http://www.example.plc.co.im");
+    Assert.assertEquals("example.plc.co.im", URLUtil.getDomainName(url));
+
+    // 2000.hu is listed as a domain suffix
+    url = new URL("http://www.example.2000.hu");
+    Assert.assertEquals("example.2000.hu", URLUtil.getDomainName(url));
+
+    // test non-ascii
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    Assert.assertEquals("example.\u5546\u696d.tw", URLUtil.getDomainName(url));
+  }
+
+  @Test
+  public void testGetDomainSuffix() throws Exception {
+    URL url = null;
+
+    url = new URL("http://lucene.apache.org/nutch");
+    Assert.assertEquals("org", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    Assert.assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://www.example.co.uk:8080/index.html");
+    Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://com");
+    Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://www.example.co.uk.com");
+    Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
+
+    // "nn" is not a tld
+    url = new URL("http://example.com.nn");
+    Assert.assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://");
+    Assert.assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://www.edu.tr.xyz");
+    Assert.assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://subdomain.example.edu.tr");
+    Assert.assertEquals("edu.tr", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://subdomain.example.presse.fr");
+    Assert.assertEquals("presse.fr", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://subdomain.example.presse.tr");
+    Assert.assertEquals("tr", URLUtil.getDomainSuffix(url).getDomain());
+
+    // plc.co.im is listed as a domain suffix
+    url = new URL("http://www.example.plc.co.im");
+    Assert.assertEquals("plc.co.im", URLUtil.getDomainSuffix(url).getDomain());
+
+    // 2000.hu is listed as a domain suffix
+    url = new URL("http://www.example.2000.hu");
+    Assert.assertEquals("2000.hu", URLUtil.getDomainSuffix(url).getDomain());
+
+    // test non-ascii
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    Assert.assertEquals("\u5546\u696d.tw", URLUtil.getDomainSuffix(url).getDomain());
+  }
+
+  @Test
+  public void testGetHostSegments() throws Exception {
+    URL url;
+    String[] segments;
+
+    url = new URL("http://subdomain.example.edu.tr");
+    segments = URLUtil.getHostSegments(url);
+    Assert.assertEquals("subdomain", segments[0]);
+    Assert.assertEquals("example", segments[1]);
+    Assert.assertEquals("edu", segments[2]);
+    Assert.assertEquals("tr", segments[3]);
+
+    url = new URL("http://");
+    segments = URLUtil.getHostSegments(url);
+    Assert.assertEquals(1, segments.length);
+    Assert.assertEquals("", segments[0]);
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    segments = URLUtil.getHostSegments(url);
+    Assert.assertEquals(1, segments.length);
+    Assert.assertEquals("140.211.11.130", segments[0]);
+
+    // test non-ascii
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    segments = URLUtil.getHostSegments(url);
+    Assert.assertEquals("www", segments[0]);
+    Assert.assertEquals("example", segments[1]);
+    Assert.assertEquals("\u5546\u696d", segments[2]);
+    Assert.assertEquals("tw", segments[3]);
+
+  }
+
+  @Test
+  public void testChooseRepr() throws Exception {
+
+    String aDotCom = "http://www.a.com";
+    String bDotCom = "http://www.b.com";
+    String aSubDotCom = "http://www.news.a.com";
+    String aQStr = "http://www.a.com?y=1";
+    String aPath = "http://www.a.com/xyz/index.html";
+    String aPath2 = "http://www.a.com/abc/page.html";
+    String aPath3 = "http://www.news.a.com/abc/page.html";
+
+    // 1) different domain them keep dest, temp or perm
+    // a.com -> b.com*
+    Assert.assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, true));
+    Assert.assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, false));
+
+    // 2) permanent and root, keep src
+    // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aQStr, false));
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, false));
+
+    // 3) permanent and not root and dest root, keep dest
+    // a.com/xyz/index.html -> a.com*
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, false));
+
+    // 4) permanent and neither root keep dest
+    // a.com/xyz/index.html -> a.com/abc/page.html*
+    Assert.assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, false));
+
+    // 5) temp and root and dest not root keep src
+    // *a.com -> a.com/xyz/index.html
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, true));
+
+    // 6) temp and not root and dest root keep dest
+    // a.com/xyz/index.html -> a.com*
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, true));
+
+    // 7) temp and neither root, keep shortest, if hosts equal by path else by
+    // hosts
+    // a.com/xyz/index.html -> a.com/abc/page.html*
+    // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
+    Assert.assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, true));
+    Assert.assertEquals(aPath, URLUtil.chooseRepr(aPath, aPath3, true));
+
+    // 8) temp and both root keep shortest sub domain
+    // *www.a.com -> www.news.a.com
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
+  }
+
+  // from RFC3986 section 5.4.1
+  private static String baseString = "http://a/b/c/d;p?q";
+  private static String[][] targets = new String[][] {
+      // unknown protocol {"g:h" , "g:h"},
+      { "g", "http://a/b/c/g" }, { "./g", "http://a/b/c/g" },
+      { "g/", "http://a/b/c/g/" }, { "/g", "http://a/g" },
+      { "//g", "http://g" }, { "?y", "http://a/b/c/d;p?y" },
+      { "g?y", "http://a/b/c/g?y" }, { "#s", "http://a/b/c/d;p?q#s" },
+      { "g#s", "http://a/b/c/g#s" }, { "g?y#s", "http://a/b/c/g?y#s" },
+      { ";x", "http://a/b/c/;x" }, { "g;x", "http://a/b/c/g;x" },
+      { "g;x?y#s", "http://a/b/c/g;x?y#s" }, { "", "http://a/b/c/d;p?q" },
+      { ".", "http://a/b/c/" }, { "./", "http://a/b/c/" },
+      { "..", "http://a/b/" }, { "../", "http://a/b/" },
+      { "../g", "http://a/b/g" }, { "../..", "http://a/" },
+      { "../../", "http://a/" }, { "../../g", "http://a/g" } };
+
+  @Test
+  public void testResolveURL() throws Exception {
+    // test NUTCH-436
+    URL u436 = new URL("http://a/b/c/d;p?q#f");
+    Assert.assertEquals("http://a/b/c/d;p?q#f", u436.toString());
+    URL abs = URLUtil.resolveURL(u436, "?y");
+    Assert.assertEquals("http://a/b/c/d;p?y", abs.toString());
+    // test NUTCH-566
+    URL u566 = new URL("http://www.fleurie.org/entreprise.asp");
+    abs = URLUtil.resolveURL(u566, "?id_entrep=111");
+    Assert.assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111",
+        abs.toString());
+    URL base = new URL(baseString);
+    Assert.assertEquals("base url parsing", baseString, base.toString());
+    for (int i = 0; i < targets.length; i++) {
+      URL u = URLUtil.resolveURL(base, targets[i][0]);
+      Assert.assertEquals(targets[i][1], targets[i][1], u.toString());
+    }
+  }
+
+  @Test
+  public void testToUNICODE() throws Exception {
+    Assert.assertEquals("http://www.�evir.com",
+        URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));
+    Assert.assertEquals("http://uni-t�bingen.de/",
+        URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/"));
+    Assert
+        .assertEquals(
+            "http://www.medizin.uni-t�bingen.de:8080/search.php?q=abc#p1",
+            URLUtil
+                .toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
+
+  }
+
+  @Test
+  public void testToASCII() throws Exception {
+    Assert.assertEquals("http://www.xn--evir-zoa.com",
+        URLUtil.toASCII("http://www.�evir.com"));
+    Assert.assertEquals("http://xn--uni-tbingen-xhb.de/",
+        URLUtil.toASCII("http://uni-t�bingen.de/"));
+    Assert
+        .assertEquals(
+            "http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
+            URLUtil
+                .toASCII("http://www.medizin.uni-t�bingen.de:8080/search.php?q=abc#p1"));
+  }
+
+  @Test
+  public void testFileProtocol() throws Exception {
+    // keep one single slash NUTCH-XXX
+    Assert.assertEquals("file:/path/file.html",
+        URLUtil.toASCII("file:/path/file.html"));
+    Assert.assertEquals("file:/path/file.html",
+        URLUtil.toUNICODE("file:/path/file.html"));
+  }
+
+}