You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:40 UTC

[56/69] [abbrv] nutch git commit: Moved test sources to maven standard directory

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java
new file mode 100644
index 0000000..15725ae
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -0,0 +1,347 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.cyberneko.html.parsers.*;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils {
+
+  private static final String[] testPages = {
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors (outside of them, instead)! So you get a tree that
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+      // test frameset link extraction. The invalid frame in the middle will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+          + "</frameset>" + "</frameset>" + "</body></html>"),
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\"><!--no anchor--></a>"
+          + "<a href=\"g1\"> <!--whitespace-->  </a>"
+          + "<a href=\"g2\">  <img src=test.gif alt='bla bla'> </a>"
+          + "</body></html>"), };
+
+  private static int SKIP = 9;
+
+  private static String[] testBaseHrefs = { "http://www.nutch.org",
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something", "http://www.nutch.org/" };
+
+  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+  private static final String[] answerText = {
+      "title body anchor",
+      "title body home bots",
+      "separate this from this",
+      "my title body home 1 2",
+      "my title",
+      "my title the bottom",
+      "my title Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break . "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "title anchor1 anchor2 anchor3",
+      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };
+
+  private static final String[] answerTitle = { "title", "title", "",
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title", "title" };
+
+  // note: should be in page-order
+  private static Outlink[][] answerOutlinks;
+
+  private static Configuration conf;
+  private static DOMContentUtils utils = null;
+
+  /**
+   * Rebuilds the shared fixtures before every test: a fresh configuration and
+   * {@link DOMContentUtils}, one parsed DOM fragment and base URL per entry
+   * of {@code testPages}, and the expected outlinks (in page order).
+   * Any failure while building a fixture fails the test immediately so the
+   * real cause is reported instead of a later NullPointerException.
+   */
+  @Before
+  public void setup() {
+    conf = NutchConfiguration.create();
+    conf.setBoolean("parser.html.form.use_action", true);
+    utils = new DOMContentUtils(conf);
+    DOMFragmentParser parser = new DOMFragmentParser();
+    try {
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+              true);
+    } catch (SAXException ignored) {
+      // Deliberately best effort, as before: parsing continues without the
+      // feature. NOTE(review): presumably some parser versions do not
+      // recognise it - confirm whether this should fail the test instead.
+    }
+    for (int i = 0; i < testPages.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+      try {
+        parser.parse(
+            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+            node);
+        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+      } catch (Exception e) {
+        Assert.fail("caught exception: " + e);
+      }
+      testDOMs[i] = node;
+    }
+    try {
+      answerOutlinks = new Outlink[][] {
+          { new Outlink("http://www.nutch.org", "anchor"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+          { new Outlink("http://www.nutch.org/", "separate this"),
+              new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/1", "1"),
+              new Outlink("http://www.nutch.org/docs/2", "2"), },
+          { new Outlink("http://www.nutch.org/frames/top.html", ""),
+              new Outlink("http://www.nutch.org/frames/left.html", ""),
+              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+              new Outlink("http://www.nutch.org/frames/right.html", ""), },
+          { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+              new Outlink("http://www.nutch.org/index.html", ""),
+              new Outlink("http://www.nutch.org/maps/#bottom", ""),
+              new Outlink("http://www.nutch.org/bot.html", ""),
+              new Outlink("http://www.nutch.org/docs/index.html", ""), },
+          { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+          {},
+          { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+          {},
+          { new Outlink("http://www.nutch.org/;x", "anchor1"),
+              new Outlink("http://www.nutch.org/g;x", "anchor2"),
+              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+          {
+              // this is tricky - see RFC3986 section 5.4.1 example 7
+              new Outlink("http://www.nutch.org/g", "anchor1"),
+              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+              new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                  "anchor5") },
+          { new Outlink("http://www.nutch.org/g", ""),
+              new Outlink("http://www.nutch.org/g1", ""),
+              new Outlink("http://www.nutch.org/g2", "bla bla"),
+              new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
+
+    } catch (MalformedURLException e) {
+      // Previously swallowed: answerOutlinks was then never assigned and the
+      // outlink test died later without any hint of the real cause.
+      Assert.fail("couldn't build expected outlinks: " + e);
+    }
+  }
+
+  /**
+   * Compares two strings token by token, treating any run of whitespace as a
+   * single separator and ignoring leading/trailing whitespace.
+   *
+   * @return true when both strings contain the same tokens in the same order
+   */
+  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+    StringTokenizer left = new StringTokenizer(s1);
+    StringTokenizer right = new StringTokenizer(s2);
+
+    // Walk both token streams in lock step until one of them runs dry.
+    while (left.hasMoreTokens() && right.hasMoreTokens()) {
+      if (!left.nextToken().equals(right.nextToken())) {
+        return false;
+      }
+    }
+    // Equal only when neither side has tokens left over.
+    return !left.hasMoreTokens() && !right.hasMoreTokens();
+  }
+
+  /**
+   * Extracts the plain text of every test page and compares it with
+   * {@code answerText}, ignoring whitespace differences.
+   */
+  @Test
+  public void testGetText() {
+    if (testDOMs[0] == null) {
+      setup();
+    }
+    final String newline = System.getProperty("line.separator");
+    for (int page = 0; page < testPages.length; page++) {
+      StringBuffer extracted = new StringBuffer();
+      utils.getText(extracted, testDOMs[page]);
+      String actual = extracted.toString();
+      String failureMessage = "expecting text: " + answerText[page] + newline
+          + newline + "got text: " + actual;
+      Assert.assertTrue(failureMessage,
+          equalsIgnoreWhitespace(answerText[page], actual));
+    }
+  }
+
+  /**
+   * Extracts the title of every test page and compares it with
+   * {@code answerTitle}, ignoring whitespace differences.
+   */
+  @Test
+  public void testGetTitle() {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getTitle(sb, testDOMs[i]);
+      String text = sb.toString();
+      // Report the expected title on failure; the message used to print the
+      // expected body text (answerText), which is not what is compared here.
+      Assert.assertTrue(
+          "expecting title: " + answerTitle[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got title: " + text,
+          equalsIgnoreWhitespace(answerTitle[i], text));
+    }
+  }
+
+  /**
+   * Extracts the outlinks of every test page and compares them with
+   * {@code answerOutlinks}. Form actions are used as outlinks for every page
+   * except the one at index {@code SKIP}.
+   */
+  @Test
+  public void testGetOutlinks() {
+    if (testDOMs[0] == null) {
+      setup();
+    }
+    for (int page = 0; page < testPages.length; page++) {
+      // Only the SKIP page is processed with form actions disabled.
+      conf.setBoolean("parser.html.form.use_action", page != SKIP);
+      utils.setConf(conf);
+
+      ArrayList<Outlink> found = new ArrayList<Outlink>();
+      utils.getOutlinks(testBaseHrefURLs[page], found, testDOMs[page]);
+      Outlink[] foundArr = found.toArray(new Outlink[found.size()]);
+      compareOutlinks(answerOutlinks[page], foundArr);
+    }
+  }
+
+  /** Appends each outlink's string form to {@code sb}, one per line. */
+  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+    final String newline = System.getProperty("line.separator");
+    for (Outlink link : o) {
+      sb.append(link.toString()).append(newline);
+    }
+  }
+
+  /** Renders the given outlinks as a string with one outlink per line. */
+  private static final String outlinksString(Outlink[] o) {
+    StringBuffer rendered = new StringBuffer();
+    appendOutlinks(rendered, o);
+    String result = rendered.toString();
+    return result;
+  }
+
+  /**
+   * Fails the current test unless {@code o2} (the extracted outlinks) has the
+   * same length as {@code o1} (the expected outlinks) and every position
+   * holds an equal outlink.
+   */
+  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+    final String newline = System.getProperty("line.separator");
+
+    if (o1.length != o2.length) {
+      StringBuilder msg = new StringBuilder();
+      msg.append("got wrong number of outlinks (expecting ").append(o1.length)
+          .append(", got ").append(o2.length).append(")").append(newline);
+      msg.append("answer: ").append(newline).append(outlinksString(o1))
+          .append(newline);
+      msg.append("got: ").append(newline).append(outlinksString(o2))
+          .append(newline);
+      Assert.fail(msg.toString());
+    }
+
+    for (int pos = 0; pos < o1.length; pos++) {
+      if (o1[pos].equals(o2[pos])) {
+        continue;
+      }
+      String expected = "'" + o1[pos].getToUrl() + "', anchor: '"
+          + o1[pos].getAnchor() + "'";
+      String actual = "'" + o2[pos].getToUrl() + "', anchor: '"
+          + o2[pos].getAnchor() + "'";
+      Assert.fail("got wrong outlinks at position " + pos + newline
+          + "answer: " + newline + expected + newline + "got: " + newline
+          + actual);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java
new file mode 100644
index 0000000..7099f50
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.html.HtmlParser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestHtmlParser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TestHtmlParser.class);
+
+  // Language names used as test data for charset detection. All non-ASCII
+  // characters are written as Unicode escapes so the literals do not depend
+  // on the encoding this source file is stored or transmitted in.
+  private static final String encodingTestKeywords = "fran\u00e7ais, espa\u00f1ol, \u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a, \u010de\u0161tina, \u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac";
+  private static final String encodingTestBody = "<ul>\n  <li>fran\u00e7ais\n  <li>espa\u00f1ol\n  <li>\u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a\n  <li>\u010de\u0161tina\n  <li>\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac\n</ul>";
+  // Shared tail of every test page: title, keywords meta tag and body.
+  private static final String encodingTestContent = "<title>"
+      + encodingTestKeywords + "</title>\n"
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  private static String[][] encodingTestPages = {
+      {
+          "HTML4, utf-8, meta http-equiv, no quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML4, utf-8, meta http-equiv, single quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+              + encodingTestContent },
+      {
+          "XHTML, utf-8, meta http-equiv, double quotes",
+          "utf-8",
+          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML5, utf-8, meta charset",
+          "utf-8",
+          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
+              + encodingTestContent },
+      { "HTML5, utf-8, BOM", "utf-8",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
+      { "HTML5, utf-16, BOM", "utf-16",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
+
+  private Configuration conf;
+  private Parser parser;
+
+  /** Creates the Nutch configuration and an {@link HtmlParser} using it. */
+  public TestHtmlParser() {
+    this.conf = NutchConfiguration.create();
+    this.parser = new HtmlParser();
+    this.parser.setConf(this.conf);
+  }
+
+  /**
+   * Parses the given bytes as a "text/html" document fetched from a dummy
+   * URL and returns the parse stored under that URL.
+   */
+  protected Parse parse(byte[] contentBytes) {
+    final String dummyUrl = "http://dummy.url/";
+    Content content = new Content(dummyUrl, dummyUrl, contentBytes,
+        "text/html", new Metadata(), conf);
+    return parser.getParse(content).get(dummyUrl);
+  }
+
+  /**
+   * Encodes every entry of {@code encodingTestPages} with its declared
+   * charset, parses the bytes and checks that title, keywords meta tag and
+   * body text come back with the expected characters.
+   */
+  @Test
+  public void testEncodingDetection() {
+    for (int p = 0; p < encodingTestPages.length; p++) {
+      // Each entry is { description, charset name, page source }.
+      String name = encodingTestPages[p][0];
+      Charset charset = Charset.forName(encodingTestPages[p][1]);
+      byte[] contentBytes = encodingTestPages[p][2].getBytes(charset);
+
+      Parse parse = parse(contentBytes);
+      String text = parse.getText();
+      String title = parse.getData().getTitle();
+      String keywords = parse.getData().getMeta("keywords");
+
+      LOG.info(name);
+      LOG.info("title:\t" + title);
+      LOG.info("keywords:\t" + keywords);
+      LOG.info("text:\t" + text);
+
+      Assert.assertEquals("Title not extracted properly (" + name + ")",
+          encodingTestKeywords, title);
+      String[] expectedKeywords = encodingTestKeywords.split(",\\s*");
+      for (int k = 0; k < expectedKeywords.length; k++) {
+        String keyword = expectedKeywords[k];
+        Assert.assertTrue(keyword + " not found in text (" + name + ")",
+            text.contains(keyword));
+      }
+      Assert.assertNotNull("No keywords extracted", keywords);
+      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
+          encodingTestKeywords, keywords);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
new file mode 100644
index 0000000..5089a10
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.cyberneko.html.parsers.*;
+import org.junit.Assert;
+import org.junit.Test;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor {
+
+  /*
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
+
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
+  };
+
+  private URL[][] currURLsAndAnswers;
+
+  /**
+   * Parses every page in {@code tests}, runs
+   * {@code HTMLMetaProcessor.getMetaTags} on the resulting DOM and checks the
+   * noindex / nofollow / nocache flags against {@code answers} and the base
+   * href against the second URL of the matching {@code currURLsAndAnswers}
+   * row (the first URL is the page URL handed to the processor).
+   */
+  @Test
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser = new DOMFragmentParser();
+
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
+    } catch (Exception e) {
+      Assert.assertTrue("couldn't make test URLs!", false);
+    }
+
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
+
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+      try {
+        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+      } catch (Exception e) {
+        // Fail right here: the assertions below would otherwise run against
+        // an empty fragment and report a misleading mismatch.
+        Assert.fail("couldn't parse test page " + i + ": " + e);
+      }
+
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
+
+      Assert.assertTrue("got index wrong on test " + i,
+          robotsMeta.getNoIndex() == answers[i][0]);
+      Assert.assertTrue("got follow wrong on test " + i,
+          robotsMeta.getNoFollow() == answers[i][1]);
+      Assert.assertTrue("got cache wrong on test " + i,
+          robotsMeta.getNoCache() == answers[i][2]);
+      // The base href must be absent exactly when no answer URL is given,
+      // and equal to the answer URL otherwise.
+      Assert
+          .assertTrue(
+              "got base href wrong on test " + i + " (got "
+                  + robotsMeta.getBaseHref() + ")",
+              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
+                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
+
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
deleted file mode 100644
index 15725ae..0000000
--- a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++ /dev/null
@@ -1,347 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import java.io.ByteArrayInputStream;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.StringTokenizer;
-
-import org.cyberneko.html.parsers.*;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-
-/**
- * Unit tests for DOMContentUtils.
- */
-public class TestDOMContentUtils {
-
-  private static final String[] testPages = {
-      new String("<html><head><title> title </title><script> script </script>"
-          + "</head><body> body <a href=\"http://www.nutch.org\">"
-          + " anchor </a><!--comment-->" + "</body></html>"),
-      new String("<html><head><title> title </title><script> script </script>"
-          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
-          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
-          + "</body></html>"),
-      new String("<html><head><title> </title>" + "</head><body> "
-          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
-          + "</a></a>" + "</body></html>"),
-      // this one relies on certain neko fixup behavior, possibly
-      // distributing the anchors into the LI's-but not the other
-      // anchors (outside of them, instead)! So you get a tree that
-      // looks like:
-      // ... <li> <a href=/> home </a> </li>
-      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
-      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
-      new String("<html><head><title> my title </title>"
-          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
-          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
-          + "</body></html>"),
-      // test frameset link extraction. The invalid frame in the middle will be
-      // fixed to a third standalone frame.
-      new String("<html><head><title> my title </title>"
-          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
-          + "</frame>" + "<frameset cols=\"20,*\">"
-          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
-          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
-          + "</frameset>" + "</frameset>" + "</body></html>"),
-      // test <area> and <iframe> link extraction + url normalization
-      new String(
-          "<html><head><title> my title </title>"
-              + "</head><body>"
-              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
-              + "<map name=\"green\">"
-              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
-              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
-              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
-              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
-              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
-      // test whitespace processing for plain text extraction
-      new String(
-          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
-              + " </head>\n"
-              + " <body>\n"
-              + "    <h1> Whitespace\ttest  </h1> \n"
-              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
-              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
-              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
-              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
-              + "<table>"
-              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
-              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
-              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
-              + "</table>put some text here<Br>and there."
-              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
-              + "         .        .        .         ." + "</body>  </html>"),
-
-      // test that <a rel=nofollow> links are not returned
-      new String("<html><head></head><body>"
-          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
-          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
-          + "</body></html>"),
-      // test that POST form actions are skipped
-      new String("<html><head></head><body>"
-          + "<form method='POST' action='/search.jsp'><input type=text>"
-          + "<input type=submit><p>test1</p></form>"
-          + "<form method='GET' action='/dummy.jsp'><input type=text>"
-          + "<input type=submit><p>test2</p></form></body></html>"),
-      // test that all form actions are skipped
-      new String("<html><head></head><body>"
-          + "<form method='POST' action='/search.jsp'><input type=text>"
-          + "<input type=submit><p>test1</p></form>"
-          + "<form method='GET' action='/dummy.jsp'><input type=text>"
-          + "<input type=submit><p>test2</p></form></body></html>"),
-      new String("<html><head><title> title </title>" + "</head><body>"
-          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
-          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
-      new String("<html><head><title> title </title>" + "</head><body>"
-          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
-          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
-          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
-      new String("<html><head><title> title </title>" + "</head><body>"
-          + "<a href=\"g\"><!--no anchor--></a>"
-          + "<a href=\"g1\"> <!--whitespace-->  </a>"
-          + "<a href=\"g2\">  <img src=test.gif alt='bla bla'> </a>"
-          + "</body></html>"), };
-
-  private static int SKIP = 9;
-
-  private static String[] testBaseHrefs = { "http://www.nutch.org",
-      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
-      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
-      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
-      "http://www.nutch.org//", "http://www.nutch.org/",
-      "http://www.nutch.org/", "http://www.nutch.org/",
-      "http://www.nutch.org/;something", "http://www.nutch.org/" };
-
-  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
-
-  private static URL[] testBaseHrefURLs = new URL[testPages.length];
-
-  private static final String[] answerText = {
-      "title body anchor",
-      "title body home bots",
-      "separate this from this",
-      "my title body home 1 2",
-      "my title",
-      "my title the bottom",
-      "my title Whitespace test whitespace test "
-          + "This is a whitespace test . Newlines should appear as space too. "
-          + "Tabs are spaces too. This is a break -> and the line after break . "
-          + "one two three space here space there no space "
-          + "one two two three three four put some text here and there. "
-          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
-      "test1 test2", "title anchor1 anchor2 anchor3",
-      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };
-
-  private static final String[] answerTitle = { "title", "title", "",
-      "my title", "my title", "my title", "my title", "", "", "", "title",
-      "title", "title" };
-
-  // note: should be in page-order
-  private static Outlink[][] answerOutlinks;
-
-  private static Configuration conf;
-  private static DOMContentUtils utils = null;
-
-  @Before
-  public void setup() {
-    conf = NutchConfiguration.create();
-    conf.setBoolean("parser.html.form.use_action", true);
-    utils = new DOMContentUtils(conf);
-    DOMFragmentParser parser = new DOMFragmentParser();
-    try {
-      parser
-          .setFeature(
-              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
-              true);
-    } catch (SAXException e) {
-    }
-    for (int i = 0; i < testPages.length; i++) {
-      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
-      try {
-        parser.parse(
-            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
-            node);
-        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
-      } catch (Exception e) {
-        Assert.assertTrue("caught exception: " + e, false);
-      }
-      testDOMs[i] = node;
-    }
-    try {
-      answerOutlinks = new Outlink[][] {
-          { new Outlink("http://www.nutch.org", "anchor"), },
-          { new Outlink("http://www.nutch.org/", "home"),
-              new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
-          { new Outlink("http://www.nutch.org/", "separate this"),
-              new Outlink("http://www.nutch.org/docs/ok", "from this"), },
-          { new Outlink("http://www.nutch.org/", "home"),
-              new Outlink("http://www.nutch.org/docs/1", "1"),
-              new Outlink("http://www.nutch.org/docs/2", "2"), },
-          { new Outlink("http://www.nutch.org/frames/top.html", ""),
-              new Outlink("http://www.nutch.org/frames/left.html", ""),
-              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
-              new Outlink("http://www.nutch.org/frames/right.html", ""), },
-          { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
-              new Outlink("http://www.nutch.org/index.html", ""),
-              new Outlink("http://www.nutch.org/maps/#bottom", ""),
-              new Outlink("http://www.nutch.org/bot.html", ""),
-              new Outlink("http://www.nutch.org/docs/index.html", ""), },
-          { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
-          {},
-          { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
-          {},
-          { new Outlink("http://www.nutch.org/;x", "anchor1"),
-              new Outlink("http://www.nutch.org/g;x", "anchor2"),
-              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
-          {
-              // this is tricky - see RFC3986 section 5.4.1 example 7
-              new Outlink("http://www.nutch.org/g", "anchor1"),
-              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
-              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
-              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-              new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
-                  "anchor5") },
-          { new Outlink("http://www.nutch.org/g", ""),
-              new Outlink("http://www.nutch.org/g1", ""),
-              new Outlink("http://www.nutch.org/g2", "bla bla"),
-              new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
-
-    } catch (MalformedURLException e) {
-
-    }
-  }
-
-  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
-    StringTokenizer st1 = new StringTokenizer(s1);
-    StringTokenizer st2 = new StringTokenizer(s2);
-
-    while (st1.hasMoreTokens()) {
-      if (!st2.hasMoreTokens())
-        return false;
-      if (!st1.nextToken().equals(st2.nextToken()))
-        return false;
-    }
-    if (st2.hasMoreTokens())
-      return false;
-    return true;
-  }
-
-  @Test
-  public void testGetText() {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      StringBuffer sb = new StringBuffer();
-      utils.getText(sb, testDOMs[i]);
-      String text = sb.toString();
-      Assert.assertTrue(
-          "expecting text: " + answerText[i]
-              + System.getProperty("line.separator")
-              + System.getProperty("line.separator") + "got text: " + text,
-          equalsIgnoreWhitespace(answerText[i], text));
-    }
-  }
-
-  @Test
-  public void testGetTitle() {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      StringBuffer sb = new StringBuffer();
-      utils.getTitle(sb, testDOMs[i]);
-      String text = sb.toString();
-      Assert.assertTrue(
-          "expecting text: " + answerText[i]
-              + System.getProperty("line.separator")
-              + System.getProperty("line.separator") + "got text: " + text,
-          equalsIgnoreWhitespace(answerTitle[i], text));
-    }
-  }
-
-  @Test
-  public void testGetOutlinks() {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
-      if (i == SKIP) {
-        conf.setBoolean("parser.html.form.use_action", false);
-        utils.setConf(conf);
-      } else {
-        conf.setBoolean("parser.html.form.use_action", true);
-        utils.setConf(conf);
-      }
-      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
-      Outlink[] outlinkArr = new Outlink[outlinks.size()];
-      outlinkArr = (Outlink[]) outlinks.toArray(outlinkArr);
-      compareOutlinks(answerOutlinks[i], outlinkArr);
-    }
-  }
-
-  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
-    for (int i = 0; i < o.length; i++) {
-      sb.append(o[i].toString());
-      sb.append(System.getProperty("line.separator"));
-    }
-  }
-
-  private static final String outlinksString(Outlink[] o) {
-    StringBuffer sb = new StringBuffer();
-    appendOutlinks(sb, o);
-    return sb.toString();
-  }
-
-  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
-    if (o1.length != o2.length) {
-      Assert.assertTrue(
-          "got wrong number of outlinks (expecting " + o1.length + ", got "
-              + o2.length + ")" + System.getProperty("line.separator")
-              + "answer: " + System.getProperty("line.separator")
-              + outlinksString(o1) + System.getProperty("line.separator")
-              + "got: " + System.getProperty("line.separator")
-              + outlinksString(o2) + System.getProperty("line.separator"),
-          false);
-    }
-
-    for (int i = 0; i < o1.length; i++) {
-      if (!o1[i].equals(o2[i])) {
-        Assert.assertTrue(
-            "got wrong outlinks at position " + i
-                + System.getProperty("line.separator") + "answer: "
-                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
-                + "', anchor: '" + o1[i].getAnchor() + "'"
-                + System.getProperty("line.separator") + "got: "
-                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
-                + "', anchor: '" + o2[i].getAnchor() + "'", false);
-
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
deleted file mode 100644
index 7099f50..0000000
--- a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import java.nio.charset.Charset;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.html.HtmlParser;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class TestHtmlParser {
-
-  public static final Logger LOG = LoggerFactory
-      .getLogger(TestHtmlParser.class);
-
-  private static final String encodingTestKeywords = "français, español, \u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a, \u010de\u0161tina, \u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac";
-  private static final String encodingTestBody = "<ul>\n  <li>français\n  <li>español\n  <li>\u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a\n  <li>\u010de\u0161tina\n  <li>\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac\n</ul>";
-  private static final String encodingTestContent = "<title>"
-      + encodingTestKeywords + "</title>\n"
-      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
-      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
-
-  private static String[][] encodingTestPages = {
-      {
-          "HTML4, utf-8, meta http-equiv, no quotes",
-          "utf-8",
-          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
-              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
-              + "<html>\n<head>\n"
-              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
-              + encodingTestContent },
-      {
-          "HTML4, utf-8, meta http-equiv, single quotes",
-          "utf-8",
-          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
-              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
-              + "<html>\n<head>\n"
-              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
-              + encodingTestContent },
-      {
-          "XHTML, utf-8, meta http-equiv, double quotes",
-          "utf-8",
-          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
-              + "<html>\n<head>\n"
-              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
-              + encodingTestContent },
-      {
-          "HTML5, utf-8, meta charset",
-          "utf-8",
-          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
-              + encodingTestContent },
-      { "HTML5, utf-8, BOM", "utf-8",
-          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
-      { "HTML5, utf-16, BOM", "utf-16",
-          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
-
-  private Configuration conf;
-  private Parser parser;
-
-  public TestHtmlParser() {
-    conf = NutchConfiguration.create();
-    parser = new HtmlParser();
-    parser.setConf(conf);
-  }
-
-  protected Parse parse(byte[] contentBytes) {
-    String dummyUrl = "http://dummy.url/";
-    return parser.getParse(
-        new Content(dummyUrl, dummyUrl, contentBytes, "text/html",
-            new Metadata(), conf)).get(dummyUrl);
-  }
-
-  @Test
-  public void testEncodingDetection() {
-    for (String[] testPage : encodingTestPages) {
-      String name = testPage[0];
-      Charset charset = Charset.forName(testPage[1]);
-      byte[] contentBytes = testPage[2].getBytes(charset);
-      Parse parse = parse(contentBytes);
-      String text = parse.getText();
-      String title = parse.getData().getTitle();
-      String keywords = parse.getData().getMeta("keywords");
-      LOG.info(name);
-      LOG.info("title:\t" + title);
-      LOG.info("keywords:\t" + keywords);
-      LOG.info("text:\t" + text);
-      Assert.assertEquals("Title not extracted properly (" + name + ")",
-          encodingTestKeywords, title);
-      for (String keyword : encodingTestKeywords.split(",\\s*")) {
-        Assert.assertTrue(keyword + " not found in text (" + name + ")",
-            text.contains(keyword));
-      }
-      Assert.assertNotNull("No keywords extracted", keywords);
-      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
-          encodingTestKeywords, keywords);
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
deleted file mode 100644
index 5089a10..0000000
--- a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import org.apache.nutch.parse.HTMLMetaTags;
-
-import java.io.ByteArrayInputStream;
-import java.net.URL;
-
-import org.cyberneko.html.parsers.*;
-import org.junit.Assert;
-import org.junit.Test;
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-
-/** Unit tests for HTMLMetaProcessor. */
-public class TestRobotsMetaProcessor {
-
-  /*
-   * 
-   * some sample tags:
-   * 
-   * <meta name="robots" content="index,follow"> <meta name="robots"
-   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
-   * <meta name="robots" content="noindex,nofollow">
-   * 
-   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
-   */
-
-  public static String[] tests = {
-      "<html><head><title>test page</title>"
-          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
-          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"all\"> "
-          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
-          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
-          + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"noindex,follow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"index,nofollow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"index,follow\"> "
-          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
-          + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
-          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
-          + " some text" + "</body></html>",
-
-  };
-
-  public static final boolean[][] answers = { { true, true, true }, // NONE
-      { false, false, true }, // all
-      { true, true, true }, // nOnE
-      { true, true, false }, // none
-      { true, true, false }, // noindex,nofollow
-      { true, false, false }, // noindex,follow
-      { false, true, false }, // index,nofollow
-      { false, false, false }, // index,follow
-      { false, false, false }, // missing!
-  };
-
-  private URL[][] currURLsAndAnswers;
-
-  @Test
-  public void testRobotsMetaProcessor() {
-    DOMFragmentParser parser = new DOMFragmentParser();
-    ;
-
-    try {
-      currURLsAndAnswers = new URL[][] {
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org/foo/"),
-              new URL("http://www.nutch.org/") },
-          { new URL("http://www.nutch.org"),
-              new URL("http://www.nutch.org/base/") } };
-    } catch (Exception e) {
-      Assert.assertTrue("couldn't make test URLs!", false);
-    }
-
-    for (int i = 0; i < tests.length; i++) {
-      byte[] bytes = tests[i].getBytes();
-
-      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
-
-      try {
-        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
-      } catch (Exception e) {
-        e.printStackTrace();
-      }
-
-      HTMLMetaTags robotsMeta = new HTMLMetaTags();
-      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
-
-      Assert.assertTrue("got index wrong on test " + i,
-          robotsMeta.getNoIndex() == answers[i][0]);
-      Assert.assertTrue("got follow wrong on test " + i,
-          robotsMeta.getNoFollow() == answers[i][1]);
-      Assert.assertTrue("got cache wrong on test " + i,
-          robotsMeta.getNoCache() == answers[i][2]);
-      Assert
-          .assertTrue(
-              "got base href wrong on test " + i + " (got "
-                  + robotsMeta.getBaseHref() + ")",
-              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
-                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
-                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
-
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java b/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java
new file mode 100644
index 0000000..024aadf
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.metatags;
+
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestMetatagParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testMetatags.html";
+  private String sampleFileMultival = "testMultivalueMetatags.html";
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      metadata = parse.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+    return metadata;
+  }
+
+  @Test
+  /** test defaults: keywords and description */
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+
+    // check that we get the same values
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+
+    Assert.assertEquals(description, parseMeta.get("metatag.description"));
+    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+
+  @Test
+  /** test multiple metatags resulting in metadata with multiple values */
+  public void testMultiValueMetatags() {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("metatags.names", "keywords,DC.creator");
+    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
+
+    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
+
+    String failMessage = "One value of metatag with multiple values is missing: ";
+
+    Set<String> valueSet = new TreeSet<String>();
+    for (String val : parseMeta.getValues("metatag.dc.creator")) {
+      valueSet.add(val);
+    }
+    String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" };
+    for (String val : expectedValues1) {
+      Assert.assertTrue(failMessage + val, valueSet.contains(val));
+    }
+
+    valueSet.clear();
+    for (String val : parseMeta.getValues("metatag.keywords")) {
+      valueSet.add(val);
+    }
+    String[] expectedValues2 = { "robot d'indexation", "web crawler",
+        "Webcrawler" };
+    for (String val : expectedValues2) {
+      Assert.assertTrue(failMessage + val, valueSet.contains(val));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
deleted file mode 100644
index 024aadf..0000000
--- a/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.metatags;
-
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestMetatagParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  private String sampleDir = System.getProperty("test.data", ".");
-  private String sampleFile = "testMetatags.html";
-  private String sampleFileMultival = "testMultivalueMetatags.html";
-  private String description = "This is a test of description";
-  private String keywords = "This is a test of keywords";
-
-  public Metadata parseMeta(String fileName, Configuration conf) {
-    Metadata metadata = null;
-    try {
-      String urlString = "file:" + sampleDir + fileSeparator + fileName;
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-      metadata = parse.getData().getParseMeta();
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.toString());
-    }
-    return metadata;
-  }
-
-  @Test
-  /** test defaults: keywords and description */
-  public void testIt() {
-    Configuration conf = NutchConfiguration.create();
-
-    // check that we get the same values
-    Metadata parseMeta = parseMeta(sampleFile, conf);
-
-    Assert.assertEquals(description, parseMeta.get("metatag.description"));
-    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
-  }
-
-  @Test
-  /** test multiple metatags resulting in metadata with multiple values */
-  public void testMultiValueMetatags() {
-    Configuration conf = NutchConfiguration.create();
-    conf.set("metatags.names", "keywords,DC.creator");
-    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
-
-    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
-
-    String failMessage = "One value of metatag with multiple values is missing: ";
-
-    Set<String> valueSet = new TreeSet<String>();
-    for (String val : parseMeta.getValues("metatag.dc.creator")) {
-      valueSet.add(val);
-    }
-    String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" };
-    for (String val : expectedValues1) {
-      Assert.assertTrue(failMessage + val, valueSet.contains(val));
-    }
-
-    valueSet.clear();
-    for (String val : parseMeta.getValues("metatag.keywords")) {
-      valueSet.add(val);
-    }
-    String[] expectedValues2 = { "robot d'indexation", "web crawler",
-        "Webcrawler" };
-    for (String val : expectedValues2) {
-      Assert.assertTrue(failMessage + val, valueSet.contains(val));
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java b/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java
new file mode 100644
index 0000000..593d5ed
--- /dev/null
+++ b/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.replace;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestParseReplace {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testParseReplace.html";
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      metadata = parse.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+    return metadata;
+  }
+
+  @Test
+  /** test defaults: keywords and description */
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+
+    // check that we get the same values
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+
+    Assert.assertEquals(description, parseMeta.get("metatag.description"));
+    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java b/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
deleted file mode 100644
index 593d5ed..0000000
--- a/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.replace;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestParseReplace {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  private String sampleDir = System.getProperty("test.data", ".");
-  private String sampleFile = "testParseReplace.html";
-  private String description = "This is a test of description";
-  private String keywords = "This is a test of keywords";
-
-  public Metadata parseMeta(String fileName, Configuration conf) {
-    Metadata metadata = null;
-    try {
-      String urlString = "file:" + sampleDir + fileSeparator + fileName;
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-      metadata = parse.getData().getParseMeta();
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.toString());
-    }
-    return metadata;
-  }
-
-  @Test
-  /** test defaults: keywords and description */
-  public void testIt() {
-    Configuration conf = NutchConfiguration.create();
-
-    // check that we get the same values
-    Metadata parseMeta = parseMeta(sampleFile, conf);
-
-    Assert.assertEquals(description, parseMeta.get("metatag.description"));
-    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
new file mode 100644
index 0000000..129b85f
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.swf;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for SWFParser.
+ */
+public class TestSWFParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  private String[] sampleFiles = new String[] { "test1.swf", "test2.swf",
+      "test3.swf" };
+  private String[] sampleTexts = new String[] { "test1.txt", "test2.txt",
+      "test3.txt" };
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+
+      parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+      Assert.assertTrue(sampleTexts[i].equals(text));
+    }
+  }
+
+  public TestSWFParser() {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      try {
+        // read the test string
+        FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+            + sampleTexts[i]);
+        StringBuffer sb = new StringBuffer();
+        int len = 0;
+        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+        char[] buf = new char[1024];
+        while ((len = isr.read(buf)) > 0) {
+          sb.append(buf, 0, len);
+        }
+        isr.close();
+        sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim();
+      } catch (Exception e) {
+        e.printStackTrace();
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java b/nutch-plugins/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
deleted file mode 100644
index 129b85f..0000000
--- a/nutch-plugins/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.swf;
-
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Unit tests for SWFParser.
- */
-public class TestSWFParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  private String[] sampleFiles = new String[] { "test1.swf", "test2.swf",
-      "test3.swf" };
-  private String[] sampleTexts = new String[] { "test1.txt", "test2.txt",
-      "test3.txt" };
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-    Configuration conf = NutchConfiguration.create();
-
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-
-      parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
-      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
-      Assert.assertTrue(sampleTexts[i].equals(text));
-    }
-  }
-
-  public TestSWFParser() {
-    for (int i = 0; i < sampleFiles.length; i++) {
-      try {
-        // read the test string
-        FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
-            + sampleTexts[i]);
-        StringBuffer sb = new StringBuffer();
-        int len = 0;
-        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
-        char[] buf = new char[1024];
-        while ((len = isr.read(buf)) > 0) {
-          sb.append(buf, 0, len);
-        }
-        isr.close();
-        sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim();
-      } catch (Exception e) {
-        e.printStackTrace();
-      }
-    }
-  }
-
-}