You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:21 UTC

[37/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
new file mode 100644
index 0000000..15725ae
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -0,0 +1,347 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.cyberneko.html.parsers.*;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils {
+
+  private static final String[] testPages = {
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors (outside of them, instead)! So you get a tree that
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+      // test frameset link extraction. The invalid frame in the middle will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+          + "</frameset>" + "</frameset>" + "</body></html>"),
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\"><!--no anchor--></a>"
+          + "<a href=\"g1\"> <!--whitespace-->  </a>"
+          + "<a href=\"g2\">  <img src=test.gif alt='bla bla'> </a>"
+          + "</body></html>"), };
+
+  private static int SKIP = 9;
+
+  private static String[] testBaseHrefs = { "http://www.nutch.org",
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something", "http://www.nutch.org/" };
+
+  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+  private static final String[] answerText = {
+      "title body anchor",
+      "title body home bots",
+      "separate this from this",
+      "my title body home 1 2",
+      "my title",
+      "my title the bottom",
+      "my title Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break . "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "title anchor1 anchor2 anchor3",
+      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };
+
+  private static final String[] answerTitle = { "title", "title", "",
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title", "title" };
+
+  // note: should be in page-order
+  private static Outlink[][] answerOutlinks;
+
+  private static Configuration conf;
+  private static DOMContentUtils utils = null;
+
+  /**
+   * Builds the shared fixtures before each test: the configuration and
+   * {@link DOMContentUtils} instance, one parsed DOM fragment and base URL per
+   * entry of {@code testPages}, and the expected outlinks per page.
+   * Any failure while preparing a fixture fails the test immediately.
+   */
+  @Before
+  public void setup() {
+    conf = NutchConfiguration.create();
+    conf.setBoolean("parser.html.form.use_action", true);
+    utils = new DOMContentUtils(conf);
+    DOMFragmentParser parser = new DOMFragmentParser();
+    try {
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+              true);
+    } catch (SAXException ignored) {
+      // Best effort: if the feature cannot be set, continue with the parser's
+      // default behaviour.
+    }
+    for (int i = 0; i < testPages.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+      try {
+        parser.parse(
+            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+            node);
+        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+      } catch (Exception e) {
+        Assert.fail("caught exception: " + e);
+      }
+      testDOMs[i] = node;
+    }
+    try {
+      // Expected outlinks, one row per test page, in page order.
+      answerOutlinks = new Outlink[][] {
+          { new Outlink("http://www.nutch.org", "anchor"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+          { new Outlink("http://www.nutch.org/", "separate this"),
+              new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/1", "1"),
+              new Outlink("http://www.nutch.org/docs/2", "2"), },
+          { new Outlink("http://www.nutch.org/frames/top.html", ""),
+              new Outlink("http://www.nutch.org/frames/left.html", ""),
+              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+              new Outlink("http://www.nutch.org/frames/right.html", ""), },
+          { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+              new Outlink("http://www.nutch.org/index.html", ""),
+              new Outlink("http://www.nutch.org/maps/#bottom", ""),
+              new Outlink("http://www.nutch.org/bot.html", ""),
+              new Outlink("http://www.nutch.org/docs/index.html", ""), },
+          { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+          {},
+          { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+          {},
+          { new Outlink("http://www.nutch.org/;x", "anchor1"),
+              new Outlink("http://www.nutch.org/g;x", "anchor2"),
+              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+          {
+              // this is tricky - see RFC3986 section 5.4.1 example 7
+              new Outlink("http://www.nutch.org/g", "anchor1"),
+              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+              new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                  "anchor5") },
+          { new Outlink("http://www.nutch.org/g", ""),
+              new Outlink("http://www.nutch.org/g1", ""),
+              new Outlink("http://www.nutch.org/g2", "bla bla"),
+              new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
+
+    } catch (MalformedURLException e) {
+      // Without this, answerOutlinks would stay null and the outlink test
+      // would die later with an unrelated NullPointerException.
+      Assert.fail("could not build expected outlinks: " + e);
+    }
+  }
+
+  /**
+   * Compares two strings token by token, where tokens are the runs of
+   * characters separated by the default {@link StringTokenizer} delimiters.
+   *
+   * @return true when both strings yield the same sequence of tokens
+   */
+  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+    StringTokenizer left = new StringTokenizer(s1);
+    StringTokenizer right = new StringTokenizer(s2);
+
+    while (left.hasMoreTokens() && right.hasMoreTokens()) {
+      if (!left.nextToken().equals(right.nextToken())) {
+        return false;
+      }
+    }
+    // Equal only when both token streams ran out at the same time.
+    return !left.hasMoreTokens() && !right.hasMoreTokens();
+  }
+
+  /**
+   * Extracts the text of every test page with {@code utils.getText} and
+   * compares it, ignoring whitespace differences, with {@code answerText}.
+   */
+  @Test
+  public void testGetText() {
+    if (testDOMs[0] == null) {
+      setup();
+    }
+    for (int page = 0; page < testPages.length; page++) {
+      StringBuffer buffer = new StringBuffer();
+      utils.getText(buffer, testDOMs[page]);
+      String extracted = buffer.toString();
+      String message = "expecting text: " + answerText[page]
+          + System.getProperty("line.separator")
+          + System.getProperty("line.separator") + "got text: " + extracted;
+      Assert.assertTrue(message,
+          equalsIgnoreWhitespace(answerText[page], extracted));
+    }
+  }
+
+  /**
+   * Extracts the title of every test page with {@code utils.getTitle} and
+   * compares it, ignoring whitespace differences, with {@code answerTitle}.
+   */
+  @Test
+  public void testGetTitle() {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getTitle(sb, testDOMs[i]);
+      String title = sb.toString();
+      // The failure message reports answerTitle, the array actually compared.
+      Assert.assertTrue(
+          "expecting title: " + answerTitle[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got title: " + title,
+          equalsIgnoreWhitespace(answerTitle[i], title));
+    }
+  }
+
+  /**
+   * Collects the outlinks of every test page and compares them with the
+   * matching row of {@code answerOutlinks}. The
+   * {@code parser.html.form.use_action} option is switched off only for the
+   * page at index {@code SKIP} and on for all others.
+   */
+  @Test
+  public void testGetOutlinks() {
+    if (testDOMs[0] == null) {
+      setup();
+    }
+    for (int page = 0; page < testPages.length; page++) {
+      conf.setBoolean("parser.html.form.use_action", page != SKIP);
+      utils.setConf(conf);
+
+      ArrayList<Outlink> found = new ArrayList<Outlink>();
+      utils.getOutlinks(testBaseHrefURLs[page], found, testDOMs[page]);
+      Outlink[] actual = found.toArray(new Outlink[found.size()]);
+      compareOutlinks(answerOutlinks[page], actual);
+    }
+  }
+
+  /**
+   * Appends the string form of each outlink to {@code sb}, each followed by
+   * the platform line separator.
+   */
+  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+    final String newline = System.getProperty("line.separator");
+    for (Outlink link : o) {
+      sb.append(link.toString()).append(newline);
+    }
+  }
+
+  /**
+   * Renders the given outlinks as one string, one outlink per line, each line
+   * terminated by the platform line separator.
+   */
+  private static final String outlinksString(Outlink[] o) {
+    StringBuffer rendered = new StringBuffer();
+    for (int idx = 0; idx < o.length; idx++) {
+      rendered.append(o[idx].toString());
+      rendered.append(System.getProperty("line.separator"));
+    }
+    return rendered.toString();
+  }
+
+  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+    if (o1.length != o2.length) {
+      Assert.assertTrue(
+          "got wrong number of outlinks (expecting " + o1.length + ", got "
+              + o2.length + ")" + System.getProperty("line.separator")
+              + "answer: " + System.getProperty("line.separator")
+              + outlinksString(o1) + System.getProperty("line.separator")
+              + "got: " + System.getProperty("line.separator")
+              + outlinksString(o2) + System.getProperty("line.separator"),
+          false);
+    }
+
+    for (int i = 0; i < o1.length; i++) {
+      if (!o1[i].equals(o2[i])) {
+        Assert.assertTrue(
+            "got wrong outlinks at position " + i
+                + System.getProperty("line.separator") + "answer: "
+                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+                + "', anchor: '" + o1[i].getAnchor() + "'"
+                + System.getProperty("line.separator") + "got: "
+                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+                + "', anchor: '" + o2[i].getAnchor() + "'", false);
+
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
new file mode 100644
index 0000000..7099f50
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.html.HtmlParser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestHtmlParser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TestHtmlParser.class);
+
+  private static final String encodingTestKeywords = "fran\u00e7ais, espa\u00f1ol, \u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a, \u010de\u0161tina, \u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac";
+  private static final String encodingTestBody = "<ul>\n  <li>fran\u00e7ais\n  <li>espa\u00f1ol\n  <li>\u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a\n  <li>\u010de\u0161tina\n  <li>\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac\n</ul>";
+  private static final String encodingTestContent = "<title>"
+      + encodingTestKeywords + "</title>\n"
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  private static String[][] encodingTestPages = {
+      {
+          "HTML4, utf-8, meta http-equiv, no quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML4, utf-8, meta http-equiv, single quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+              + encodingTestContent },
+      {
+          "XHTML, utf-8, meta http-equiv, double quotes",
+          "utf-8",
+          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML5, utf-8, meta charset",
+          "utf-8",
+          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
+              + encodingTestContent },
+      { "HTML5, utf-8, BOM", "utf-8",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
+      { "HTML5, utf-16, BOM", "utf-16",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
+
+  private Configuration conf;
+  private Parser parser;
+
+  public TestHtmlParser() {
+    conf = NutchConfiguration.create();
+    parser = new HtmlParser();
+    parser.setConf(conf);
+  }
+
+  /**
+   * Wraps the given bytes as {@code text/html} content under a dummy URL,
+   * runs the parser on it and returns the parse stored for that URL.
+   *
+   * @param contentBytes raw bytes of the HTML page
+   * @return the parse obtained for the dummy URL
+   */
+  protected Parse parse(byte[] contentBytes) {
+    String dummyUrl = "http://dummy.url/";
+    Content content = new Content(dummyUrl, dummyUrl, contentBytes,
+        "text/html", new Metadata(), conf);
+    return parser.getParse(content).get(dummyUrl);
+  }
+
+  /**
+   * For every entry of {@code encodingTestPages}, encodes the page in the
+   * listed charset, parses it and checks that title, body text and the
+   * "keywords" meta value all match {@code encodingTestKeywords}.
+   */
+  @Test
+  public void testEncodingDetection() {
+    for (String[] page : encodingTestPages) {
+      String name = page[0];
+      Charset encoding = Charset.forName(page[1]);
+      Parse parsed = parse(page[2].getBytes(encoding));
+
+      String text = parsed.getText();
+      String title = parsed.getData().getTitle();
+      String keywords = parsed.getData().getMeta("keywords");
+
+      LOG.info(name);
+      LOG.info("title:\t" + title);
+      LOG.info("keywords:\t" + keywords);
+      LOG.info("text:\t" + text);
+
+      Assert.assertEquals("Title not extracted properly (" + name + ")",
+          encodingTestKeywords, title);
+
+      // Every comma-separated keyword must show up in the extracted text.
+      String[] expectedWords = encodingTestKeywords.split(",\\s*");
+      for (int k = 0; k < expectedWords.length; k++) {
+        Assert.assertTrue(expectedWords[k] + " not found in text (" + name
+            + ")", text.contains(expectedWords[k]));
+      }
+
+      Assert.assertNotNull("No keywords extracted", keywords);
+      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
+          encodingTestKeywords, keywords);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
new file mode 100644
index 0000000..5089a10
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.cyberneko.html.parsers.*;
+import org.junit.Assert;
+import org.junit.Test;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor {
+
+  /*
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
+
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
+  };
+
+  private URL[][] currURLsAndAnswers;
+
+  /**
+   * Parses every page in {@code tests}, runs
+   * {@code HTMLMetaProcessor.getMetaTags} on the result and checks the
+   * noindex, nofollow and nocache flags against {@code answers} and the base
+   * href against the expected URL for that page. A page that cannot be parsed
+   * fails the test instead of being checked as an empty fragment.
+   */
+  @Test
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser = new DOMFragmentParser();
+
+    try {
+      // Per test page: { URL passed to getMetaTags, expected base href }.
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
+    } catch (Exception e) {
+      Assert.fail("couldn't make test URLs!");
+    }
+
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
+
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+      try {
+        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+      } catch (Exception e) {
+        // Checking meta tags of an unparsed fragment would only hide the
+        // real cause, so stop here.
+        Assert.fail("couldn't parse test " + i + ": " + e);
+      }
+
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
+
+      Assert.assertEquals("got index wrong on test " + i, answers[i][0],
+          robotsMeta.getNoIndex());
+      Assert.assertEquals("got follow wrong on test " + i, answers[i][1],
+          robotsMeta.getNoFollow());
+      Assert.assertEquals("got cache wrong on test " + i, answers[i][2],
+          robotsMeta.getNoCache());
+      // assertEquals treats two nulls as equal, covering the "no base" case.
+      Assert.assertEquals("got base href wrong on test " + i,
+          currURLsAndAnswers[i][1], robotsMeta.getBaseHref());
+
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/build.xml b/nutch-plugins/parse-js/build.xml
new file mode 100644
index 0000000..d9c2146
--- /dev/null
+++ b/nutch-plugins/parse-js/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-js" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/ivy.xml b/nutch-plugins/parse-js/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-js/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/plugin.xml b/nutch-plugins/parse-js/plugin.xml
new file mode 100644
index 0000000..9c06c2a
--- /dev/null
+++ b/nutch-plugins/parse-js/plugin.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-js"
+   name="JavaScript Parser"
+   version="1.0.0"
+   provider-name="nutch.org">
+   <!-- Runtime: the plugin jar, with all of its classes exported. -->
+   <runtime>
+      <library name="parse-js.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+   <!-- JSParseFilter is registered twice: first as a Parser (content type application/x-javascript, path suffix js), then as an HtmlParseFilter. -->
+   <extension id="org.apache.nutch.parse.js"
+              name="JS Parser"
+              point="org.apache.nutch.parse.Parser">
+      <implementation id="JSParser"
+         class="org.apache.nutch.parse.js.JSParseFilter">
+        <parameter name="contentType" value="application/x-javascript"/>
+        <parameter name="pathSuffix"  value="js"/>
+      </implementation>
+   </extension>
+   <extension id="org.apache.nutch.parse.js.JSParseFilter"
+              name="Parse JS Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="JSParseFilter"
+         class="org.apache.nutch.parse.js.JSParseFilter">
+        <parameter name="contentType" value="application/x-javascript"/>
+        <parameter name="pathSuffix"  value=""/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/pom.xml b/nutch-plugins/parse-js/pom.xml
new file mode 100644
index 0000000..68d5770
--- /dev/null
+++ b/nutch-plugins/parse-js/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-js</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-js</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java
new file mode 100644
index 0000000..8c95372
--- /dev/null
+++ b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.js;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * This class is a heuristic link extractor for JavaScript files and code
+ * snippets. The general idea of a two-pass regex matching comes from Heritrix.
+ * Parts of the code come from OutlinkExtractor.java
+ *
+ * <p>
+ * It is used in two roles: as an {@link HtmlParseFilter} it scans
+ * <code>script</code> elements, <code>on*</code> attributes and
+ * <code>javascript:</code> hrefs of an already parsed HTML document; as a
+ * {@link Parser} it handles content of type
+ * <code>application/x-javascript</code>.
+ * </p>
+ */
+public class JSParseFilter implements HtmlParseFilter, Parser {
+  public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);
+
+  private static final int MAX_TITLE_LEN = 80;
+
+  /**
+   * Charset used to decode raw script bytes. Fixed to UTF-8 so that the result
+   * does not depend on the platform default charset.
+   */
+  private static final Charset SCRIPT_CHARSET = Charset.forName("UTF-8");
+
+  private Configuration conf;
+
+  /**
+   * Adds the links found in the JavaScript parts of <code>doc</code> to the
+   * outlinks of the parse stored under the content URL.
+   *
+   * @param content
+   *          the fetched content; its base URL is used to resolve relative
+   *          links
+   * @param parseResult
+   *          the parse result to amend; the entry for the content URL is
+   *          replaced only when at least one JavaScript link was found
+   * @param metaTags
+   *          meta tags of the document (passed through, not evaluated here)
+   * @param doc
+   *          DOM of the parsed document
+   * @return the (possibly updated) <code>parseResult</code>
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+
+    String url = content.getBaseUrl();
+    List<Outlink> outlinks = new ArrayList<Outlink>();
+    walk(doc, parse, metaTags, url, outlinks);
+    if (!outlinks.isEmpty()) {
+      ParseData oldData = parse.getData();
+      // links extracted from JavaScript come first, followed by the outlinks
+      // the parse already had
+      outlinks.addAll(Arrays.asList(oldData.getOutlinks()));
+      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
+      ParseData parseData = new ParseData(oldData.getStatus(),
+          oldData.getTitle(), newlinks, oldData.getContentMeta(),
+          oldData.getParseMeta());
+
+      // replace original parse obj with new one
+      parseResult.put(content.getUrl(), new ParseText(parse.getText()),
+          parseData);
+    }
+    return parseResult;
+  }
+
+  /**
+   * Recursively visits the DOM below <code>n</code> and collects links from
+   * embedded JavaScript into <code>outlinks</code>.
+   *
+   * @param n
+   *          node to inspect
+   * @param parse
+   *          parse of the document (passed on to recursive calls only)
+   * @param metaTags
+   *          meta tags of the document (passed on to recursive calls only)
+   * @param base
+   *          base URL used to resolve relative links
+   * @param outlinks
+   *          list the extracted links are appended to
+   */
+  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
+      List<Outlink> outlinks) {
+    if (n instanceof Element) {
+      String name = n.getNodeName();
+      if (name.equalsIgnoreCase("script")) {
+        // The "language" attribute is not evaluated: every script element is
+        // treated as JavaScript.
+        NodeList nn = n.getChildNodes();
+        if (nn.getLength() > 0) {
+          StringBuilder script = new StringBuilder();
+          for (int i = 0; i < nn.getLength(); i++) {
+            if (i > 0) {
+              script.append('\n');
+            }
+            script.append(nn.item(i).getNodeValue());
+          }
+          Outlink[] links = getJSLinks(script.toString(), "", base);
+          if (links.length > 0) {
+            outlinks.addAll(Arrays.asList(links));
+          }
+          // no other children of interest here, go one level up.
+          return;
+        }
+      } else {
+        // process all HTML 4.0 events, if present...
+        // Window: onload,onunload
+        // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
+        // Keyboard: onkeydown,onkeypress,onkeyup
+        // Mouse:
+        // onclick,ondblclick,onmousedown,onmouseout,onmouseover,onmouseup
+        NamedNodeMap attrs = n.getAttributes();
+        int len = attrs.getLength();
+        for (int i = 0; i < len; i++) {
+          Node anode = attrs.item(i);
+          // Attribute names are compared case-insensitively, like the
+          // "script" and "href" names; Locale.ROOT keeps the lower-casing
+          // independent of the default locale.
+          String attrName = anode.getNodeName().toLowerCase(Locale.ROOT);
+          Outlink[] links = null;
+          if (attrName.startsWith("on")) {
+            links = getJSLinks(anode.getNodeValue(), "", base);
+          } else if (attrName.equals("href")) {
+            String val = anode.getNodeValue();
+            if (val != null
+                && val.toLowerCase(Locale.ROOT).indexOf("javascript:") != -1) {
+              links = getJSLinks(val, "", base);
+            }
+          }
+          if (links != null && links.length > 0) {
+            outlinks.addAll(Arrays.asList(links));
+          }
+        }
+      }
+    }
+    NodeList nl = n.getChildNodes();
+    for (int i = 0; i < nl.getLength(); i++) {
+      walk(nl.item(i), parse, metaTags, base, outlinks);
+    }
+  }
+
+  /**
+   * Parses JavaScript content: the whole script becomes the parse text, the
+   * first line (at most {@value #MAX_TITLE_LEN} characters) the title, and the
+   * URLs found in string literals the outlinks.
+   *
+   * @param c
+   *          the content to parse; a missing or blank content type is accepted,
+   *          any other type must start with
+   *          <code>application/x-javascript</code>
+   * @return the parse result, or an empty parse result with status
+   *         {@link ParseStatus#FAILED_INVALID_FORMAT} for other content types
+   */
+  public ParseResult getParse(Content c) {
+    String type = c.getContentType();
+    // Trim before the prefix check (the emptiness check already ignores
+    // surrounding whitespace) and lower-case independently of the default
+    // locale.
+    String normalizedType = type == null ? "" : type.trim().toLowerCase(
+        Locale.ROOT);
+    if (normalizedType.length() > 0
+        && !normalizedType.startsWith("application/x-javascript")) {
+      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
+          "Content not JavaScript: '" + type + "'").getEmptyParseResult(
+          c.getUrl(), getConf());
+    }
+    // Decode with an explicit charset instead of the platform default.
+    String script = new String(c.getContent(), SCRIPT_CHARSET);
+    Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
+    // Title? use the first line of the script...
+    int firstLineEnd = script.indexOf('\n');
+    if (firstLineEnd == -1) {
+      firstLineEnd = script.length();
+    }
+    String title = script.substring(0, Math.min(MAX_TITLE_LEN, firstLineEnd));
+    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
+        c.getMetadata());
+    return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
+  }
+
+  // First pass: a single- or double-quoted literal without whitespace or
+  // quotes inside; the opening delimiter (optionally preceded by backslashes)
+  // must be repeated at the end. Group 2 is the literal's content.
+  private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
+  // A simple pattern. This allows also invalid URL characters.
+  private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
+
+  // Alternative pattern, which limits valid url characters.
+  // private static final String URI_PATTERN =
+  // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
+
+  /**
+   * This method extracts URLs from literals embedded in JavaScript.
+   *
+   * <p>
+   * Every string literal matching {@link #STRING_PATTERN} whose content also
+   * matches {@link #URI_PATTERN} is a candidate. Candidates starting with
+   * <code>www.</code> are prefixed with <code>http://</code>; all others are
+   * resolved against <code>base</code> and skipped when that fails.
+   * </p>
+   *
+   * @param plainText
+   *          JavaScript source, may be <code>null</code>
+   * @param anchor
+   *          anchor text assigned to every extracted link
+   * @param base
+   *          base URL; if it is malformed only absolute candidates can be
+   *          resolved
+   * @return the extracted links, never <code>null</code>
+   */
+  private Outlink[] getJSLinks(String plainText, String anchor, String base) {
+
+    final List<Outlink> outlinks = new ArrayList<Outlink>();
+    if (plainText == null) {
+      // nothing to scan; avoids a failure (and an error log entry) in the
+      // matcher
+      return new Outlink[0];
+    }
+
+    URL baseURL = null;
+    try {
+      baseURL = new URL(base);
+    } catch (MalformedURLException e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getJSLinks", e);
+      }
+    }
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final int options = Perl5Compiler.CASE_INSENSITIVE_MASK
+          | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK;
+      final Pattern literalPattern = cp.compile(STRING_PATTERN, options);
+      final Pattern uriPattern = cp.compile(URI_PATTERN, options);
+      final PatternMatcher literalMatcher = new Perl5Matcher();
+      final PatternMatcher uriMatcher = new Perl5Matcher();
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+      // loop the matches
+      while (literalMatcher.contains(input, literalPattern)) {
+        MatchResult result = literalMatcher.getMatch();
+        String url = result.group(2);
+        // second pass: the whole literal must look like a URI
+        if (!uriMatcher.matches(new PatternMatcherInput(url), uriPattern)) {
+          continue;
+        }
+        if (url.startsWith("www.")) {
+          url = "http://" + url;
+        } else {
+          // See if candidate URL is parseable. If not, pass and move on to
+          // the next match.
+          try {
+            url = new URL(baseURL, url).toString();
+          } catch (MalformedURLException ex) {
+            if (LOG.isTraceEnabled()) {
+              LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
+                  + baseURL + "'", ex);
+            }
+            continue;
+          }
+        }
+        url = url.replace("&amp;", "&");
+        if (LOG.isTraceEnabled()) {
+          LOG.trace(" - outlink from JS: '" + url + "'");
+        }
+        outlinks.add(new Outlink(url, anchor));
+      }
+    } catch (Exception ex) {
+      // Any other failure (pattern compilation, Outlink creation, ...) ends
+      // the extraction; the links collected so far are still returned.
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getJSLinks", ex);
+      }
+    }
+
+    return outlinks.toArray(new Outlink[outlinks.size()]);
+  }
+
+  /**
+   * Command line tool: prints the outlinks extracted from a JavaScript file.
+   *
+   * @param args
+   *          <code>file.js baseURL</code>; the file is read as UTF-8
+   * @throws Exception
+   *           if the file cannot be read
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
+      return;
+    }
+    StringBuilder sb = new StringBuilder();
+    InputStream in = new FileInputStream(args[0]);
+    try {
+      BufferedReader br = new BufferedReader(new InputStreamReader(in,
+          SCRIPT_CHARSET));
+      String line;
+      while ((line = br.readLine()) != null) {
+        sb.append(line).append('\n');
+      }
+    } finally {
+      // release the file handle even when reading fails
+      in.close();
+    }
+
+    JSParseFilter parseFilter = new JSParseFilter();
+    parseFilter.setConf(NutchConfiguration.create());
+    Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
+    System.out.println("Outlinks extracted: " + links.length);
+    for (int i = 0; i < links.length; i++) {
+      System.out.println(" - " + links[i]);
+    }
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java
new file mode 100644
index 0000000..36d0d14
--- /dev/null
+++ b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parser and parse filter plugin to extract all (possible) links
+ * from JavaScript files and embedded JavaScript code snippets.
+ */
+package org.apache.nutch.parse.js;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/README.txt b/nutch-plugins/parse-metatags/README.txt
new file mode 100644
index 0000000..0d5b009
--- /dev/null
+++ b/nutch-plugins/parse-metatags/README.txt
@@ -0,0 +1,17 @@
+Parse-metatags plugin
+
+The parse-metatags plugin consists of an HtmlParseFilter which takes as parameter a list of metatag names, with '*' (all metatags) as the default value. The names are separated by ','.
+In order to extract the values of the metatags description and keywords, you must specify in nutch-site.xml
+
+<property>
+  <name>metatags.names</name>
+  <value>description,keywords</value>
+</property>
+
+The plugin prefixes the metatag names with 'metatag.' when storing them in the parse metadata. For instance, to index description and keywords, you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
+  
+This code has been developed by DigitalPebble Ltd and offered to the community by ANT.com
+
+
+
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/build.xml b/nutch-plugins/parse-metatags/build.xml
new file mode 100644
index 0000000..e30292d
--- /dev/null
+++ b/nutch-plugins/parse-metatags/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-metatags" default="jar-core">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- Deploy unit test dependencies: runs the "deploy" target of the plugins listed below -->
+	<target name="deps-test">
+		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+		<ant target="deploy" inheritall="false" dir="../protocol-file" />
+	</target>
+
+
+	<!-- For JUnit tests: copy the sample HTML pages into ${build.test}/data (these tasks are declared outside any target) -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.html" />
+		</fileset>
+	</copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/ivy.xml b/nutch-plugins/parse-metatags/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-metatags/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/plugin.xml b/nutch-plugins/parse-metatags/plugin.xml
new file mode 100644
index 0000000..07933fa
--- /dev/null
+++ b/nutch-plugins/parse-metatags/plugin.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-metatags"
+   name="MetaTags"
+   version="1.0"
+   provider-name="digitalpebble.com">
+
+   <runtime>
+      <library name="parse-metatags.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   <!-- Registers MetaTagsParser at the HtmlParseFilter extension point. -->
+   <extension id="org.apache.nutch.parse.metatags.parser"
+              name="MetaTags Parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="MetaTagsParser"
+                      class="org.apache.nutch.parse.metatags.MetaTagsParser"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/pom.xml b/nutch-plugins/parse-metatags/pom.xml
new file mode 100644
index 0000000..e96d404
--- /dev/null
+++ b/nutch-plugins/parse-metatags/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-metatags</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-metatags</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/sample/testMetatags.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/sample/testMetatags.html b/nutch-plugins/parse-metatags/sample/testMetatags.html
new file mode 100644
index 0000000..e9e8e6b
--- /dev/null
+++ b/nutch-plugins/parse-metatags/sample/testMetatags.html
@@ -0,0 +1,9 @@
+<html>
+<head>
+<meta name="Keywords" content="This is a test of keywords" />
+<meta name="Description" content="This is a test of description" />
+</head>
+<body>
+text of the document
+</body>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html b/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html
new file mode 100644
index 0000000..ca8b737
--- /dev/null
+++ b/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html
@@ -0,0 +1,12 @@
+<html>
+<head>
+<meta name="DC.creator" content="Doug Cutting">
+<meta name="DC.creator" content="Michael Cafarella">
+<!-- meta keywords in different casing -->
+<meta name="keywords" lang="en" content="web crawler" />
+<meta name="Keywords" lang="fr" content="robot d'indexation" />
+<meta name="KEYWORDS" lang="de" content="Webcrawler" />
+</head>
+<body>
+A test for multi-valued metatags.
+</body>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
new file mode 100644
index 0000000..f9b9722
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.metatags;
+
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Parse HTML meta tags (keywords, description) and store them in the parse
+ * metadata so that they can be indexed with the index-metadata plugin with the
+ * prefix 'metatag.'. Metatags are matched ignoring case.
+ *
+ * <p>
+ * The metatags to keep are read from the property <code>metatags.names</code>;
+ * the value '*' (the default) keeps all of them.
+ * </p>
+ */
+public class MetaTagsParser implements HtmlParseFilter {
+
+  private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
+      .getName());
+
+  private Configuration conf;
+
+  /** Lower-cased names of the metatags to keep; may contain '*'. */
+  private Set<String> metatagset = new HashSet<String>();
+
+  /**
+   * Stores the configuration and (re)builds the set of metatag names from the
+   * property <code>metatags.names</code>.
+   *
+   * @param conf
+   *          the configuration to read the property from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    // specify whether we want a specific subset of metadata
+    // by default take everything we can find
+    String[] values = conf.getStrings("metatags.names", "*");
+    // Build a fresh set so that names from an earlier configuration do not
+    // leak into this one.
+    Set<String> names = new HashSet<String>();
+    // defensive: do not fail if no array is returned
+    if (values != null) {
+      for (String val : values) {
+        // ignore whitespace around the separators ("description, keywords")
+        String name = val.trim().toLowerCase(Locale.ROOT);
+        if (name.length() > 0) {
+          names.add(name);
+        }
+      }
+    }
+    metatagset = names;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Check whether the metatag is in the list of metatags to be indexed (or if
+   * '*' is specified). If yes, add it to parse metadata.
+   *
+   * @param metadata
+   *          parse metadata to add the value to
+   * @param metatag
+   *          name of the metatag, in any case
+   * @param value
+   *          value of the metatag
+   */
+  private void addIndexedMetatags(Metadata metadata, String metatag,
+      String value) {
+    addIndexedMetatags(metadata, metatag, new String[] { value });
+  }
+
+  /**
+   * Check whether the metatag is in the list of metatags to be indexed (or if
+   * '*' is specified). If yes, add it with all values to parse metadata.
+   *
+   * @param metadata
+   *          parse metadata to add the values to
+   * @param metatag
+   *          name of the metatag, in any case; it is stored lower-cased and
+   *          prefixed with 'metatag.'
+   * @param values
+   *          values of the metatag
+   */
+  private void addIndexedMetatags(Metadata metadata, String metatag,
+      String[] values) {
+    String lcMetatag = metatag.toLowerCase(Locale.ROOT);
+    if (!metatagset.contains("*") && !metatagset.contains(lcMetatag)) {
+      return;
+    }
+    for (String value : values) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+      }
+      metadata.add("metatag." + lcMetatag, value);
+    }
+  }
+
+  /**
+   * Copies the selected metatags into the parse metadata of the parse stored
+   * under the content URL. Three sources are consulted in this order: the
+   * parse metadata itself, the general meta tags and the http-equiv tags.
+   *
+   * @param content
+   *          the fetched content; only its URL is used
+   * @param parseResult
+   *          parse result holding the parse to amend
+   * @param metaTags
+   *          meta tags extracted from the document
+   * @param doc
+   *          DOM of the document (not used)
+   * @return the same <code>parseResult</code>
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+    Metadata metadata = parse.getData().getParseMeta();
+
+    // check in the metadata first : the tika-parser
+    // might have stored the values there already
+    // NOTE(review): entries are added to metadata while looping over
+    // metadata.names(); this presumably relies on names() returning a
+    // snapshot - confirm.
+    for (String mdName : metadata.names()) {
+      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
+    }
+
+    Metadata generalMetaTags = metaTags.getGeneralTags();
+    for (String tagName : generalMetaTags.names()) {
+      addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName));
+    }
+
+    Properties httpequiv = metaTags.getHttpEquivTags();
+    for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
+        .hasMoreElements();) {
+      String name = (String) tagNames.nextElement();
+      String value = httpequiv.getProperty(name);
+      addIndexedMetatags(metadata, name, value);
+    }
+
+    return parseResult;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java
new file mode 100644
index 0000000..a55cf5c
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse filter to extract meta tags: keywords, description, etc.
+ * Used in combination with index-metadata plugin
+ * (see {@link org.apache.nutch.indexer.metadata}).
+ */
+package org.apache.nutch.parse.metatags;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
new file mode 100644
index 0000000..024aadf
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.metatags;
+
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Parses sample HTML files and checks the {@code metatag.*} entries found in
+ * the resulting parse metadata.
+ */
+public class TestMetatagParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // directory holding the sample files; falls back to the working directory
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testMetatags.html";
+  private String sampleFileMultival = "testMultivalueMetatags.html";
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+
+  /**
+   * Fetches a sample file via a {@code file:} URL, parses it and returns the
+   * parse metadata. Any exception is printed and fails the test.
+   *
+   * @param fileName
+   *          name of the sample file inside the sample directory
+   * @param conf
+   *          configuration used to create the protocol and the parser
+   * @return the parse metadata of the sample file
+   */
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    try {
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      return parse.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+    // only reached if Assert.fail() returned normally
+    return null;
+  }
+
+  /**
+   * Asserts that every expected value is among the values stored in the
+   * metadata under the given key.
+   */
+  private static void assertHasValues(Metadata parseMeta, String key,
+      String... expected) {
+    Set<String> actual = new TreeSet<String>();
+    for (String stored : parseMeta.getValues(key)) {
+      actual.add(stored);
+    }
+    for (String wanted : expected) {
+      Assert.assertTrue(
+          "One value of metatag with multiple values is missing: " + wanted,
+          actual.contains(wanted));
+    }
+  }
+
+  /** test defaults: keywords and description */
+  @Test
+  public void testIt() {
+    // check that we get the same values
+    Metadata parseMeta = parseMeta(sampleFile, NutchConfiguration.create());
+
+    Assert.assertEquals(description, parseMeta.get("metatag.description"));
+    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+
+  /** test multiple metatags resulting in metadata with multiple values */
+  @Test
+  public void testMultiValueMetatags() {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("metatags.names", "keywords,DC.creator");
+    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
+
+    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
+
+    assertHasValues(parseMeta, "metatag.dc.creator", "Doug Cutting",
+        "Michael Cafarella");
+    assertHasValues(parseMeta, "metatag.keywords", "robot d'indexation",
+        "web crawler", "Webcrawler");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/README.txt b/nutch-plugins/parse-replace/README.txt
new file mode 100644
index 0000000..a18bd9c
--- /dev/null
+++ b/nutch-plugins/parse-replace/README.txt
@@ -0,0 +1,91 @@
+ParseReplace plugin
+
+Allows post-parsing regexp replace manipulation of metadata fields.
+
+Configuration Example
+    <property>
+      <name>parse.replace.regexp</name>
+      <value>
+        id=/file:/http:/
+        url=/file:/http:/128
+      </value>
+    </property>
+
+Property format: parse.replace.regexp
+    The format of the property is a list of regexp replacements, one line per field being
+    modified.  Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
+
+    The fieldname precedes the equal sign.  The first character after the equal sign signifies
+    the delimiter for the regexp, the replacement value and the flags.
+
+Replacement Sequence
+    The replacements will happen in the order listed. If a field needs multiple replacement operations
+    they may be listed more than once.
+
+RegExp Format
+    The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
+    here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
+    Patterns are compiled when the plugin is initialized for efficiency.
+
+Replacement Format
+    The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement):
+    http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
+
+Flags
+    The flags is an integer sum of the flag values defined in
+    http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
+
+Escaping
+    Since the regexp is being read from a config file, any escaped values must be double
+    escaped.  Eg:  id=/\\s+//  will cause the escaped \s+ match pattern to be used.
+
+Multi-valued Fields
+    If a field has multiple values, the replacement will be applied to each value in turn.
+
+Non-string Datatypes
+    Replacement is possible only on String field datatypes.  If the field you name in the property is
+    not a String datatype, it will be silently ignored.
+
+Host and URL specific replacements.
+    If the replacements should apply only to specific pages, then add a sequence like
+
+    hostmatch=/host match pattern/
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+    or
+    urlmatch=/url match pattern/
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
+will apply to all parsed pages.  Replacements following a hostmatch or urlmatch will be applied
+to pages which match the host or url field (up to the next hostmatch or urlmatch line).  hostmatch
+and urlmatch patterns must be unique in this property.
+
+Plugin order
+    TBD... But in most cases you will want this plugin to run last.
+
+Testing your match patterns
+    Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
+    can help get the basics of your pattern working.
+    To test in nutch: 
+        Prepare a test HTML file with the field contents you want to test. 
+        Place this in a directory accessible to nutch.
+        Use the file:/// syntax to list the test file(s) in a test/urls seed list.
+        See the nutch faq "index my local file system" for conf settings you will need.
+        (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
+        test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
+        patterns also match the file: URL pattern)
+ 
+    Run..
+        bin/nutch inject crawl/crawldb test
+        bin/nutch generate crawl/crawldb crawl/segments
+        bin/nutch fetch crawl/segments/[segment]
+        bin/nutch parse crawl/segments/[segment]
+
+    To inspect the returned fields...
+        bin/nutch readseg -dump crawl/segments/[segment] testout
+        less testout/dump
+
+    To retry: delete crawl/segments/[segment]/crawl_parse and repeat the parse and dump step.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/build.xml b/nutch-plugins/parse-replace/build.xml
new file mode 100644
index 0000000..ca5ccf7
--- /dev/null
+++ b/nutch-plugins/parse-replace/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-replace" default="jar-core">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- Deploy Unit test dependencies -->
+	<target name="deps-test">
+		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+		<ant target="deploy" inheritall="false" dir="../protocol-file" />
+	</target>
+
+
+	<!-- for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.html" />
+		</fileset>
+	</copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/ivy.xml b/nutch-plugins/parse-replace/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-replace/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/plugin.xml b/nutch-plugins/parse-replace/plugin.xml
new file mode 100644
index 0000000..6368210
--- /dev/null
+++ b/nutch-plugins/parse-replace/plugin.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-replace"
+   name="ReplaceParser"
+   version="1.0"
+   provider-name="PeterCiuffetti">
+
+   <runtime>
+      <library name="parse-replace.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.replace.parser"
+              name="Replace Parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="ReplaceParser"
+                      class="org.apache.nutch.parse.replace.ReplaceParser"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/pom.xml b/nutch-plugins/parse-replace/pom.xml
new file mode 100644
index 0000000..073f895
--- /dev/null
+++ b/nutch-plugins/parse-replace/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-replace</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-replace</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/sample/testParseReplace.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/sample/testParseReplace.html b/nutch-plugins/parse-replace/sample/testParseReplace.html
new file mode 100644
index 0000000..825dcb9
--- /dev/null
+++ b/nutch-plugins/parse-replace/sample/testParseReplace.html
@@ -0,0 +1,11 @@
+<html>
+  <head>
+    <title>Testing the power of parser-replace plugin</title>
+    <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!">
+    <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!">
+    <meta name="author" content="Peter Ciuffetti">
+  </head>
+  <body>
+    <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. A decidely boring thing to do.</p>
+  </body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java
new file mode 100644
index 0000000..9773c4a
--- /dev/null
+++ b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.replace;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Do pattern replacements on selected field contents
+ * prior to indexing.
+ * <p>
+ * NOTE: this class is currently a skeleton. The configured value of
+ * {@code parse.replace.regexp} is read but {@link #parseConf(String[])} has
+ * an empty body, and
+ * {@link #filter(Content, ParseResult, HTMLMetaTags, DocumentFragment)}
+ * returns the parse result unchanged.
+ */
+public class ReplaceParser implements HtmlParseFilter {
+
+  private static final Log LOG = LogFactory.getLog(ReplaceParser.class
+      .getName());
+
+  // Not populated anywhere in this class yet.
+  // NOTE(review): presumably meant to hold the replacements of each
+  // hostmatch / urlmatch section of the property -- confirm once
+  // parseConf is implemented.
+  private static final Map<String, List<Object>> REPLACEPATTERNS_BY_HOST = new HashMap<String, List<Object>>();
+  private static final Map<String, List<Object>> REPLACEPATTERNS_BY_URL = new HashMap<String, List<Object>>();
+
+  private Configuration conf;
+
+  // currently not read or written by any method of this class
+  private Set<String> metatagset = new HashSet<String>();
+
+  /**
+   * Stores the configuration and, if the property
+   * {@code parse.replace.regexp} yields values, passes them to
+   * {@link #parseConf(String[])}.
+   *
+   * @param conf
+   *          configuration to read the property from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String[] values = conf.getStrings("parse.replace.regexp", null);
+    if (values != null) {
+      this.parseConf(values);
+    }
+  }
+
+  /**
+   * @return the configuration last passed to
+   *         {@link #setConf(Configuration)}
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Placeholder for parsing the replacement rules; does nothing yet.
+   *
+   * @param values
+   *          values of the property {@code parse.replace.regexp}
+   */
+  private void parseConf(String[] values) {
+
+  }
+
+  /**
+   * Looks up the parse for the URL of the content and returns the parse
+   * result as is; no replacement is applied yet.
+   *
+   * @param content
+   *          fetched content; only its URL is used
+   * @param parseResult
+   *          parse result to filter
+   * @param metaTags
+   *          not used
+   * @param doc
+   *          not used
+   * @return the {@code parseResult} instance that was passed in
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // looked up but not used further: replacement is not implemented yet
+    Parse parse = parseResult.get(content.getUrl());
+
+    return parseResult;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java
new file mode 100644
index 0000000..b678f00
--- /dev/null
+++ b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse filter to allow pattern replacements on parsed metadata.
+ */
+package org.apache.nutch.parse.replace;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java b/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
new file mode 100644
index 0000000..593d5ed
--- /dev/null
+++ b/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.replace;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Parses the parse-replace sample document and checks the
+ * {@code metatag.*} entries of the resulting parse metadata.
+ */
+public class TestParseReplace {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // directory holding the sample files; falls back to the working directory
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testParseReplace.html";
+  // Expected values are the literal contents of the description and keywords
+  // meta tags of testParseReplace.html: this test configures no
+  // parse.replace.regexp, so no replacement can have altered them.
+  private String description = "With this plugin, nutch is my bitch! Bwuhuhuhaha!";
+  private String keywords = "Awesome, Riveting, Two Thumbs Up!";
+
+  /**
+   * Fetches a sample file via a {@code file:} URL, parses it and returns the
+   * parse metadata. Any exception is printed and fails the test.
+   *
+   * @param fileName
+   *          name of the sample file inside the sample directory
+   * @param conf
+   *          configuration used to create the protocol and the parser
+   * @return the parse metadata of the sample file
+   */
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      metadata = parse.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+    return metadata;
+  }
+
+  /** test defaults: keywords and description */
+  @Test
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+
+    // check that we get the same values as in the sample file
+    // NOTE(review): assumes the active plugins store meta tags under
+    // "metatag.*" keys -- confirm against the test configuration
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+
+    Assert.assertEquals(description, parseMeta.get("metatag.description"));
+    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+}