You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:54 UTC

[10/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
deleted file mode 100644
index 5089a10..0000000
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import org.apache.nutch.parse.HTMLMetaTags;
-
-import java.io.ByteArrayInputStream;
-import java.net.URL;
-
-import org.cyberneko.html.parsers.*;
-import org.junit.Assert;
-import org.junit.Test;
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-
-/** Unit tests for HTMLMetaProcessor. */
-public class TestRobotsMetaProcessor {
-
-  /*
-   * 
-   * some sample tags:
-   * 
-   * <meta name="robots" content="index,follow"> <meta name="robots"
-   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
-   * <meta name="robots" content="noindex,nofollow">
-   * 
-   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
-   */
-
-  public static String[] tests = {
-      "<html><head><title>test page</title>"
-          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
-          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"all\"> "
-          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
-          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
-          + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"noindex,follow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"index,nofollow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"index,follow\"> "
-          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
-          + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
-          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
-          + " some text" + "</body></html>",
-
-  };
-
-  public static final boolean[][] answers = { { true, true, true }, // NONE
-      { false, false, true }, // all
-      { true, true, true }, // nOnE
-      { true, true, false }, // none
-      { true, true, false }, // noindex,nofollow
-      { true, false, false }, // noindex,follow
-      { false, true, false }, // index,nofollow
-      { false, false, false }, // index,follow
-      { false, false, false }, // missing!
-  };
-
-  private URL[][] currURLsAndAnswers;
-
-  @Test
-  public void testRobotsMetaProcessor() {
-    DOMFragmentParser parser = new DOMFragmentParser();
-    ;
-
-    try {
-      currURLsAndAnswers = new URL[][] {
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org/foo/"),
-              new URL("http://www.nutch.org/") },
-          { new URL("http://www.nutch.org"),
-              new URL("http://www.nutch.org/base/") } };
-    } catch (Exception e) {
-      Assert.assertTrue("couldn't make test URLs!", false);
-    }
-
-    for (int i = 0; i < tests.length; i++) {
-      byte[] bytes = tests[i].getBytes();
-
-      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
-
-      try {
-        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
-      } catch (Exception e) {
-        e.printStackTrace();
-      }
-
-      HTMLMetaTags robotsMeta = new HTMLMetaTags();
-      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
-
-      Assert.assertTrue("got index wrong on test " + i,
-          robotsMeta.getNoIndex() == answers[i][0]);
-      Assert.assertTrue("got follow wrong on test " + i,
-          robotsMeta.getNoFollow() == answers[i][1]);
-      Assert.assertTrue("got cache wrong on test " + i,
-          robotsMeta.getNoCache() == answers[i][2]);
-      Assert
-          .assertTrue(
-              "got base href wrong on test " + i + " (got "
-                  + robotsMeta.getBaseHref() + ")",
-              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
-                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
-                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
-
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/build.xml b/src/plugin/parse-js/build.xml
deleted file mode 100644
index d9c2146..0000000
--- a/src/plugin/parse-js/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-js" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/ivy.xml b/src/plugin/parse-js/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-js/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/plugin.xml b/src/plugin/parse-js/plugin.xml
deleted file mode 100644
index 9c06c2a..0000000
--- a/src/plugin/parse-js/plugin.xml
+++ /dev/null
@@ -1,53 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="parse-js"
-   name="JavaScript Parser"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="parse-js.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.parse.js"
-              name="JS Parser"
-              point="org.apache.nutch.parse.Parser">
-      <implementation id="JSParser"
-         class="org.apache.nutch.parse.js.JSParseFilter">
-        <parameter name="contentType" value="application/x-javascript"/>
-        <parameter name="pathSuffix"  value="js"/>
-      </implementation>
-   </extension>
-   <extension id="org.apache.nutch.parse.js.JSParseFilter"
-              name="Parse JS Filter"
-              point="org.apache.nutch.parse.HtmlParseFilter">
-      <implementation id="JSParseFilter"
-         class="org.apache.nutch.parse.js.JSParseFilter">
-        <parameter name="contentType" value="application/x-javascript"/>
-        <parameter name="pathSuffix"  value=""/>
-      </implementation>
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
deleted file mode 100644
index 8c95372..0000000
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.js;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseText;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.w3c.dom.DocumentFragment;
-import org.w3c.dom.Element;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-/**
- * This class is a heuristic link extractor for JavaScript files and code
- * snippets. The general idea of a two-pass regex matching comes from Heritrix.
- * Parts of the code come from OutlinkExtractor.java
- */
-public class JSParseFilter implements HtmlParseFilter, Parser {
-  public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);
-
-  private static final int MAX_TITLE_LEN = 80;
-
-  private Configuration conf;
-
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-
-    Parse parse = parseResult.get(content.getUrl());
-
-    String url = content.getBaseUrl();
-    ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
-    walk(doc, parse, metaTags, url, outlinks);
-    if (outlinks.size() > 0) {
-      Outlink[] old = parse.getData().getOutlinks();
-      String title = parse.getData().getTitle();
-      List<Outlink> list = Arrays.asList(old);
-      outlinks.addAll(list);
-      ParseStatus status = parse.getData().getStatus();
-      String text = parse.getText();
-      Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks
-          .size()]);
-      ParseData parseData = new ParseData(status, title, newlinks, parse
-          .getData().getContentMeta(), parse.getData().getParseMeta());
-
-      // replace original parse obj with new one
-      parseResult.put(content.getUrl(), new ParseText(text), parseData);
-    }
-    return parseResult;
-  }
-
-  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
-      List<Outlink> outlinks) {
-    if (n instanceof Element) {
-      String name = n.getNodeName();
-      if (name.equalsIgnoreCase("script")) {
-        /*
-         * String lang = null; Node lNode =
-         * n.getAttributes().getNamedItem("language"); if (lNode == null) lang =
-         * "javascript"; else lang = lNode.getNodeValue();
-         */
-        StringBuffer script = new StringBuffer();
-        NodeList nn = n.getChildNodes();
-        if (nn.getLength() > 0) {
-          for (int i = 0; i < nn.getLength(); i++) {
-            if (i > 0)
-              script.append('\n');
-            script.append(nn.item(i).getNodeValue());
-          }
-          // if (LOG.isInfoEnabled()) {
-          // LOG.info("script: language=" + lang + ", text: " +
-          // script.toString());
-          // }
-          Outlink[] links = getJSLinks(script.toString(), "", base);
-          if (links != null && links.length > 0)
-            outlinks.addAll(Arrays.asList(links));
-          // no other children of interest here, go one level up.
-          return;
-        }
-      } else {
-        // process all HTML 4.0 events, if present...
-        NamedNodeMap attrs = n.getAttributes();
-        int len = attrs.getLength();
-        for (int i = 0; i < len; i++) {
-          // Window: onload,onunload
-          // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
-          // Keyboard: onkeydown,onkeypress,onkeyup
-          // Mouse:
-          // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
-          Node anode = attrs.item(i);
-          Outlink[] links = null;
-          if (anode.getNodeName().startsWith("on")) {
-            links = getJSLinks(anode.getNodeValue(), "", base);
-          } else if (anode.getNodeName().equalsIgnoreCase("href")) {
-            String val = anode.getNodeValue();
-            if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
-              links = getJSLinks(val, "", base);
-            }
-          }
-          if (links != null && links.length > 0)
-            outlinks.addAll(Arrays.asList(links));
-        }
-      }
-    }
-    NodeList nl = n.getChildNodes();
-    for (int i = 0; i < nl.getLength(); i++) {
-      walk(nl.item(i), parse, metaTags, base, outlinks);
-    }
-  }
-
-  public ParseResult getParse(Content c) {
-    String type = c.getContentType();
-    if (type != null && !type.trim().equals("")
-        && !type.toLowerCase().startsWith("application/x-javascript"))
-      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
-          "Content not JavaScript: '" + type + "'").getEmptyParseResult(
-          c.getUrl(), getConf());
-    String script = new String(c.getContent());
-    Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
-    if (outlinks == null)
-      outlinks = new Outlink[0];
-    // Title? use the first line of the script...
-    String title;
-    int idx = script.indexOf('\n');
-    if (idx != -1) {
-      if (idx > MAX_TITLE_LEN)
-        idx = MAX_TITLE_LEN;
-      title = script.substring(0, idx);
-    } else {
-      idx = Math.min(MAX_TITLE_LEN, script.length());
-      title = script.substring(0, idx);
-    }
-    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
-        c.getMetadata());
-    return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
-  }
-
-  private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
-  // A simple pattern. This allows also invalid URL characters.
-  private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
-
-  // Alternative pattern, which limits valid url characters.
-  // private static final String URI_PATTERN =
-  // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
-
-  /**
-   * This method extracts URLs from literals embedded in JavaScript.
-   */
-  private Outlink[] getJSLinks(String plainText, String anchor, String base) {
-
-    final List<Outlink> outlinks = new ArrayList<Outlink>();
-    URL baseURL = null;
-
-    try {
-      baseURL = new URL(base);
-    } catch (Exception e) {
-      if (LOG.isErrorEnabled()) {
-        LOG.error("getJSLinks", e);
-      }
-    }
-
-    try {
-      final PatternCompiler cp = new Perl5Compiler();
-      final Pattern pattern = cp.compile(STRING_PATTERN,
-          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-              | Perl5Compiler.MULTILINE_MASK);
-      final Pattern pattern1 = cp.compile(URI_PATTERN,
-          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-              | Perl5Compiler.MULTILINE_MASK);
-      final PatternMatcher matcher = new Perl5Matcher();
-
-      final PatternMatcher matcher1 = new Perl5Matcher();
-      final PatternMatcherInput input = new PatternMatcherInput(plainText);
-
-      MatchResult result;
-      String url;
-
-      // loop the matches
-      while (matcher.contains(input, pattern)) {
-        result = matcher.getMatch();
-        url = result.group(2);
-        PatternMatcherInput input1 = new PatternMatcherInput(url);
-        if (!matcher1.matches(input1, pattern1)) {
-          // if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'");
-          // }
-          continue;
-        }
-        if (url.startsWith("www.")) {
-          url = "http://" + url;
-        } else {
-          // See if candidate URL is parseable. If not, pass and move on to
-          // the next match.
-          try {
-            url = new URL(baseURL, url).toString();
-          } catch (MalformedURLException ex) {
-            if (LOG.isTraceEnabled()) {
-              LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
-                  + baseURL + "'", ex);
-            }
-            continue;
-          }
-        }
-        url = url.replaceAll("&amp;", "&");
-        if (LOG.isTraceEnabled()) {
-          LOG.trace(" - outlink from JS: '" + url + "'");
-        }
-        outlinks.add(new Outlink(url, anchor));
-      }
-    } catch (Exception ex) {
-      // if it is a malformed URL we just throw it away and continue with
-      // extraction.
-      if (LOG.isErrorEnabled()) {
-        LOG.error("getJSLinks", ex);
-      }
-    }
-
-    final Outlink[] retval;
-
-    // create array of the Outlinks
-    if (outlinks != null && outlinks.size() > 0) {
-      retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
-    } else {
-      retval = new Outlink[0];
-    }
-
-    return retval;
-  }
-
-  public static void main(String[] args) throws Exception {
-    if (args.length < 2) {
-      System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
-      return;
-    }
-    InputStream in = new FileInputStream(args[0]);
-    BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
-    StringBuffer sb = new StringBuffer();
-    String line = null;
-    while ((line = br.readLine()) != null)
-      sb.append(line + "\n");
-    br.close();
-
-    JSParseFilter parseFilter = new JSParseFilter();
-    parseFilter.setConf(NutchConfiguration.create());
-    Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
-    System.out.println("Outlinks extracted: " + links.length);
-    for (int i = 0; i < links.length; i++)
-      System.out.println(" - " + links[i]);
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
deleted file mode 100644
index 36d0d14..0000000
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parser and parse filter plugin to extract all (possible) links
- * from JavaScript files and embedded JavaScript code snippets.
- */
-package org.apache.nutch.parse.js;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/README.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/README.txt b/src/plugin/parse-metatags/README.txt
deleted file mode 100644
index 0d5b009..0000000
--- a/src/plugin/parse-metatags/README.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-Parse-metatags plugin
-
-The parse-metatags plugin consists of a HTMLParserFilter which takes as parameter a list of metatag names with '*' as default value. The values are separated by ';'.
-In order to extract the values of the metatags description and keywords, you must specify in nutch-site.xml
-
-<property>
-  <name>metatags.names</name>
-  <value>description;keywords</value>
-</property>
-
-Prefixes the names with 'metatag.' in the parse-metadata. For instance to index description and keywords, you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description;metatag.keywords'.
-  
-This code has been developed by DigitalPebble Ltd and offered to the community by ANT.com
-
-
-
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/build.xml b/src/plugin/parse-metatags/build.xml
deleted file mode 100644
index e30292d..0000000
--- a/src/plugin/parse-metatags/build.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-metatags" default="jar-core">
-
-	<import file="../build-plugin.xml" />
-
-	<!-- Deploy Unit test dependencies -->
-	<target name="deps-test">
-		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
-		<ant target="deploy" inheritall="false" dir="../protocol-file" />
-	</target>
-
-
-	<!-- for junit test -->
-	<mkdir dir="${build.test}/data" />
-	<copy todir="${build.test}/data">
-		<fileset dir="sample">
-			<include name="*.html" />
-		</fileset>
-	</copy>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/ivy.xml b/src/plugin/parse-metatags/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-metatags/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/plugin.xml b/src/plugin/parse-metatags/plugin.xml
deleted file mode 100644
index 07933fa..0000000
--- a/src/plugin/parse-metatags/plugin.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<plugin
-   id="parse-metatags"
-   name="MetaTags"
-   version="1.0"
-   provider-name="digitalpebble.com">
-
-   <runtime>
-      <library name="parse-metatags.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <extension id="org.apache.nutch.parse.metatags.parser"
-              name="MetaTags Parser"
-              point="org.apache.nutch.parse.HtmlParseFilter">
-      <implementation id="MetaTagsParser"
-                      class="org.apache.nutch.parse.metatags.MetaTagsParser"/>
-   </extension>
-
-</plugin>
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/sample/testMetatags.html
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/sample/testMetatags.html b/src/plugin/parse-metatags/sample/testMetatags.html
deleted file mode 100644
index e9e8e6b..0000000
--- a/src/plugin/parse-metatags/sample/testMetatags.html
+++ /dev/null
@@ -1,9 +0,0 @@
-<html>
-<head>
-<meta name="Keywords" content="This is a test of keywords" />
-<meta name="Description" content="This is a test of description" />
-</head>
-<body>
-text of the document
-</body>
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
deleted file mode 100644
index ca8b737..0000000
--- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
+++ /dev/null
@@ -1,12 +0,0 @@
-<html>
-<head>
-<meta name="DC.creator" content="Doug Cutting">
-<meta name="DC.creator" content="Michael Cafarella">
-<!-- meta keywords in different casing -->
-<meta name="keywords" lang="en" content="web crawler" />
-<meta name="Keywords" lang="fr" content="robot d'indexation" />
-<meta name="KEYWORDS" lang="de" content="Webcrawler" />
-</head>
-<body>
-A test for multi-valued metatags.
-</body>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
deleted file mode 100644
index f9b9722..0000000
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.metatags;
-
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Properties;
-import java.util.Set;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.w3c.dom.DocumentFragment;
-
-/**
- * Parse HTML meta tags (keywords, description) and store them in the parse
- * metadata so that they can be indexed with the index-metadata plugin with the
- * prefix 'metatag.'. Metatags are matched ignoring case.
- */
-public class MetaTagsParser implements HtmlParseFilter {
-
-  private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
-      .getName());
-
-  private Configuration conf;
-
-  private Set<String> metatagset = new HashSet<String>();
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    // specify whether we want a specific subset of metadata
-    // by default take everything we can find
-    String[] values = conf.getStrings("metatags.names", "*");
-    for (String val : values) {
-      metatagset.add(val.toLowerCase(Locale.ROOT));
-    }
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Check whether the metatag is in the list of metatags to be indexed (or if
-   * '*' is specified). If yes, add it to parse metadata.
-   */
-  private void addIndexedMetatags(Metadata metadata, String metatag,
-      String value) {
-    String lcMetatag = metatag.toLowerCase(Locale.ROOT);
-    if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
-      }
-      metadata.add("metatag." + lcMetatag, value);
-    }
-  }
-
-  /**
-   * Check whether the metatag is in the list of metatags to be indexed (or if
-   * '*' is specified). If yes, add it with all values to parse metadata.
-   */
-  private void addIndexedMetatags(Metadata metadata, String metatag,
-      String[] values) {
-    String lcMetatag = metatag.toLowerCase(Locale.ROOT);
-    if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
-      for (String value : values) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
-        }
-        metadata.add("metatag." + lcMetatag, value);
-      }
-    }
-  }
-
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-
-    Parse parse = parseResult.get(content.getUrl());
-    Metadata metadata = parse.getData().getParseMeta();
-
-    // check in the metadata first : the tika-parser
-    // might have stored the values there already
-    for (String mdName : metadata.names()) {
-      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
-    }
-
-    Metadata generalMetaTags = metaTags.getGeneralTags();
-    for (String tagName : generalMetaTags.names()) {
-      addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName));
-    }
-
-    Properties httpequiv = metaTags.getHttpEquivTags();
-    for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
-        .hasMoreElements();) {
-      String name = (String) tagNames.nextElement();
-      String value = httpequiv.getProperty(name);
-      addIndexedMetatags(metadata, name, value);
-    }
-
-    return parseResult;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
deleted file mode 100644
index a55cf5c..0000000
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse filter to extract meta tags: keywords, description, etc.
- * Used in combination with index-metadata plugin
- * (see {@link org.apache.nutch.indexer.metadata}).
- */
-package org.apache.nutch.parse.metatags;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
deleted file mode 100644
index 024aadf..0000000
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.metatags;
-
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Tests that the meta tags of the sample HTML files end up in the parse
- * metadata under the <code>metatag.</code> prefix.
- */
-public class TestMetatagParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  private String sampleDir = System.getProperty("test.data", ".");
-  private String sampleFile = "testMetatags.html";
-  private String sampleFileMultival = "testMultivalueMetatags.html";
-  private String description = "This is a test of description";
-  private String keywords = "This is a test of keywords";
-
-  /**
-   * Fetch a sample file via the protocol obtained from
-   * {@link ProtocolFactory}, parse it with {@link ParseUtil} and return the
-   * parse metadata of the document.
-   *
-   * @param fileName
-   *          name of the sample file below the <code>test.data</code> directory
-   * @param conf
-   *          configuration used for fetching and parsing
-   * @return the parse metadata of the sample file
-   * @throws AssertionError
-   *           if fetching or parsing throws; the original exception is kept
-   *           as the cause
-   */
-  public Metadata parseMeta(String fileName, Configuration conf) {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    try {
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-      return parse.getData().getParseMeta();
-    } catch (Exception e) {
-      // fail the test, but keep the full stack trace of the root cause in the
-      // test report instead of printing it to stderr
-      AssertionError failure = new AssertionError(e.toString());
-      failure.initCause(e);
-      throw failure;
-    }
-  }
-
-  /**
-   * Assert that every expected value is among the values stored under
-   * <code>key</code> in the parse metadata.
-   */
-  private void assertHasValues(Metadata parseMeta, String key,
-      String... expectedValues) {
-    String failMessage = "One value of metatag with multiple values is missing: ";
-    Set<String> valueSet = new TreeSet<String>();
-    for (String val : parseMeta.getValues(key)) {
-      valueSet.add(val);
-    }
-    for (String val : expectedValues) {
-      Assert.assertTrue(failMessage + val, valueSet.contains(val));
-    }
-  }
-
-  /** test defaults: keywords and description */
-  @Test
-  public void testIt() {
-    Configuration conf = NutchConfiguration.create();
-
-    // check that we get the same values
-    Metadata parseMeta = parseMeta(sampleFile, conf);
-
-    Assert.assertEquals(description, parseMeta.get("metatag.description"));
-    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
-  }
-
-  /** test multiple metatags resulting in metadata with multiple values */
-  @Test
-  public void testMultiValueMetatags() {
-    Configuration conf = NutchConfiguration.create();
-    conf.set("metatags.names", "keywords,DC.creator");
-    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
-
-    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
-
-    assertHasValues(parseMeta, "metatag.dc.creator", "Doug Cutting",
-        "Michael Cafarella");
-    // the three "keywords" tags of the sample differ only in casing
-    assertHasValues(parseMeta, "metatag.keywords", "robot d'indexation",
-        "web crawler", "Webcrawler");
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/README.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/README.txt b/src/plugin/parse-replace/README.txt
deleted file mode 100644
index a18bd9c..0000000
--- a/src/plugin/parse-replace/README.txt
+++ /dev/null
@@ -1,91 +0,0 @@
-ParseReplace plugin
-
-Allows post-parsing regexp replace manipulation of metadata fields.
-
-Configuration Example
-    <property>
-      <name>parse.replace.regexp</name>
-      <value>
-        id=/file:/http:/
-        url=/file:/http:/128
-      </value>
-    </property>
-
-Property format: parse.replace.regexp
-    The format of the property is a list of regexp replacements, one line per field being
-    modified.  Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
-
-    The fieldname precedes the equal sign.  The first character after the equal sign signifies
-    the delimiter for the regexp, the replacement value and the flags.
-
-Replacement Sequence
-    The replacements will happen in the order listed. If a field needs multiple replacement operations
-    they may be listed more than once.
-
-RegExp Format
-    The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
-    here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
-    Patterns are compiled when the plugin is initialized for efficiency.
-
-Replacement Format
-    The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement):
-    http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
-
-Flags
-    The flags is an integer sum of the flag values defined in
-    http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
-
-Escaping
-    Since the regexp is being read from a config file, any escaped values must be double
-    escaped.  Eg:  id=/\\s+//  will cause the escaped \s+ match pattern to be used.
-
-Multi-valued Fields
-    If a field has multiple values, the replacement will be applied to each value in turn.
-
-Non-string Datatypes
-    Replacement is possible only on String field datatypes.  If the field you name in the property is
-    not a String datatype, it will be silently ignored.
-
-Host and URL specific replacements.
-    If the replacements should apply only to specific pages, then add a sequence like
-
-    hostmatch=/host match pattern/
-    fld1=/regexp/replace/flags
-    fld2=/regexp/replace/flags
-
-    or
-    urlmatch=/url match pattern/
-    fld1=/regexp/replace/flags
-    fld2=/regexp/replace/flags
-
-When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
-will apply to all parsed pages.  Replacements following a hostmatch or urlmatch will be applied
-to pages which match the host or url field (up to the next hostmatch or urlmatch line).  hostmatch
-and urlmatch patterns must be unique in this property.
-
-Plugin order
-    TBD... But in most cases you will want this plugin to run last.
-
-Testing your match patterns
-    Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
-    can help get the basics of your pattern working.
-    To test in nutch: 
-        Prepare a test HTML file with the field contents you want to test. 
-        Place this in a directory accessible to nutch.
-        Use the file:/// syntax to list the test file(s) in a test/urls seed list.
-        See the nutch faq "index my local file system" for conf settings you will need.
-        (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
-        test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
-        patterns also match the file: URL pattern)
- 
-    Run..
-        bin/nutch inject crawl/crawldb test
-        bin/nutch generate crawl/crawldb crawl/segments
-        bin/nutch fetch crawl/segments/[segment]
-        bin/nutch parse crawl/segments/[segment]
-
-    To inspect the returned fields...
-        bin/nutch readseg -dump crawl/segments/[segment] testout
-        less testout/dump
-
-    To retry: delete crawl/segments/[segment]/crawl_parse and repeat the parse and dump step.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/build.xml b/src/plugin/parse-replace/build.xml
deleted file mode 100644
index ca5ccf7..0000000
--- a/src/plugin/parse-replace/build.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-replace" default="jar-core">
-
-	<import file="../build-plugin.xml" />
-
-	<!-- Deploy Unit test dependencies -->
-	<target name="deps-test">
-		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
-		<ant target="deploy" inheritall="false" dir="../protocol-file" />
-	</target>
-
-
-	<!-- for junit test -->
-	<mkdir dir="${build.test}/data" />
-	<copy todir="${build.test}/data">
-		<fileset dir="sample">
-			<include name="*.html" />
-		</fileset>
-	</copy>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/ivy.xml b/src/plugin/parse-replace/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-replace/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/plugin.xml b/src/plugin/parse-replace/plugin.xml
deleted file mode 100644
index 6368210..0000000
--- a/src/plugin/parse-replace/plugin.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<plugin
-   id="parse-replace"
-   name="ReplaceParser"
-   version="1.0"
-   provider-name="PeterCiuffetti">
-
-   <runtime>
-      <library name="parse-replace.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <extension id="org.apache.nutch.parse.replace.parser"
-              name="Replace Parser"
-              point="org.apache.nutch.parse.HtmlParseFilter">
-      <implementation id="ReplaceParser"
-                      class="org.apache.nutch.parse.replace.ReplaceParser"/>
-   </extension>
-
-</plugin>
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/sample/testParseReplace.html
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/sample/testParseReplace.html b/src/plugin/parse-replace/sample/testParseReplace.html
deleted file mode 100644
index 825dcb9..0000000
--- a/src/plugin/parse-replace/sample/testParseReplace.html
+++ /dev/null
@@ -1,11 +0,0 @@
-<html>
-  <head>
-    <title>Testing the power of parser-replace plugin</title>
-    <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!">
-    <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!">
-    <meta name="author" content="Peter Ciuffetti">
-  </head>
-  <body>
-    <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. A decidely boring thing to do.</p>
-  </body>
-</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java b/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java
deleted file mode 100644
index 9773c4a..0000000
--- a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.replace;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.w3c.dom.DocumentFragment;
-
-/**
- * Do pattern replacements on selected field contents
- * prior to indexing.
- * <p>
- * NOTE: this class is still a skeleton. The <code>parse.replace.regexp</code>
- * property is read in {@link #setConf(Configuration)}, but
- * <code>parseConf</code> does not interpret it yet and the filter returns the
- * parse result unchanged.
- */
-public class ReplaceParser implements HtmlParseFilter {
-
-  private static final Log LOG = LogFactory.getLog(ReplaceParser.class
-      .getName());
-
-  /** Replacement rules keyed by host pattern; not populated yet. */
-  private static final Map<String, List<Object>> REPLACEPATTERNS_BY_HOST = new HashMap<String, List<Object>>();
-
-  /** Replacement rules keyed by URL pattern; not populated yet. */
-  private static final Map<String, List<Object>> REPLACEPATTERNS_BY_URL = new HashMap<String, List<Object>>();
-
-  private Configuration conf;
-
-  /**
-   * Store the configuration and hand the values of the
-   * <code>parse.replace.regexp</code> property, if any, to
-   * <code>parseConf</code>.
-   *
-   * @param conf
-   *          configuration to read <code>parse.replace.regexp</code> from
-   */
-  @Override
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    // the cast makes explicit that "no default" is a null array, not a
-    // one-element array holding null
-    String[] values = conf.getStrings("parse.replace.regexp", (String[]) null);
-    if (values != null) {
-      this.parseConf(values);
-    }
-  }
-
-  @Override
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Placeholder for turning the configured replacement lines into rules.
-   * Currently does nothing.
-   *
-   * @param values
-   *          the values of the <code>parse.replace.regexp</code> property
-   */
-  private void parseConf(String[] values) {
-    // TODO not implemented yet
-  }
-
-  /**
-   * Currently a pass-through: no replacement is applied.
-   *
-   * @return the <code>parseResult</code> instance that was passed in
-   */
-  @Override
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-    return parseResult;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java b/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java
deleted file mode 100644
index b678f00..0000000
--- a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse filter to allow pattern replacements on parsed metadata.
- */
-package org.apache.nutch.parse.replace;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java b/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
deleted file mode 100644
index 593d5ed..0000000
--- a/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.replace;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Parses the <code>testParseReplace.html</code> sample and checks the meta
- * tag values found in the parse metadata.
- */
-public class TestParseReplace {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  private String sampleDir = System.getProperty("test.data", ".");
-  private String sampleFile = "testParseReplace.html";
-  // expected values are the meta tag contents of sample/testParseReplace.html
-  private String description = "With this plugin, nutch is my bitch! Bwuhuhuhaha!";
-  private String keywords = "Awesome, Riveting, Two Thumbs Up!";
-
-  /**
-   * Fetch a sample file via the protocol obtained from
-   * {@link ProtocolFactory}, parse it with {@link ParseUtil} and return the
-   * parse metadata of the document.
-   *
-   * @param fileName
-   *          name of the sample file below the <code>test.data</code> directory
-   * @param conf
-   *          configuration used for fetching and parsing
-   * @return the parse metadata of the sample file
-   * @throws AssertionError
-   *           if fetching or parsing throws; the original exception is kept
-   *           as the cause
-   */
-  public Metadata parseMeta(String fileName, Configuration conf) {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    try {
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-      return parse.getData().getParseMeta();
-    } catch (Exception e) {
-      // fail the test, but keep the full stack trace of the root cause in the
-      // test report instead of printing it to stderr
-      AssertionError failure = new AssertionError(e.toString());
-      failure.initCause(e);
-      throw failure;
-    }
-  }
-
-  /** test defaults: keywords and description */
-  @Test
-  public void testIt() {
-    Configuration conf = NutchConfiguration.create();
-
-    // check that we get the same values
-    Metadata parseMeta = parseMeta(sampleFile, conf);
-
-    // NOTE(review): the metatag.* keys are presumably written by the
-    // parse-metatags plugin, which would then have to be active for this
-    // test - confirm against the plugin configuration used by the test run
-    Assert.assertEquals(description, parseMeta.get("metatag.description"));
-    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/build.xml b/src/plugin/parse-swf/build.xml
deleted file mode 100644
index f4fb20f..0000000
--- a/src/plugin/parse-swf/build.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-swf" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
-  </target>
-
-
-  <!-- for junit test -->
-  <mkdir dir="${build.test}/data"/>
-  <copy file="sample/test1.swf" todir="${build.test}/data"/>
-  <copy file="sample/test2.swf" todir="${build.test}/data"/>
-  <copy file="sample/test3.swf" todir="${build.test}/data"/>
-  <copy file="sample/test1.txt" todir="${build.test}/data"/>
-  <copy file="sample/test2.txt" todir="${build.test}/data"/>
-  <copy file="sample/test3.txt" todir="${build.test}/data"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/ivy.xml b/src/plugin/parse-swf/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-swf/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/lib/javaswf-LICENSE.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt b/src/plugin/parse-swf/lib/javaswf-LICENSE.txt
deleted file mode 100644
index 4138a66..0000000
--- a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-
-  Copyright (c) 2001-2005, David N. Main, All rights reserved.
-  
-  Redistribution and use in source and binary forms, with or
-  without modification, are permitted provided that the 
-  following conditions are met:
- 
-  1. Redistributions of source code must retain the above 
-  copyright notice, this list of conditions and the following 
-  disclaimer. 
-  
-  2. Redistributions in binary form must reproduce the above 
-  copyright notice, this list of conditions and the following 
-  disclaimer in the documentation and/or other materials 
-  provided with the distribution.
-  
-  3. The name of the author may not be used to endorse or 
-  promote products derived from this software without specific 
-  prior written permission. 
-  
-  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY 
-  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
-  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
-  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
-  AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 
-  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
-  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
-  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
-  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
-  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 
-  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/lib/javaswf.jar
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/lib/javaswf.jar b/src/plugin/parse-swf/lib/javaswf.jar
deleted file mode 100644
index 78f9b0b..0000000
Binary files a/src/plugin/parse-swf/lib/javaswf.jar and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/plugin.xml b/src/plugin/parse-swf/plugin.xml
deleted file mode 100644
index 8cc72c0..0000000
--- a/src/plugin/parse-swf/plugin.xml
+++ /dev/null
@@ -1,44 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="parse-swf"
-   name="SWF Parse Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="parse-swf.jar">
-         <export name="*"/>
-      </library>
-      <library name="javaswf.jar"/>
-   </runtime>
-
-   <extension id="org.apache.nutch.parse.swf"
-              name="SWFParse"
-              point="org.apache.nutch.parse.Parser">
-
-      <implementation id="org.apache.nutch.parse.swf.SWFParser"
-                      class="org.apache.nutch.parse.swf.SWFParser">
-        <parameter name="contentType" value="application/x-shockwave-flash"/>
-        <parameter name="pathSuffix"  value="swf"/>
-      </implementation>
-      
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test1.swf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test1.swf b/src/plugin/parse-swf/sample/test1.swf
deleted file mode 100644
index cd2019b..0000000
Binary files a/src/plugin/parse-swf/sample/test1.swf and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test1.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test1.txt b/src/plugin/parse-swf/sample/test1.txt
deleted file mode 100644
index 68505d5..0000000
--- a/src/plugin/parse-swf/sample/test1.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-
---------
-/go/gnav_cart
-/go/gnav_company
-/go/gnav_devnet
-/go/gnav_downloads
-/go/gnav_fl_minmessage
-/go/gnav_help
-/go/gnav_mm_home
-/go/gnav_products
-/go/gnav_search?loc=en_us
-/go/gnav_showcase
-/go/gnav_solutions
-/go/gnav_store
-/go/gnav_support
-/go/gnav_your_account
-Acquisition Info
-Adobe Home
-AppleGothic
-Array
-Company
-Developers
-Downloads
-Help
-Home
-International
-LocaleManager
-Macromedia Flash Player
-Macromedia Home
-MovieClip
-Products
-Showcase
-Solutions
-Store
-String
-Support
-TextFormat
-To ensure the best possible Internet Experience, please download the latest version of the free
-Verdana
-_sans
-active
-bluePill
-button
-color
-company
-devnet
-downloads
-en_us
-home
-javascript:openCrosslinkWindow('/go/adobeacquisition')
-javascript:openCrosslinkWindow('/go/gnav_adobe_home')
-products
-rollOut
-rollOver
-selected
-showcase
-solutions
-support
-tabHolder
-textColor

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test2.swf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test2.swf b/src/plugin/parse-swf/sample/test2.swf
deleted file mode 100644
index eb9b03d..0000000
Binary files a/src/plugin/parse-swf/sample/test2.swf and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test2.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test2.txt b/src/plugin/parse-swf/sample/test2.txt
deleted file mode 100644
index f77b78a..0000000
--- a/src/plugin/parse-swf/sample/test2.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-Impact Impact Impact  Arial Arial Arial  Webdings Webdings Webdings  Verdana Verdana Verdana  CourierNew CourierNew CourierNew  Bimini Bimini Bimini 
---------
-TextFormat
-color
-font

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test3.swf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test3.swf b/src/plugin/parse-swf/sample/test3.swf
deleted file mode 100644
index 4df9f1e..0000000
Binary files a/src/plugin/parse-swf/sample/test3.swf and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test3.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test3.txt b/src/plugin/parse-swf/sample/test3.txt
deleted file mode 100644
index 66ae3d8..0000000
--- a/src/plugin/parse-swf/sample/test3.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-Mix. 
- Edit. 
- Master. 
- Compose. 
- Animate. 
- With a single suite of powerful tools 
- that work together as one. 
- World-class video and audio tools that bring  
- new power and efficiency to your film, video,  
- DVD, and web workflows. 
- Learn more.