You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:54 UTC
[10/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
deleted file mode 100644
index 5089a10..0000000
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.html;
-
-import org.apache.nutch.parse.HTMLMetaTags;
-
-import java.io.ByteArrayInputStream;
-import java.net.URL;
-
-import org.cyberneko.html.parsers.*;
-import org.junit.Assert;
-import org.junit.Test;
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-
-/** Unit tests for HTMLMetaProcessor. */
-public class TestRobotsMetaProcessor {
-
- /*
- *
- * some sample tags:
- *
- * <meta name="robots" content="index,follow"> <meta name="robots"
- * content="noindex,follow"> <meta name="robots" content="index,nofollow">
- * <meta name="robots" content="noindex,nofollow">
- *
- * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
- */
-
- public static String[] tests = {
- "<html><head><title>test page</title>"
- + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
- + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
- + "</head><body>" + " some text" + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"all\"> "
- + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
- + "</head><body>" + " some text" + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
- + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
- + "</head><body>" + " some text" + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
- + " some text" + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
- + "</head><body>" + " some text" + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"noindex,follow\"> "
- + "</head><body>" + " some text" + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"index,nofollow\"> "
- + "</head><body>" + " some text" + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"index,follow\"> "
- + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
- + " some text" + "</body></html>",
-
- "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
- + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
- + " some text" + "</body></html>",
-
- };
-
- public static final boolean[][] answers = { { true, true, true }, // NONE
- { false, false, true }, // all
- { true, true, true }, // nOnE
- { true, true, false }, // none
- { true, true, false }, // noindex,nofollow
- { true, false, false }, // noindex,follow
- { false, true, false }, // index,nofollow
- { false, false, false }, // index,follow
- { false, false, false }, // missing!
- };
-
- private URL[][] currURLsAndAnswers;
-
- @Test
- public void testRobotsMetaProcessor() {
- DOMFragmentParser parser = new DOMFragmentParser();
- ;
-
- try {
- currURLsAndAnswers = new URL[][] {
- { new URL("http://www.nutch.org"), null },
- { new URL("http://www.nutch.org"), null },
- { new URL("http://www.nutch.org"), null },
- { new URL("http://www.nutch.org"), null },
- { new URL("http://www.nutch.org"), null },
- { new URL("http://www.nutch.org"), null },
- { new URL("http://www.nutch.org"), null },
- { new URL("http://www.nutch.org/foo/"),
- new URL("http://www.nutch.org/") },
- { new URL("http://www.nutch.org"),
- new URL("http://www.nutch.org/base/") } };
- } catch (Exception e) {
- Assert.assertTrue("couldn't make test URLs!", false);
- }
-
- for (int i = 0; i < tests.length; i++) {
- byte[] bytes = tests[i].getBytes();
-
- DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
-
- try {
- parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- HTMLMetaTags robotsMeta = new HTMLMetaTags();
- HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
-
- Assert.assertTrue("got index wrong on test " + i,
- robotsMeta.getNoIndex() == answers[i][0]);
- Assert.assertTrue("got follow wrong on test " + i,
- robotsMeta.getNoFollow() == answers[i][1]);
- Assert.assertTrue("got cache wrong on test " + i,
- robotsMeta.getNoCache() == answers[i][2]);
- Assert
- .assertTrue(
- "got base href wrong on test " + i + " (got "
- + robotsMeta.getBaseHref() + ")",
- ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
- || ((robotsMeta.getBaseHref() != null) && robotsMeta
- .getBaseHref().equals(currURLsAndAnswers[i][1])));
-
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/build.xml b/src/plugin/parse-js/build.xml
deleted file mode 100644
index d9c2146..0000000
--- a/src/plugin/parse-js/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-js" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/ivy.xml b/src/plugin/parse-js/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-js/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/plugin.xml b/src/plugin/parse-js/plugin.xml
deleted file mode 100644
index 9c06c2a..0000000
--- a/src/plugin/parse-js/plugin.xml
+++ /dev/null
@@ -1,53 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="parse-js"
- name="JavaScript Parser"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="parse-js.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.parse.js"
- name="JS Parser"
- point="org.apache.nutch.parse.Parser">
- <implementation id="JSParser"
- class="org.apache.nutch.parse.js.JSParseFilter">
- <parameter name="contentType" value="application/x-javascript"/>
- <parameter name="pathSuffix" value="js"/>
- </implementation>
- </extension>
- <extension id="org.apache.nutch.parse.js.JSParseFilter"
- name="Parse JS Filter"
- point="org.apache.nutch.parse.HtmlParseFilter">
- <implementation id="JSParseFilter"
- class="org.apache.nutch.parse.js.JSParseFilter">
- <parameter name="contentType" value="application/x-javascript"/>
- <parameter name="pathSuffix" value=""/>
- </implementation>
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
deleted file mode 100644
index 8c95372..0000000
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.js;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseText;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.w3c.dom.DocumentFragment;
-import org.w3c.dom.Element;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-/**
- * This class is a heuristic link extractor for JavaScript files and code
- * snippets. The general idea of a two-pass regex matching comes from Heritrix.
- * Parts of the code come from OutlinkExtractor.java
- */
-public class JSParseFilter implements HtmlParseFilter, Parser {
- public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);
-
- private static final int MAX_TITLE_LEN = 80;
-
- private Configuration conf;
-
- public ParseResult filter(Content content, ParseResult parseResult,
- HTMLMetaTags metaTags, DocumentFragment doc) {
-
- Parse parse = parseResult.get(content.getUrl());
-
- String url = content.getBaseUrl();
- ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
- walk(doc, parse, metaTags, url, outlinks);
- if (outlinks.size() > 0) {
- Outlink[] old = parse.getData().getOutlinks();
- String title = parse.getData().getTitle();
- List<Outlink> list = Arrays.asList(old);
- outlinks.addAll(list);
- ParseStatus status = parse.getData().getStatus();
- String text = parse.getText();
- Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks
- .size()]);
- ParseData parseData = new ParseData(status, title, newlinks, parse
- .getData().getContentMeta(), parse.getData().getParseMeta());
-
- // replace original parse obj with new one
- parseResult.put(content.getUrl(), new ParseText(text), parseData);
- }
- return parseResult;
- }
-
- private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
- List<Outlink> outlinks) {
- if (n instanceof Element) {
- String name = n.getNodeName();
- if (name.equalsIgnoreCase("script")) {
- /*
- * String lang = null; Node lNode =
- * n.getAttributes().getNamedItem("language"); if (lNode == null) lang =
- * "javascript"; else lang = lNode.getNodeValue();
- */
- StringBuffer script = new StringBuffer();
- NodeList nn = n.getChildNodes();
- if (nn.getLength() > 0) {
- for (int i = 0; i < nn.getLength(); i++) {
- if (i > 0)
- script.append('\n');
- script.append(nn.item(i).getNodeValue());
- }
- // if (LOG.isInfoEnabled()) {
- // LOG.info("script: language=" + lang + ", text: " +
- // script.toString());
- // }
- Outlink[] links = getJSLinks(script.toString(), "", base);
- if (links != null && links.length > 0)
- outlinks.addAll(Arrays.asList(links));
- // no other children of interest here, go one level up.
- return;
- }
- } else {
- // process all HTML 4.0 events, if present...
- NamedNodeMap attrs = n.getAttributes();
- int len = attrs.getLength();
- for (int i = 0; i < len; i++) {
- // Window: onload,onunload
- // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
- // Keyboard: onkeydown,onkeypress,onkeyup
- // Mouse:
- // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
- Node anode = attrs.item(i);
- Outlink[] links = null;
- if (anode.getNodeName().startsWith("on")) {
- links = getJSLinks(anode.getNodeValue(), "", base);
- } else if (anode.getNodeName().equalsIgnoreCase("href")) {
- String val = anode.getNodeValue();
- if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
- links = getJSLinks(val, "", base);
- }
- }
- if (links != null && links.length > 0)
- outlinks.addAll(Arrays.asList(links));
- }
- }
- }
- NodeList nl = n.getChildNodes();
- for (int i = 0; i < nl.getLength(); i++) {
- walk(nl.item(i), parse, metaTags, base, outlinks);
- }
- }
-
- public ParseResult getParse(Content c) {
- String type = c.getContentType();
- if (type != null && !type.trim().equals("")
- && !type.toLowerCase().startsWith("application/x-javascript"))
- return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
- "Content not JavaScript: '" + type + "'").getEmptyParseResult(
- c.getUrl(), getConf());
- String script = new String(c.getContent());
- Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
- if (outlinks == null)
- outlinks = new Outlink[0];
- // Title? use the first line of the script...
- String title;
- int idx = script.indexOf('\n');
- if (idx != -1) {
- if (idx > MAX_TITLE_LEN)
- idx = MAX_TITLE_LEN;
- title = script.substring(0, idx);
- } else {
- idx = Math.min(MAX_TITLE_LEN, script.length());
- title = script.substring(0, idx);
- }
- ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
- c.getMetadata());
- return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
- }
-
- private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
- // A simple pattern. This allows also invalid URL characters.
- private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
-
- // Alternative pattern, which limits valid url characters.
- // private static final String URI_PATTERN =
- // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
-
- /**
- * This method extracts URLs from literals embedded in JavaScript.
- */
- private Outlink[] getJSLinks(String plainText, String anchor, String base) {
-
- final List<Outlink> outlinks = new ArrayList<Outlink>();
- URL baseURL = null;
-
- try {
- baseURL = new URL(base);
- } catch (Exception e) {
- if (LOG.isErrorEnabled()) {
- LOG.error("getJSLinks", e);
- }
- }
-
- try {
- final PatternCompiler cp = new Perl5Compiler();
- final Pattern pattern = cp.compile(STRING_PATTERN,
- Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
- | Perl5Compiler.MULTILINE_MASK);
- final Pattern pattern1 = cp.compile(URI_PATTERN,
- Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
- | Perl5Compiler.MULTILINE_MASK);
- final PatternMatcher matcher = new Perl5Matcher();
-
- final PatternMatcher matcher1 = new Perl5Matcher();
- final PatternMatcherInput input = new PatternMatcherInput(plainText);
-
- MatchResult result;
- String url;
-
- // loop the matches
- while (matcher.contains(input, pattern)) {
- result = matcher.getMatch();
- url = result.group(2);
- PatternMatcherInput input1 = new PatternMatcherInput(url);
- if (!matcher1.matches(input1, pattern1)) {
- // if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'");
- // }
- continue;
- }
- if (url.startsWith("www.")) {
- url = "http://" + url;
- } else {
- // See if candidate URL is parseable. If not, pass and move on to
- // the next match.
- try {
- url = new URL(baseURL, url).toString();
- } catch (MalformedURLException ex) {
- if (LOG.isTraceEnabled()) {
- LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
- + baseURL + "'", ex);
- }
- continue;
- }
- }
- url = url.replaceAll("&amp;", "&");
- if (LOG.isTraceEnabled()) {
- LOG.trace(" - outlink from JS: '" + url + "'");
- }
- outlinks.add(new Outlink(url, anchor));
- }
- } catch (Exception ex) {
- // if it is a malformed URL we just throw it away and continue with
- // extraction.
- if (LOG.isErrorEnabled()) {
- LOG.error("getJSLinks", ex);
- }
- }
-
- final Outlink[] retval;
-
- // create array of the Outlinks
- if (outlinks != null && outlinks.size() > 0) {
- retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
- } else {
- retval = new Outlink[0];
- }
-
- return retval;
- }
-
- public static void main(String[] args) throws Exception {
- if (args.length < 2) {
- System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
- return;
- }
- InputStream in = new FileInputStream(args[0]);
- BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
- StringBuffer sb = new StringBuffer();
- String line = null;
- while ((line = br.readLine()) != null)
- sb.append(line + "\n");
- br.close();
-
- JSParseFilter parseFilter = new JSParseFilter();
- parseFilter.setConf(NutchConfiguration.create());
- Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
- System.out.println("Outlinks extracted: " + links.length);
- for (int i = 0; i < links.length; i++)
- System.out.println(" - " + links[i]);
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
deleted file mode 100644
index 36d0d14..0000000
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parser and parse filter plugin to extract all (possible) links
- * from JavaScript files and embedded JavaScript code snippets.
- */
-package org.apache.nutch.parse.js;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/README.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/README.txt b/src/plugin/parse-metatags/README.txt
deleted file mode 100644
index 0d5b009..0000000
--- a/src/plugin/parse-metatags/README.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-Parse-metatags plugin
-
-The parse-metatags plugin consists of a HTMLParserFilter which takes as parameter a list of metatag names with '*' as default value. The values are separated by ';'.
-In order to extract the values of the metatags description and keywords, you must specify in nutch-site.xml
-
-<property>
- <name>metatags.names</name>
- <value>description;keywords</value>
-</property>
-
-Prefixes the names with 'metatag.' in the parse-metadata. For instance to index description and keywords, you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description;metatag.keywords'.
-
-This code has been developed by DigitalPebble Ltd and offered to the community by ANT.com
-
-
-
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/build.xml b/src/plugin/parse-metatags/build.xml
deleted file mode 100644
index e30292d..0000000
--- a/src/plugin/parse-metatags/build.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-metatags" default="jar-core">
-
- <import file="../build-plugin.xml" />
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
- <ant target="deploy" inheritall="false" dir="../protocol-file" />
- </target>
-
-
- <!-- for junit test -->
- <mkdir dir="${build.test}/data" />
- <copy todir="${build.test}/data">
- <fileset dir="sample">
- <include name="*.html" />
- </fileset>
- </copy>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/ivy.xml b/src/plugin/parse-metatags/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-metatags/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/plugin.xml b/src/plugin/parse-metatags/plugin.xml
deleted file mode 100644
index 07933fa..0000000
--- a/src/plugin/parse-metatags/plugin.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<plugin
- id="parse-metatags"
- name="MetaTags"
- version="1.0"
- provider-name="digitalpebble.com">
-
- <runtime>
- <library name="parse-metatags.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <extension id="org.apache.nutch.parse.metatags.parser"
- name="MetaTags Parser"
- point="org.apache.nutch.parse.HtmlParseFilter">
- <implementation id="MetaTagsParser"
- class="org.apache.nutch.parse.metatags.MetaTagsParser"/>
- </extension>
-
-</plugin>
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/sample/testMetatags.html
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/sample/testMetatags.html b/src/plugin/parse-metatags/sample/testMetatags.html
deleted file mode 100644
index e9e8e6b..0000000
--- a/src/plugin/parse-metatags/sample/testMetatags.html
+++ /dev/null
@@ -1,9 +0,0 @@
-<html>
-<head>
-<meta name="Keywords" content="This is a test of keywords" />
-<meta name="Description" content="This is a test of description" />
-</head>
-<body>
-text of the document
-</body>
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
deleted file mode 100644
index ca8b737..0000000
--- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
+++ /dev/null
@@ -1,12 +0,0 @@
-<html>
-<head>
-<meta name="DC.creator" content="Doug Cutting">
-<meta name="DC.creator" content="Michael Cafarella">
-<!-- meta keywords in different casing -->
-<meta name="keywords" lang="en" content="web crawler" />
-<meta name="Keywords" lang="fr" content="robot d'indexation" />
-<meta name="KEYWORDS" lang="de" content="Webcrawler" />
-</head>
-<body>
-A test for multi-valued metatags.
-</body>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
deleted file mode 100644
index f9b9722..0000000
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.metatags;
-
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Properties;
-import java.util.Set;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.w3c.dom.DocumentFragment;
-
-/**
- * Parse HTML meta tags (keywords, description) and store them in the parse
- * metadata so that they can be indexed with the index-metadata plugin with the
- * prefix 'metatag.'. Metatags are matched ignoring case.
- */
-public class MetaTagsParser implements HtmlParseFilter {
-
-  private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
-      .getName());
-
-  private Configuration conf;
-
-  /** Lower-cased names of the metatags to keep; "*" keeps all of them. */
-  private Set<String> metatagset = new HashSet<String>();
-
-  /**
-   * Stores the configuration and rebuilds the set of wanted metatag names
-   * from the property "metatags.names" (every metatag, "*", if it is unset).
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    // forget the names of an earlier call before reading the new ones
-    metatagset.clear();
-    for (String val : conf.getStrings("metatags.names", "*")) {
-      // trim: blanks around the separating commas must not prevent a match
-      metatagset.add(val.trim().toLowerCase(Locale.ROOT));
-    }
-  }
-
-  /** Returns the configuration handed to {@link #setConf(Configuration)}. */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /** Single-valued variant: adds one value if the metatag is wanted. */
-  private void addIndexedMetatags(Metadata metadata, String metatag,
-      String value) {
-    addIndexedMetatags(metadata, metatag, new String[] { value });
-  }
-
-  /**
-   * Check whether the metatag is in the list of metatags to be indexed (or if
-   * '*' is specified). If yes, add it with all values to parse metadata under
-   * the lower-cased name prefixed with "metatag.".
-   */
-  private void addIndexedMetatags(Metadata metadata, String metatag,
-      String[] values) {
-    String lcMetatag = metatag.toLowerCase(Locale.ROOT);
-    if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
-      for (String value : values) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
-        }
-        metadata.add("metatag." + lcMetatag, value);
-      }
-    }
-  }
-
-  /**
-   * Adds the wanted metatags found in the parse metadata, the general meta
-   * tags and the http-equiv tags to the parse metadata; returns parseResult.
-   */
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-    Parse parse = parseResult.get(content.getUrl());
-    Metadata metadata = parse.getData().getParseMeta();
-
-    // check in the metadata first : the tika-parser
-    // might have stored the values there already
-    for (String mdName : metadata.names()) {
-      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
-    }
-
-    Metadata generalMetaTags = metaTags.getGeneralTags();
-    for (String tagName : generalMetaTags.names()) {
-      addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName));
-    }
-
-    Properties httpequiv = metaTags.getHttpEquivTags();
-    for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
-        .hasMoreElements();) {
-      String name = (String) tagNames.nextElement();
-      addIndexedMetatags(metadata, name, httpequiv.getProperty(name));
-    }
-
-    return parseResult;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
deleted file mode 100644
index a55cf5c..0000000
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse filter to extract meta tags: keywords, description, etc.
- * Used in combination with index-metadata plugin
- * (see {@link org.apache.nutch.indexer.metadata}).
- */
-package org.apache.nutch.parse.metatags;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
deleted file mode 100644
index 024aadf..0000000
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.metatags;
-
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestMetatagParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  private String sampleDir = System.getProperty("test.data", ".");
-  private String sampleFile = "testMetatags.html";
-  private String sampleFileMultival = "testMultivalueMetatags.html";
-  private String description = "This is a test of description";
-  private String keywords = "This is a test of keywords";
-
-  /**
-   * Fetches the sample file through its file: URL, parses it and returns the
-   * parse metadata. Any exception fails the running test.
-   */
-  public Metadata parseMeta(String fileName, Configuration conf) {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    try {
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-      return parse.getData().getParseMeta();
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.toString());
-    }
-    return null;
-  }
-
-  /** test defaults: keywords and description */
-  @Test
-  public void testIt() {
-    // check that we get the same values
-    Metadata parseMeta = parseMeta(sampleFile, NutchConfiguration.create());
-    Assert.assertEquals(description, parseMeta.get("metatag.description"));
-    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
-  }
-
-  /** test multiple metatags resulting in metadata with multiple values */
-  @Test
-  public void testMultiValueMetatags() {
-    Configuration conf = NutchConfiguration.create();
-    conf.set("metatags.names", "keywords,DC.creator");
-    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
-
-    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
-
-    assertHasValues(parseMeta, "metatag.dc.creator", "Doug Cutting",
-        "Michael Cafarella");
-    assertHasValues(parseMeta, "metatag.keywords", "robot d'indexation",
-        "web crawler", "Webcrawler");
-  }
-
-  /**
-   * Asserts that every expected value is among the values stored in the
-   * metadata under the given name.
-   */
-  private void assertHasValues(Metadata meta, String name,
-      String... expected) {
-    Set<String> found = new TreeSet<String>();
-    for (String stored : meta.getValues(name)) {
-      found.add(stored);
-    }
-    for (String wanted : expected) {
-      Assert.assertTrue(
-          "One value of metatag with multiple values is missing: " + wanted,
-          found.contains(wanted));
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/README.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/README.txt b/src/plugin/parse-replace/README.txt
deleted file mode 100644
index a18bd9c..0000000
--- a/src/plugin/parse-replace/README.txt
+++ /dev/null
@@ -1,91 +0,0 @@
-ParseReplace plugin
-
-Allows post-parsing regexp replace manipulation of metadata fields.
-
-Configuration Example
- <property>
- <name>parse.replace.regexp</name>
- <value>
- id=/file:/http:/
- url=/file:/http:/128
- </value>
- </property>
-
-Property format: parse.replace.regexp
- The format of the property is a list of regexp replacements, one line per field being
- modified. Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
-
- The fieldname precedes the equal sign. The first character after the equal sign signifies
- the delimiter for the regexp, the replacement value and the flags.
-
-Replacement Sequence
- The replacements will happen in the order listed. If a field needs multiple replacement operations
- they may be listed more than once.
-
-RegExp Format
- The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
- here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
- Patterns are compiled when the plugin is initialized for efficiency.
-
-Replacement Format
- The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement):
- http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
-
-Flags
- The flags is an integer sum of the flag values defined in
- http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
-
-Escaping
- Since the regexp is being read from a config file, any escaped values must be double
- escaped. E.g.: id=/\\s+// will cause the escaped \s+ match pattern to be used.
-
-Multi-valued Fields
- If a field has multiple values, the replacement will be applied to each value in turn.
-
-Non-string Datatypes
- Replacement is possible only on String field datatypes. If the field you name in the property is
- not a String datatype, it will be silently ignored.
-
-Host and URL specific replacements.
- If the replacements should apply only to specific pages, then add a sequence like
-
- hostmatch=/host match pattern/
- fld1=/regexp/replace/flags
- fld2=/regexp/replace/flags
-
- or
- urlmatch=/url match pattern/
- fld1=/regexp/replace/flags
- fld2=/regexp/replace/flags
-
-When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
-will apply to all parsed pages. Replacements following a hostmatch or urlmatch will be applied
-to pages which match the host or url field (up to the next hostmatch or urlmatch line). hostmatch
-and urlmatch patterns must be unique in this property.
-
-Plugin order
- TBD... But in most cases you will want this plugin to run last.
-
-Testing your match patterns
- Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
- can help get the basics of your pattern working.
- To test in nutch:
- Prepare a test HTML file with the field contents you want to test.
- Place this in a directory accessible to nutch.
- Use the file:/// syntax to list the test file(s) in a test/urls seed list.
- See the nutch faq "index my local file system" for conf settings you will need.
- (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
- test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
- patterns also match the file: URL pattern)
-
- Run..
- bin/nutch inject crawl/crawldb test
- bin/nutch generate crawl/crawldb crawl/segments
- bin/nutch fetch crawl/segments/[segment]
- bin/nutch parse crawl/segments/[segment]
-
- To inspect the returned fields...
- bin/nutch readseg -dump crawl/segments/[segment] testout
- less testout/dump
-
- To retry: delete crawl/segments/[segment]/crawl_parse and repeat the parse and dump step.
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/build.xml b/src/plugin/parse-replace/build.xml
deleted file mode 100644
index ca5ccf7..0000000
--- a/src/plugin/parse-replace/build.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-replace" default="jar-core">
-
- <import file="../build-plugin.xml" />
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
- <ant target="deploy" inheritall="false" dir="../protocol-file" />
- </target>
-
-
- <!-- for junit test -->
- <mkdir dir="${build.test}/data" />
- <copy todir="${build.test}/data">
- <fileset dir="sample">
- <include name="*.html" />
- </fileset>
- </copy>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/ivy.xml b/src/plugin/parse-replace/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-replace/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/plugin.xml b/src/plugin/parse-replace/plugin.xml
deleted file mode 100644
index 6368210..0000000
--- a/src/plugin/parse-replace/plugin.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<plugin
- id="parse-replace"
- name="ReplaceParser"
- version="1.0"
- provider-name="PeterCiuffetti">
-
- <runtime>
- <library name="parse-replace.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <extension id="org.apache.nutch.parse.replace.parser"
- name="Replace Parser"
- point="org.apache.nutch.parse.HtmlParseFilter">
- <implementation id="ReplaceParser"
- class="org.apache.nutch.parse.replace.ReplaceParser"/>
- </extension>
-
-</plugin>
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/sample/testParseReplace.html
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/sample/testParseReplace.html b/src/plugin/parse-replace/sample/testParseReplace.html
deleted file mode 100644
index 825dcb9..0000000
--- a/src/plugin/parse-replace/sample/testParseReplace.html
+++ /dev/null
@@ -1,11 +0,0 @@
-<html>
- <head>
- <title>Testing the power of parser-replace plugin</title>
- <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!">
- <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!">
- <meta name="author" content="Peter Ciuffetti">
- </head>
- <body>
- <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. A decidely boring thing to do.</p>
- </body>
-</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java b/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java
deleted file mode 100644
index 9773c4a..0000000
--- a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.replace;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.w3c.dom.DocumentFragment;
-
-/**
- * Do pattern replacements on selected field contents
- * prior to indexing.
- */
-public class ReplaceParser implements HtmlParseFilter {
-
-  private static final Log LOG = LogFactory.getLog(ReplaceParser.class
-      .getName());
-
-  // Nothing in this class reads or fills these maps yet.
-  private static Map<String, List<Object>> REPLACEPATTERNS_BY_HOST = new HashMap<String, List<Object>>();
-  private static Map<String, List<Object>> REPLACEPATTERNS_BY_URL = new HashMap<String, List<Object>>();
-
-  private Configuration conf;
-
-  /** Keeps conf; passes "parse.replace.regexp", when set, to parseConf. */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    String[] values = conf.getStrings("parse.replace.regexp", null);
-    if (values != null) {
-      this.parseConf(values);
-    }
-  }
-
-  /** Returns the configuration handed to {@link #setConf(Configuration)}. */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /** Placeholder: the replacement rules are not parsed yet. */
-  private void parseConf(String[] values) {
-  }
-
-  /** Placeholder: applies no replacement and returns parseResult as is. */
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-    Parse parse = parseResult.get(content.getUrl());
-    return parseResult;
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java b/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java
deleted file mode 100644
index b678f00..0000000
--- a/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse filter to allow pattern replacements on parsed metadata.
- */
-package org.apache.nutch.parse.replace;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java b/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
deleted file mode 100644
index 593d5ed..0000000
--- a/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.replace;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestParseReplace {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  private String sampleDir = System.getProperty("test.data", ".");
-  private String sampleFile = "testParseReplace.html";
-  // contents of the description and keywords meta tags of the sample file
-  private String description = "With this plugin, nutch is my bitch! Bwuhuhuhaha!";
-  private String keywords = "Awesome, Riveting, Two Thumbs Up!";
-
-  /** Fetches and parses the sample file; returns its parse metadata. */
-  public Metadata parseMeta(String fileName, Configuration conf) {
-    Metadata metadata = null;
-    try {
-      String urlString = "file:" + sampleDir + fileSeparator + fileName;
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-      metadata = parse.getData().getParseMeta();
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.toString());
-    }
-    return metadata;
-  }
-
-  /** No rules configured: metatags of the sample must come through as is. */
-  // NOTE(review): presumes parse-metatags is active to fill "metatag.*"
-  @Test
-  public void testIt() {
-    Metadata parseMeta = parseMeta(sampleFile, NutchConfiguration.create());
-
-    Assert.assertEquals(description, parseMeta.get("metatag.description"));
-    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/build.xml b/src/plugin/parse-swf/build.xml
deleted file mode 100644
index f4fb20f..0000000
--- a/src/plugin/parse-swf/build.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-swf" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- <ant target="deploy" inheritall="false" dir="../protocol-file"/>
- </target>
-
-
- <!-- for junit test -->
- <mkdir dir="${build.test}/data"/>
- <copy file="sample/test1.swf" todir="${build.test}/data"/>
- <copy file="sample/test2.swf" todir="${build.test}/data"/>
- <copy file="sample/test3.swf" todir="${build.test}/data"/>
- <copy file="sample/test1.txt" todir="${build.test}/data"/>
- <copy file="sample/test2.txt" todir="${build.test}/data"/>
- <copy file="sample/test3.txt" todir="${build.test}/data"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/ivy.xml b/src/plugin/parse-swf/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-swf/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/lib/javaswf-LICENSE.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt b/src/plugin/parse-swf/lib/javaswf-LICENSE.txt
deleted file mode 100644
index 4138a66..0000000
--- a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-
- Copyright (c) 2001-2005, David N. Main, All rights reserved.
-
- Redistribution and use in source and binary forms, with or
- without modification, are permitted provided that the
- following conditions are met:
-
- 1. Redistributions of source code must retain the above
- copyright notice, this list of conditions and the following
- disclaimer.
-
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- 3. The name of the author may not be used to endorse or
- promote products derived from this software without specific
- prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/lib/javaswf.jar
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/lib/javaswf.jar b/src/plugin/parse-swf/lib/javaswf.jar
deleted file mode 100644
index 78f9b0b..0000000
Binary files a/src/plugin/parse-swf/lib/javaswf.jar and /dev/null differ
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/plugin.xml b/src/plugin/parse-swf/plugin.xml
deleted file mode 100644
index 8cc72c0..0000000
--- a/src/plugin/parse-swf/plugin.xml
+++ /dev/null
@@ -1,44 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="parse-swf"
- name="SWF Parse Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
-
-
- <runtime>
- <library name="parse-swf.jar">
- <export name="*"/>
- </library>
- <library name="javaswf.jar"/>
- </runtime>
-
- <extension id="org.apache.nutch.parse.swf"
- name="SWFParse"
- point="org.apache.nutch.parse.Parser">
-
- <implementation id="org.apache.nutch.parse.swf.SWFParser"
- class="org.apache.nutch.parse.swf.SWFParser">
- <parameter name="contentType" value="application/x-shockwave-flash"/>
- <parameter name="pathSuffix" value="swf"/>
- </implementation>
-
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test1.swf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test1.swf b/src/plugin/parse-swf/sample/test1.swf
deleted file mode 100644
index cd2019b..0000000
Binary files a/src/plugin/parse-swf/sample/test1.swf and /dev/null differ
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test1.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test1.txt b/src/plugin/parse-swf/sample/test1.txt
deleted file mode 100644
index 68505d5..0000000
--- a/src/plugin/parse-swf/sample/test1.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-
---------
-/go/gnav_cart
-/go/gnav_company
-/go/gnav_devnet
-/go/gnav_downloads
-/go/gnav_fl_minmessage
-/go/gnav_help
-/go/gnav_mm_home
-/go/gnav_products
-/go/gnav_search?loc=en_us
-/go/gnav_showcase
-/go/gnav_solutions
-/go/gnav_store
-/go/gnav_support
-/go/gnav_your_account
-Acquisition Info
-Adobe Home
-AppleGothic
-Array
-Company
-Developers
-Downloads
-Help
-Home
-International
-LocaleManager
-Macromedia Flash Player
-Macromedia Home
-MovieClip
-Products
-Showcase
-Solutions
-Store
-String
-Support
-TextFormat
-To ensure the best possible Internet Experience, please download the latest version of the free
-Verdana
-_sans
-active
-bluePill
-button
-color
-company
-devnet
-downloads
-en_us
-home
-javascript:openCrosslinkWindow('/go/adobeacquisition')
-javascript:openCrosslinkWindow('/go/gnav_adobe_home')
-products
-rollOut
-rollOver
-selected
-showcase
-solutions
-support
-tabHolder
-textColor
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test2.swf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test2.swf b/src/plugin/parse-swf/sample/test2.swf
deleted file mode 100644
index eb9b03d..0000000
Binary files a/src/plugin/parse-swf/sample/test2.swf and /dev/null differ
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test2.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test2.txt b/src/plugin/parse-swf/sample/test2.txt
deleted file mode 100644
index f77b78a..0000000
--- a/src/plugin/parse-swf/sample/test2.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini
---------
-TextFormat
-color
-font
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test3.swf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test3.swf b/src/plugin/parse-swf/sample/test3.swf
deleted file mode 100644
index 4df9f1e..0000000
Binary files a/src/plugin/parse-swf/sample/test3.swf and /dev/null differ
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/sample/test3.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/sample/test3.txt b/src/plugin/parse-swf/sample/test3.txt
deleted file mode 100644
index 66ae3d8..0000000
--- a/src/plugin/parse-swf/sample/test3.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-Mix.
- Edit.
- Master.
- Compose.
- Animate.
- With a single suite of powerful tools
- that work together as one.
- World-class video and audio tools that bring
- new power and efficiency to your film, video,
- DVD, and web workflows.
- Learn more.