You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:09 UTC
[25/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
new file mode 100644
index 0000000..2988114
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Filters URLs based on a file of regular expressions using the
+ * {@link java.util.regex Java Regex implementation}.
+ */
+public class RegexURLFilter extends RegexURLFilterBase {
+
+  /** Name of the configuration property that names the rules file. */
+  public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
+  /**
+   * Name of the configuration property that holds the rules inline. When it
+   * is set, it is used instead of the rules file.
+   */
+  public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
+
+  /**
+   * Creates a filter through the base-class default constructor.
+   * {@link #main(String[])} configures such an instance afterwards by calling
+   * <code>setConf</code>.
+   */
+  public RegexURLFilter() {
+    super();
+  }
+
+  /**
+   * Creates a filter by handing the given file name to the base class.
+   *
+   * @param filename
+   *          name of the rules file, passed unchanged to the base class
+   */
+  public RegexURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
+    super(filename);
+  }
+
+  /**
+   * Creates a filter by handing the given reader to the base class.
+   * Package-private; the unit test of this package builds filters this way.
+   *
+   * @param reader
+   *          source of the rules, passed unchanged to the base class
+   */
+  RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException {
+    super(reader);
+  }
+
+  /*
+   * ----------------------------------- * <implementation:RegexURLFilterBase> *
+   * -----------------------------------
+   */
+
+  /**
+   * Rules specified as a config property will override rules specified as a
+   * config file.
+   *
+   * @param conf
+   *          configuration to read {@link #URLFILTER_REGEX_RULES} and
+   *          {@link #URLFILTER_REGEX_FILE} from
+   * @return a reader over the inline rules if the rules property is set,
+   *         otherwise whatever the configuration returns for the resource
+   *         named by the file property
+   */
+  protected Reader getRulesReader(Configuration conf) throws IOException {
+    String stringRules = conf.get(URLFILTER_REGEX_RULES);
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    String fileRules = conf.get(URLFILTER_REGEX_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+
+  /**
+   * Creates a rule that additionally carries a host or domain.
+   *
+   * @param sign
+   *          sign of the rule, passed unchanged to {@link RegexRule}
+   * @param regex
+   *          regular expression, compiled with {@link Pattern#compile(String)}
+   * @param hostOrDomain
+   *          host or domain passed unchanged to {@link RegexRule}
+   */
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+
+  /*
+   * ------------------------------------ * </implementation:RegexURLFilterBase>
+   * * ------------------------------------
+   */
+
+  /**
+   * Command-line entry point: builds a filter, configures it from
+   * {@link NutchConfiguration#create()} and delegates to the inherited
+   * <code>main(filter, args)</code>.
+   */
+  public static void main(String args[]) throws IOException {
+    RegexURLFilter filter = new RegexURLFilter();
+    filter.setConf(NutchConfiguration.create());
+    main(filter, args);
+  }
+
+  /**
+   * Rule backed by a compiled {@link Pattern}. Declared static because it
+   * never touches the enclosing filter, so no hidden reference to it is kept.
+   */
+  private static class Rule extends RegexRule {
+
+    /** Compiled form of the rule's regular expression; never reassigned. */
+    private final Pattern pattern;
+
+    Rule(boolean sign, String regex) {
+      this(sign, regex, null);
+    }
+
+    /**
+     * @throws PatternSyntaxException
+     *           if <code>regex</code> is not a valid regular expression
+     */
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      pattern = Pattern.compile(regex);
+    }
+
+    /**
+     * Uses {@link java.util.regex.Matcher#find()}, so the rule matches as soon
+     * as the expression occurs anywhere in the URL; it does not have to cover
+     * the whole URL.
+     */
+    protected boolean match(String url) {
+      return pattern.matcher(url).find();
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
new file mode 100644
index 0000000..7acf73b
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>URL filter plugin to include and/or exclude URLs matching Java regular expressions.</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
new file mode 100644
index 0000000..b86181e
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.*;
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based test of class <code>RegexURLFilter</code>.
+ *
+ * @author Jérôme Charron
+ */
+public class TestRegexURLFilter extends RegexURLFilterBaseTest {
+
+  /**
+   * Builds the filter under test from the given rules. An
+   * <code>IOException</code> raised while building it fails the test.
+   */
+  protected URLFilter getURLFilter(Reader rules) {
+    URLFilter created = null;
+    try {
+      created = new RegexURLFilter(rules);
+    } catch (IOException e) {
+      Assert.fail(e.toString());
+    }
+    return created;
+  }
+
+  /** Runs the two rule-set tests, then the benchmark at growing sizes. */
+  @Test
+  public void test() {
+    String[] ruleSets = { "WholeWebCrawling", "IntranetCrawling" };
+    for (String ruleSet : ruleSets) {
+      test(ruleSet);
+    }
+    int[] benchSizes = { 50, 100, 200, 400, 800 };
+    for (int size : benchSizes) {
+      bench(size, "Benchmarks");
+    }
+  }
+
+  /** Runs the rule set named "nutch1838". */
+  @Test
+  public void test1838() {
+    test("nutch1838");
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/build.xml b/nutch-plugins/urlfilter-suffix/build.xml
new file mode 100644
index 0000000..e5382c6
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-suffix" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/ivy.xml b/nutch-plugins/urlfilter-suffix/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/plugin.xml b/nutch-plugins/urlfilter-suffix/plugin.xml
new file mode 100644
index 0000000..f326d15
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-suffix"
+ name="Suffix URL Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-suffix.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.suffix"
+ name="Nutch Suffix URL Filter"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="SuffixURLFilter"
+ class="org.apache.nutch.urlfilter.suffix.SuffixURLFilter"/>
+ <!-- by default, attribute "file" is undefined, to keep classic behavior.
+ <implementation id="SuffixURLFilter"
+ class="org.apache.nutch.urlfilter.suffix.SuffixURLFilter">
+ <parameter name="file" value="urlfilter-suffix.txt"/>
+ </implementation>
+ -->
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/pom.xml b/nutch-plugins/urlfilter-suffix/pom.xml
new file mode 100644
index 0000000..82023c6
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlfilter-suffix</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlfilter-suffix</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
new file mode 100644
index 0000000..39c541f
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -0,0 +1,331 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.urlfilter.suffix;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.*;
+
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.SuffixStringMatcher;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.StringReader;
+
+import java.util.List;
+import java.util.ArrayList;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+
+/**
+ * Filters URLs based on a file of URL suffixes. The file is named by
+ * <ol>
+ * <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li>
+ * <li>attribute "file" in plugin.xml of this plugin</li>
+ * </ol>
+ * Attribute "file" has higher precedence if defined. If the config file is
+ * missing, all URLs will be rejected.
+ *
+ * <p>
+ * This filter can be configured to work in one of two modes:
+ * <ul>
+ * <li><b>default to reject</b> ('-'): in this mode, only URLs that match
+ * suffixes specified in the config file will be accepted, all other URLs will
+ * be rejected.</li>
+ * <li><b>default to accept</b> ('+'): in this mode, only URLs that match
+ * suffixes specified in the config file will be rejected, all other URLs will
+ * be accepted.</li>
+ * </ul>
+ * <p>
+ * The format of this config file is one URL suffix per line, with no preceding
+ * whitespace. Order, in which suffixes are specified, doesn't matter. Blank
+ * lines and comments (#) are allowed.
+ * </p>
+ * <p>
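+ * A single '+' or '-' sign not followed by any suffix must be used once, to
+ * signify the mode this plugin operates in. An optional single 'I' can be
+ * appended, to signify that suffix matches should be case-insensitive. The
+ * default, if not specified, is to use case-sensitive matches, i.e. suffix
+ * '.JPG' does not match '.jpg'. An optional single 'P' can also be appended,
+ * to signify that suffixes are matched against the path component of the URL
+ * only (ignoring any query string or fragment) instead of the whole URL.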
+ * </p>
+ * <p>
+ * NOTE: the format of this file is different from urlfilter-prefix, because
+ * that plugin doesn't support allowed/prohibited prefixes (only supports
+ * allowed prefixes). Please note that this plugin does not support regular
+ * expressions, it only accepts literal suffixes. I.e. a suffix "+*.jpg" is most
+ * probably wrong, you should use "+.jpg" instead.
+ * </p>
+ * <h4>Example 1</h4>
+ * <p>
+ * The configuration shown below will accept all URLs with '.html' or '.htm'
+ * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit
+ * all other suffixes.
+ * <p>
+ *
+ * <pre>
+ * # this is a comment
+ *
+ * # prohibit all unknown, case-sensitive matching
+ * -
+ *
+ * # collect only HTML files.
+ * .html
+ * .htm
+ * </pre>
+ *
+ * </p>
+ * <h4>Example 2</h4>
+ * <p>
+ * The configuration shown below will accept all URLs except common graphical
+ * formats.
+ * <p>
+ *
+ * <pre>
+ * # this is a comment
+ *
+ * # allow all unknown, case-insensitive matching
+ * +I
+ *
+ * # prohibited suffixes
+ * .gif
+ * .png
+ * .jpg
+ * .jpeg
+ * .bmp
+ * </pre>
+ *
+ * </p>
+ *
+ * @author Andrzej Bialecki
+ */
+public class SuffixURLFilter implements URLFilter {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(SuffixURLFilter.class);
+
+ // Value of attribute "file" of this plugin's extension, read in setConf();
+ // null when the attribute is absent or blank.
+ private String attributeFile = null;
+
+ // Matcher over the configured suffixes; assigned by readConfiguration().
+ private SuffixStringMatcher suffixes;
+ // true: URLs matching a suffix are rejected, all others accepted ('+' mode);
+ // false: only URLs matching a suffix are accepted ('-' mode).
+ private boolean modeAccept = false;
+ // true: filter() matches suffixes against the URL path instead of the whole URL.
+ private boolean filterFromPath = false;
+ // true: filter() lower-cases the URL before matching.
+ private boolean ignoreCase = false;
+
+ // Configuration handed to setConf(); returned by getConf().
+ private Configuration conf;
+
+ /**
+ * Creates a filter without loading any rules. main() follows this
+ * constructor with setConf(), which loads them; until rules are loaded the
+ * suffix matcher is unset.
+ */
+ public SuffixURLFilter() throws IOException {
+
+ }
+
+ /**
+ * Creates a filter and immediately loads its rules from the given reader
+ * via readConfiguration().
+ *
+ * @param reader
+ * source of the rules; readConfiguration() treats null as a missing
+ * config file
+ */
+ public SuffixURLFilter(Reader reader) throws IOException {
+ readConfiguration(reader);
+ }
+
+  /**
+   * Applies the suffix rules to a URL.
+   *
+   * @param url
+   *          URL to check; may be null
+   * @return <code>url</code> itself if it is accepted, or null if it is
+   *         rejected or was null
+   */
+  public String filter(String url) {
+    if (url == null) {
+      return null;
+    }
+    // Locale.ROOT keeps lower-casing independent of the JVM default locale
+    // (under a Turkish locale "GIF" would not lower-case to "gif").
+    String candidate = ignoreCase ? url.toLowerCase(java.util.Locale.ROOT)
+        : url;
+    if (filterFromPath) {
+      try {
+        candidate = new URL(candidate).getPath();
+      } catch (MalformedURLException e) {
+        // not parseable as a URL: keep matching against the whole string
+      }
+    }
+
+    boolean matched = suffixes.shortestMatch(candidate) != null;
+    // accept mode rejects matching URLs; reject mode accepts only matching ones
+    return matched == modeAccept ? null : url;
+  }
+
+  /**
+   * Loads mode, flags and suffixes from the given reader, replacing the
+   * current settings. Lines are trimmed; empty lines and lines starting with
+   * '#' are skipped. A line starting with '+' or '-' selects the mode, and may
+   * contain 'I' (case-insensitive matching) and 'P' (match against the URL
+   * path only). Every other line is taken as a suffix. The reader is not
+   * closed by this method.
+   *
+   * @param reader
+   *          source of the rules; if null, a warning is logged and the filter
+   *          is set up to reject every URL
+   * @throws IOException
+   *           if reading from <code>reader</code> fails
+   */
+  public void readConfiguration(Reader reader) throws IOException {
+
+    // handle missing config file
+    if (reader == null) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
+      }
+      suffixes = new SuffixStringMatcher(new String[0]);
+      modeAccept = false;
+      ignoreCase = false;
+      filterFromPath = false;
+      return;
+    }
+    BufferedReader in = new BufferedReader(reader);
+    List<String> aSuffixes = new ArrayList<String>();
+    boolean allow = false;
+    boolean ignore = false;
+    // collected locally like the other flags, so a re-read starts from a
+    // clean state instead of inheriting 'P' from an earlier configuration
+    boolean fromPath = false;
+    String line;
+
+    while ((line = in.readLine()) != null) {
+      line = line.trim();
+      // skip blank & comment lines
+      if (line.length() == 0 || line.charAt(0) == '#') {
+        continue;
+      }
+
+      char first = line.charAt(0);
+      if (first == '+' || first == '-') {
+        allow = (first == '+');
+        if (line.contains("P")) {
+          fromPath = true;
+        }
+        if (line.contains("I")) {
+          ignore = true;
+        }
+      } else {
+        aSuffixes.add(line);
+      }
+    }
+    if (ignore) {
+      // same locale-independent lower-casing as filter() applies to URLs
+      for (int i = 0; i < aSuffixes.size(); i++) {
+        aSuffixes.set(i, aSuffixes.get(i).toLowerCase(java.util.Locale.ROOT));
+      }
+    }
+    suffixes = new SuffixStringMatcher(aSuffixes);
+    modeAccept = allow;
+    ignoreCase = ignore;
+    filterFromPath = fromPath;
+  }
+
+  /**
+   * Command-line check: reads URLs from standard input, one per line, and
+   * prints "ACCEPTED" or "REJECTED" followed by the URL.
+   *
+   * @param args
+   *          optional; if <code>args[0]</code> is given, it names the rules
+   *          file to load, otherwise the filter is configured from
+   *          {@link NutchConfiguration#create()}
+   */
+  public static void main(String args[]) throws IOException {
+
+    SuffixURLFilter filter;
+    if (args.length >= 1) {
+      Reader rules = new FileReader(args[0]);
+      try {
+        filter = new SuffixURLFilter(rules);
+      } finally {
+        // the constructor has consumed the rules; release the file handle
+        rules.close();
+      }
+    } else {
+      filter = new SuffixURLFilter();
+      filter.setConf(NutchConfiguration.create());
+    }
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
+        System.out.println("ACCEPTED " + out);
+      } else {
+        // out is null in this branch, so echo the input line instead
+        System.out.println("REJECTED " + line);
+      }
+    }
+  }
+
+  /**
+   * Stores the configuration and loads the filter rules. Rules come from
+   * property "urlfilter.suffix.rules" if it is set. Otherwise they are read
+   * from the configuration resource named by attribute "file" of this
+   * plugin's extension, or, if that attribute is absent or blank, by property
+   * "urlfilter.suffix.file".
+   *
+   * @param conf
+   *          configuration to read the properties and resources from
+   * @throws RuntimeException
+   *           wrapping the <code>IOException</code> if reading the rules fails
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    String pluginName = "urlfilter-suffix";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (Extension extension : extensions) {
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+    // a blank attribute counts as "not defined"
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+    if (attributeFile != null && LOG.isInfoEnabled()) {
+      LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+          + " as " + attributeFile);
+    }
+
+    String file = conf.get("urlfilter.suffix.file");
+    String stringRules = conf.get("urlfilter.suffix.rules");
+    // attribute "file" takes precedence if defined
+    if (attributeFile != null) {
+      file = attributeFile;
+    }
+    Reader reader;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+
+    try {
+      readConfiguration(reader);
+    } catch (IOException e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
+    } finally {
+      // the reader was opened here, so it is released here; it may be null
+      // when no rules resource was found
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException ignored) {
+          // best effort: the rules were already read or the failure above
+          // is already being reported
+        }
+      }
+    }
+  }
+
+ /** Returns the configuration last passed to setConf(), or null if none was. */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+ * Returns the current mode: true if URLs matching a suffix are rejected and
+ * all others accepted, false if only matching URLs are accepted.
+ */
+ public boolean isModeAccept() {
+ return modeAccept;
+ }
+
+ /** Overrides the mode; see isModeAccept() for the meaning of the flag. */
+ public void setModeAccept(boolean modeAccept) {
+ this.modeAccept = modeAccept;
+ }
+
+ /** Returns true if filter() lower-cases URLs before matching. */
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+
+ /**
+ * Switches lower-casing of URLs in filter() on or off. Only the flag is
+ * changed: suffixes that were already loaded are not re-cased here.
+ */
+ public void setIgnoreCase(boolean ignoreCase) {
+ this.ignoreCase = ignoreCase;
+ }
+
+ /**
+ * Selects whether filter() matches suffixes against the URL path only (true)
+ * or against the whole URL string (false).
+ */
+ public void setFilterFromPath(boolean filterFromPath) {
+ this.filterFromPath = filterFromPath;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
new file mode 100644
index 0000000..0449acc
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to either exclude or include only URLs which match
+ * one of the given (path) suffixes.
+ */
+package org.apache.nutch.urlfilter.suffix;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
new file mode 100644
index 0000000..b09ca2f
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.suffix;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit test for <code>SuffixURLFilter</code>.
+ *
+ * @author Andrzej Bialecki
+ */
+public class TestSuffixURLFilter {
+  /** Rules loaded before each test: a comment, a blank line, three suffixes. */
+  private static final String suffixes = "# this is a comment\n" + "\n"
+      + ".gif\n" + ".jpg\n" + ".js\n";
+
+  /** URLs fed to the filter; the expectation arrays below are index-aligned. */
+  private static final String[] urls = new String[] {
+      "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF",
+      "http://www.example.com/test.jpg", "http://www.example.com/test.JPG",
+      "http://www.example.com/test.html", "http://www.example.com/test.HTML",
+      "http://www.example.com/test.html?q=abc.js",
+      "http://www.example.com/test.js?foo=bar&baz=bar#12333", };
+
+  // Expected filter results per URL: the URL itself if accepted, null if
+  // rejected.
+
+  private static final String[] urlsModeAccept = new String[] { null, urls[1],
+      null, urls[3], urls[4], urls[5], null, urls[7] };
+
+  private static final String[] urlsModeReject = new String[] { urls[0], null,
+      urls[2], null, null, null, urls[6], null };
+
+  private static final String[] urlsModeAcceptIgnoreCase = new String[] { null,
+      null, null, null, urls[4], urls[5], null, urls[7] };
+
+  private static final String[] urlsModeRejectIgnoreCase = new String[] {
+      urls[0], urls[1], urls[2], urls[3], null, null, urls[6], null };
+
+  // In path mode the query string and fragment are ignored, so urls[6] is no
+  // longer seen as ending in ".js" while urls[7] is.
+  private static final String[] urlsModeAcceptAndPathFilter = new String[] {
+      null, urls[1], null, urls[3], urls[4], urls[5], urls[6], null };
+
+  private static final String[] urlsModeAcceptAndNonPathFilter = new String[] {
+      null, urls[1], null, urls[3], urls[4], urls[5], null, urls[7] };
+
+  private SuffixURLFilter filter = null;
+
+  @Before
+  public void setUp() throws IOException {
+    filter = new SuffixURLFilter(new StringReader(suffixes));
+  }
+
+  /**
+   * Filters every entry of {@link #urls} and compares the outcome with the
+   * index-aligned expectation. Compared by value, with the URL in the failure
+   * message, so a failing case is identifiable from the report.
+   */
+  private void assertFiltered(String[] expected) {
+    for (int i = 0; i < urls.length; i++) {
+      Assert.assertEquals("unexpected filter result for " + urls[i],
+          expected[i], filter.filter(urls[i]));
+    }
+  }
+
+  @Test
+  public void testModeAccept() {
+    filter.setIgnoreCase(false);
+    filter.setModeAccept(true);
+    assertFiltered(urlsModeAccept);
+  }
+
+  @Test
+  public void testModeReject() {
+    filter.setIgnoreCase(false);
+    filter.setModeAccept(false);
+    assertFiltered(urlsModeReject);
+  }
+
+  @Test
+  public void testModeAcceptIgnoreCase() {
+    filter.setIgnoreCase(true);
+    filter.setModeAccept(true);
+    assertFiltered(urlsModeAcceptIgnoreCase);
+  }
+
+  @Test
+  public void testModeRejectIgnoreCase() {
+    filter.setIgnoreCase(true);
+    filter.setModeAccept(false);
+    assertFiltered(urlsModeRejectIgnoreCase);
+  }
+
+  @Test
+  public void testModeAcceptAndNonPathFilter() {
+    filter.setModeAccept(true);
+    filter.setFilterFromPath(false);
+    assertFiltered(urlsModeAcceptAndNonPathFilter);
+  }
+
+  @Test
+  public void testModeAcceptAndPathFilter() {
+    filter.setModeAccept(true);
+    filter.setFilterFromPath(true);
+    assertFiltered(urlsModeAcceptAndPathFilter);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/build.xml b/nutch-plugins/urlfilter-validator/build.xml
new file mode 100644
index 0000000..4de9292
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-validator" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/ivy.xml b/nutch-plugins/urlfilter-validator/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/plugin.xml b/nutch-plugins/urlfilter-validator/plugin.xml
new file mode 100644
index 0000000..413b288
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-validator"
+ name="URL Validator"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-validator.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.validator"
+ name="Nutch URL Validator"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="URLValidator"
+ class="org.apache.nutch.urlfilter.validator.UrlValidator"/>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/pom.xml b/nutch-plugins/urlfilter-validator/pom.xml
new file mode 100644
index 0000000..9eaf641
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlfilter-validator</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlfilter-validator</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
new file mode 100644
index 0000000..03fca97
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.validator;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+
+/**
+ * <p>
+ * Validates URLs.
+ * </p>
+ *
+ * <p>
+ * Originally based on a php script by Debbie Dyer, validation.php v1.2b, Date:
+ * 03/07/02, http://javascript.internet.com. However, this validation now bears
+ * little resemblance to the php original.
+ * </p>
+ *
+ * <pre>
+ * Example of usage:
+ * UrlValidator urlValidator = new UrlValidator();
+ * if (urlValidator.filter("ftp://foo.bar.com/") != null) {
+ * System.out.println("url is valid");
+ * } else {
+ * System.out.println("url is invalid");
+ * }
+ *
+ * prints out "url is valid"
+ * </pre>
+ *
+ * <p>
+ * Based on UrlValidator code from Apache commons-validator.
+ * </p>
+ *
+ * @see <a href='http://www.ietf.org/rfc/rfc2396.txt' > Uniform Resource
+ * Identifiers (URI): Generic Syntax </a>
+ *
+ */
+public class UrlValidator implements URLFilter {
+
+ /** Character-class body matching the ASCII letters. */
+ private static final String ALPHA_CHARS = "a-zA-Z";
+
+ /** Character-class body matching ASCII letters plus <code>\d</code>. */
+ private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";
+
+ /** Characters that may not appear inside a host name atom (see VALID_CHARS). */
+ private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
+
+ /** Negated class: any character that is neither whitespace nor in SPECIAL_CHARS. */
+ private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";
+
+ // Scheme characters are letters only: numerics and "+-." are dropped for now.
+ private static final String SCHEME_CHARS = ALPHA_CHARS;
+
+ // Authority (host/IP) characters: letters, digits, "-" and ".".
+ // (Original note, which describes SCHEME_CHARS: Drop numeric, and "+-." for now)
+ private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
+
+ /** One or more VALID_CHARS, i.e. a single dot-free host name label. */
+ private static final String ATOM = VALID_CHARS + '+';
+
+ /**
+ * This expression derived/taken from the BNF for URI (RFC2396).
+ * Capturing groups used by this class: 2 = scheme, 4 = authority, 5 = path,
+ * 7 = query (see the PARSE_URL_* constants). Group 9 (fragment) is never read.
+ */
+ private static final Pattern URL_PATTERN = Pattern
+ .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)"
+ + "(\\?([^#]*))?(#(.*))?");
+
+ /**
+ * URL_PATTERN group holding the scheme/protocol without the trailing colon
+ * (e.g. http, ftp, file).
+ */
+ private static final int PARSE_URL_SCHEME = 2;
+
+ /**
+ * URL_PATTERN group holding the authority. Includes hostname/ip and port
+ * number.
+ */
+ private static final int PARSE_URL_AUTHORITY = 4;
+
+ /** URL_PATTERN group holding the path. */
+ private static final int PARSE_URL_PATH = 5;
+
+ /** URL_PATTERN group holding the query string without the leading "?". */
+ private static final int PARSE_URL_QUERY = 7;
+
+ /**
+ * Scheme/protocol name (e.g. http, ftp, https): one or more SCHEME_CHARS.
+ * Applied with matches(), so the whole scheme must consist of letters.
+ */
+ private static final Pattern SCHEME_PATTERN = Pattern.compile("^["
+ + SCHEME_CHARS + "]+");
+
+ /**
+ * Splits an authority into group 1 = host name or IP (AUTHORITY_CHARS only),
+ * group 2 = optional ":" followed by digits, group 3 = whatever is left over.
+ */
+ private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^(["
+ + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
+
+ /** AUTHORITY_PATTERN group holding the host name or IP address. */
+ private static final int PARSE_AUTHORITY_HOST_IP = 1;
+
+ /** AUTHORITY_PATTERN group holding the port, including its leading colon. */
+ private static final int PARSE_AUTHORITY_PORT = 2;
+
+ /**
+ * AUTHORITY_PATTERN group holding any trailing text. Should always be empty;
+ * isValidAuthority rejects the authority otherwise.
+ */
+ private static final int PARSE_AUTHORITY_EXTRA = 3;
+
+ /** Either the empty string or "/" followed by any number of the listed path characters. */
+ private static final Pattern PATH_PATTERN = Pattern
+ .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
+
+ /** Accepts any query text that "." can match; no further structure is required. */
+ private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");
+
+ /** One or more characters in the range 0x21-0x7E (printable ASCII, space excluded). */
+ private static final Pattern LEGAL_ASCII_PATTERN = Pattern
+ .compile("^[\\x21-\\x7E]+$");
+
+ /**
+ * Four dot-separated groups of one to three digits. The 0-255 range of each
+ * group is checked separately in isValidAuthority.
+ */
+ private static final Pattern IP_V4_DOMAIN_PATTERN = Pattern
+ .compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
+
+ /** One ATOM followed by any number of ".ATOM" repetitions. */
+ private static final Pattern DOMAIN_PATTERN = Pattern.compile("^" + ATOM
+ + "(\\." + ATOM + ")*$");
+
+ /** A colon followed by one to five digits; group 1 is the digits. */
+ private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");
+
+ /** Unanchored ATOM, used with find() to walk the labels of a host name. */
+ private static final Pattern ATOM_PATTERN = Pattern.compile("(" + ATOM + ")");
+
+ /** A single leading ASCII letter; applied with matches() to a one-character string. */
+ private static final Pattern ALPHA_PATTERN = Pattern.compile("^["
+ + ALPHA_CHARS + "]");
+
+ /** Configuration supplied through setConf; only stored and returned by this class. */
+ private Configuration conf;
+
+  /**
+   * URLFilter entry point: lets a URL through only when it validates.
+   *
+   * @param urlString
+   *          the URL to check; may be <code>null</code>
+   * @return <code>urlString</code> itself when isValid accepts it, otherwise
+   *         <code>null</code>
+   */
+  public String filter(String urlString) {
+    if (isValid(urlString)) {
+      return urlString;
+    }
+    return null;
+  }
+
+ /**
+ * @return the configuration last stored by setConf, or <code>null</code> if
+ * none has been set.
+ */
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /**
+ * Stores the given configuration. No properties are read from it here.
+ *
+ * @param conf
+ * the configuration to keep; returned later by getConf.
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+  /**
+   * Decides whether a string is an acceptable URL.
+   * <p>
+   * The value has to consist solely of non-space printable ASCII, has to split
+   * into URI components via URL_PATTERN, and its scheme, authority, path and
+   * query must each pass their own check. The fragment is not inspected.
+   * </p>
+   *
+   * @param value
+   *          The candidate URL. A <code>null</code> value is rejected.
+   * @return true only when every check succeeds.
+   */
+  private boolean isValid(String value) {
+    if (value == null || !LEGAL_ASCII_PATTERN.matcher(value).matches()) {
+      return false;
+    }
+
+    // Break the URL into its components, then validate them left to right;
+    // the first failing component short-circuits the rest.
+    Matcher urlParts = URL_PATTERN.matcher(value);
+    return urlParts.matches()
+        && isValidScheme(urlParts.group(PARSE_URL_SCHEME))
+        && isValidAuthority(urlParts.group(PARSE_URL_AUTHORITY))
+        && isValidPath(urlParts.group(PARSE_URL_PATH))
+        && isValidQuery(urlParts.group(PARSE_URL_QUERY));
+  }
+
+  /**
+   * Validates the scheme part of a URL. Any scheme made up entirely of the
+   * characters allowed by SCHEME_PATTERN is accepted; there is no list of
+   * permitted scheme names.
+   *
+   * @param scheme
+   *          The scheme to validate, without the trailing colon. A
+   *          <code>null</code> value is considered invalid.
+   * @return true if valid.
+   */
+  private boolean isValidScheme(String scheme) {
+    return scheme != null && SCHEME_PATTERN.matcher(scheme).matches();
+  }
+
+  /**
+   * Returns true if the authority is properly formatted. An authority is the
+   * combination of a host (IPv4 address or host name) and an optional port.
+   * <ul>
+   * <li>An IPv4 address needs four numeric groups, each at most 255.</li>
+   * <li>A host name needs at least two dot-separated labels, and its last
+   * label must be 2 to 4 characters long and start with a letter.</li>
+   * <li>A port, when present, must be 1 to 5 digits and not exceed 65535.</li>
+   * <li>Nothing may follow the host and port.</li>
+   * </ul>
+   *
+   * @param authority
+   *          Authority value to validate. A <code>null</code> value is
+   *          considered invalid.
+   * @return true if authority (hostname and port) is valid.
+   */
+  private boolean isValidAuthority(String authority) {
+    if (authority == null) {
+      return false;
+    }
+
+    Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
+    if (!authorityMatcher.matches()) {
+      return false;
+    }
+
+    // check if authority is IP address or hostname
+    String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
+    Matcher matchIPV4Pat = IP_V4_DOMAIN_PATTERN.matcher(hostIP);
+
+    if (matchIPV4Pat.matches()) {
+      // Each group is one to three ASCII digits by construction of the
+      // pattern, so parseInt cannot fail; only the upper bound needs a check.
+      for (int i = 1; i <= 4; i++) {
+        if (Integer.parseInt(matchIPV4Pat.group(i)) > 255) {
+          return false;
+        }
+      }
+    } else {
+      if (!DOMAIN_PATTERN.matcher(hostIP).matches()) {
+        // neither an IPv4 address nor a host name
+        return false;
+      }
+
+      // Walk the dot-separated labels, keeping the last one (the top level
+      // domain) and the number of labels seen. DOMAIN_PATTERN matched, so
+      // there is always at least one label.
+      String topLevel = null;
+      int segCount = 0;
+      Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP);
+      while (atomMatcher.find()) {
+        topLevel = atomMatcher.group();
+        segCount++;
+      }
+
+      // NOTE(review): the 2-4 character limit also rejects longer top level
+      // domains (e.g. "museum"); kept as is to preserve existing behaviour.
+      if (topLevel.length() < 2 || topLevel.length() > 4) {
+        return false;
+      }
+
+      // First letter of top level must be a alpha
+      if (!ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()) {
+        return false;
+      }
+
+      // Make sure there's a host name preceding the top level domain.
+      if (segCount < 2) {
+        return false;
+      }
+    }
+
+    String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
+    if (port != null) {
+      Matcher portMatcher = PORT_PATTERN.matcher(port);
+      if (!portMatcher.matches()) {
+        return false;
+      }
+      // PORT_PATTERN admits any five digit number; ports end at 65535.
+      if (Integer.parseInt(portMatcher.group(1)) > 65535) {
+        return false;
+      }
+    }
+
+    String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
+    return isBlankOrNull(extra);
+  }
+
+  /**
+   * <p>
+   * Checks whether the value is <code>null</code> or contains nothing but
+   * whitespace (i.e. is empty once trimmed).
+   * </p>
+   *
+   * @param value
+   *          The value validation is being performed on.
+   * @return true if blank or null.
+   */
+  private boolean isBlankOrNull(String value) {
+    if (value == null) {
+      return true;
+    }
+    return value.trim().isEmpty();
+  }
+
+  /**
+   * Returns true if the path is valid. The path must match PATH_PATTERN and,
+   * when it contains "..", the number of "/" minus the number of "//" minus
+   * one must be greater than the number of "..".
+   *
+   * @param path
+   *          Path value to validate. A <code>null</code> value is considered
+   *          invalid.
+   * @return true if path is valid.
+   */
+  private boolean isValidPath(String path) {
+    if (path == null || !PATH_PATTERN.matcher(path).matches()) {
+      return false;
+    }
+
+    int parentRefs = countToken("..", path);
+    if (parentRefs <= 0) {
+      // no ".." at all: nothing more to check
+      return true;
+    }
+
+    int slashes = countToken("/", path);
+    int doubleSlashes = countToken("//", path);
+    return (slashes - doubleSlashes - 1) > parentRefs;
+  }
+
+  /**
+   * Returns true if there is no query at all or the query matches
+   * QUERY_PATTERN.
+   *
+   * @param query
+   *          Query value to validate; may be <code>null</code>.
+   * @return true if query is valid.
+   */
+  private boolean isValidQuery(String query) {
+    return query == null || QUERY_PATTERN.matcher(query).matches();
+  }
+
+  /**
+   * Counts the occurrences of a token in a target string. The search resumes
+   * one character after the start of each hit, so overlapping occurrences are
+   * all counted.
+   *
+   * @param token
+   *          Token value to be counted.
+   * @param target
+   *          Target value to count tokens in.
+   * @return the number of tokens.
+   */
+  private int countToken(String token, String target) {
+    int count = 0;
+    for (int hit = target.indexOf(token, 0); hit != -1; hit = target.indexOf(
+        token, hit + 1)) {
+      count++;
+    }
+    return count;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html
new file mode 100644
index 0000000..b5ec8a1
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>URL filter plugin that validates given urls.</p>
+<p>This plugin runs a series of tests for the given url to make sure that given
+url is valid and 'fetchable'.</p>
+<p>Note: This plugin should <b>only</b> be used for web-related protocols such
+as http, https and ftp.</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java b/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
new file mode 100644
index 0000000..2e6d695
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.validator;
+
+import org.apache.nutch.urlfilter.validator.UrlValidator;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests 1. that valid urls are not filtered while invalid
+ * ones are filtered. 2. that Urls' scheme, authority, path and query are
+ * validated.
+ *
+ * @author tejasp
+ *
+ */
+
+public class TestUrlValidator {
+
+  /**
+   * Test method for
+   * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}
+   * . Invalid URLs must be filtered out (<code>null</code> result), valid
+   * ones must be returned. Every assertion message names exactly the URL that
+   * is passed to the filter.
+   */
+  @Test
+  public void testFilter() {
+    UrlValidator urlValidator = new UrlValidator();
+    Assert.assertNotNull(urlValidator);
+
+    // URLs that must be rejected
+    Assert.assertNull("Filtering on a null object should return null",
+        urlValidator.filter(null));
+    Assert.assertNull("Invalid url: example.com/file[/].html",
+        urlValidator.filter("example.com/file[/].html"));
+    Assert.assertNull("Invalid url: http://www.example.com/space here.html",
+        urlValidator.filter("http://www.example.com/space here.html"));
+    Assert.assertNull("Invalid url: /main.html",
+        urlValidator.filter("/main.html"));
+    Assert.assertNull("Invalid url: www.example.com/main.html",
+        urlValidator.filter("www.example.com/main.html"));
+    Assert.assertNull("Invalid url: ftp:www.example.com/main.html",
+        urlValidator.filter("ftp:www.example.com/main.html"));
+    Assert.assertNull(
+        "Invalid url: http://999.000.456.32/nutch/trunk/README.txt",
+        urlValidator.filter("http://999.000.456.32/nutch/trunk/README.txt"));
+    // Illegal path characters ('|' and '\') without any leading whitespace,
+    // so that the path check itself is what rejects the URL.
+    Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html",
+        urlValidator.filter("http://www.example.com/ma|in\\toc.html"));
+    // The original input, kept as an explicit leading-whitespace case.
+    Assert.assertNull(
+        "Invalid url (leading space):  http://www.example.com/ma|in\\toc.html",
+        urlValidator.filter(" http://www.example.com/ma|in\\toc.html"));
+
+    // URLs that must be accepted
+    Assert.assertNotNull(
+        "Valid url: https://issues.apache.org/jira/NUTCH-1127",
+        urlValidator.filter("https://issues.apache.org/jira/NUTCH-1127"));
+    Assert
+        .assertNotNull(
+            "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather",
+            urlValidator
+                .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather"));
+    Assert
+        .assertNotNull(
+            "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
+            urlValidator
+                .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
+    Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/mike/check/plan.pdf",
+        urlValidator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
+
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/build.xml b/nutch-plugins/urlmeta/build.xml
new file mode 100644
index 0000000..ed8d9c9
--- /dev/null
+++ b/nutch-plugins/urlmeta/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlmeta" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/ivy.xml b/nutch-plugins/urlmeta/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/urlmeta/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/plugin.xml b/nutch-plugins/urlmeta/plugin.xml
new file mode 100644
index 0000000..c31adf6
--- /dev/null
+++ b/nutch-plugins/urlmeta/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlmeta"
+ name="URL Meta Indexing Filter"
+ version="1.0.0"
+ provider-name="sgonyea">
+
+
+ <runtime>
+ <library name="urlmeta.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.urlmeta"
+ name="URL Meta Indexing Filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="indexer-urlmeta"
+ class="org.apache.nutch.indexer.urlmeta.URLMetaIndexingFilter"/>
+ </extension>
+ <extension id="org.apache.nutch.scoring.urlmeta"
+ name="URL Meta Scoring Filter"
+ point="org.apache.nutch.scoring.ScoringFilter">
+ <implementation id="scoring-urlmeta"
+ class="org.apache.nutch.scoring.urlmeta.URLMetaScoringFilter" />
+ </extension>
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/pom.xml b/nutch-plugins/urlmeta/pom.xml
new file mode 100644
index 0000000..cba0b62
--- /dev/null
+++ b/nutch-plugins/urlmeta/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlmeta</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlmeta</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
new file mode 100644
index 0000000..dc673a2
--- /dev/null
+++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.urlmeta;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * This is part of the URL Meta plugin. It is designed to enhance the NUTCH-655
+ * patch, by doing two things: 1. Meta Tags that are supplied with your Crawl
+ * URLs, during injection, will be propagated throughout the outlinks of those
+ * Crawl URLs. 2. When you index your URLs, the meta tags that you specified
+ * with your URLs will be indexed alongside those URLs--and can be directly
+ * queried, assuming you have done everything else correctly.
+ *
+ * The flat-file of URLs you are injecting should, per NUTCH-655, be
+ * tab-delimited in the form of:
+ *
+ * [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN]
+ *
+ * Be aware that if you collide with keywords that are already in use (such as
+ * nutch.score/nutch.fetchInterval) then you are in for some unpredictable
+ * behavior.
+ *
+ * Furthermore, in your nutch-site.xml config, you must specify that this plugin
+ * is to be used (1), as well as what (2) Meta Tags it should actively look for.
+ * This does not mean that you must use these tags for every URL, but it does
+ * mean that you must list _all_ of meta tags that you have specified. If you
+ * want them to be propagated and indexed, that is.
+ *
+ * 1. As of Nutch 1.2, the property "plugin.includes" looks as follows:
+ * <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index
+ * -(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic
+ * |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change
+ * "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to call
+ * this plugin.
+ *
+ * 2. You must also specify the property "urlmeta.tags", whose values are
+ * comma-delimited <value>key1, key2, key3</value>
+ *
+ * TODO: It may be ideal to offer two separate properties, to specify what gets
+ * indexed versus merely propagated.
+ *
+ */
+public class URLMetaIndexingFilter implements IndexingFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(URLMetaIndexingFilter.class);
+  private static final String CONF_PROPERTY = "urlmeta.tags";
+
+  /**
+   * Tag names read from the "urlmeta.tags" property in setConf. Held per
+   * instance so that filters built from different configurations cannot
+   * overwrite each other's tags; <code>null</code> until a configuration that
+   * defines the property has been set.
+   */
+  private String[] urlMetaTags;
+  private Configuration conf;
+
+  /**
+   * This will take the metatags that you have listed in your "urlmeta.tags"
+   * property, and looks for them inside the CrawlDatum object. If they exist,
+   * this will add it as an attribute inside the NutchDocument.
+   *
+   * The tag list is the one parsed by the last setConf call; it is not re-read
+   * from the configuration for every document.
+   *
+   * @param doc
+   *          the document being built; returned unchanged when it is
+   *          <code>null</code> or when no tags are configured
+   * @param datum
+   *          the crawl datum whose metadata is searched for the tags
+   * @return <code>doc</code>, with one field added per configured tag that has
+   *         a value in the datum's metadata
+   * @see IndexingFilter#filter
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    if (urlMetaTags == null || doc == null)
+      return doc;
+
+    for (String metatag : urlMetaTags) {
+      Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
+
+      if (metadata != null)
+        doc.add(metatag, metadata.toString());
+    }
+
+    return doc;
+  }
+
+  /** Boilerplate */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * handles conf assignment and pulls the value assignment from the
+   * "urlmeta.tags" property. Passing <code>null</code> clears the stored
+   * configuration but leaves previously parsed tags untouched.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    if (conf == null)
+      return;
+
+    urlMetaTags = conf.getStrings(CONF_PROPERTY);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html
new file mode 100644
index 0000000..5da5d56
--- /dev/null
+++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html
@@ -0,0 +1,12 @@
+<html>
+ <body>
+ <p>
+ URL Meta Tag Indexing Plugin
+ </p>
+ <p>
+ Takes Meta Tags, injected alongside a URL (see NUTCH-655) and specified in the "urlmeta.tags" property,
+ and inserts them into the document--which is then sent to the Indexer. If you specify these fields in
+ Nutch's schema (as well as the Indexer's), you can reasonably assume that they will be indexed.
+ </p>
+ </body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
new file mode 100644
index 0000000..3965e42
--- /dev/null
+++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.urlmeta;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+import java.util.Iterator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * For documentation:
+ *
+ * @see URLMetaIndexingFilter
+ */
+public class URLMetaScoringFilter extends Configured implements ScoringFilter {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(URLMetaScoringFilter.class);
+ private static final String CONF_PROPERTY = "urlmeta.tags";
+ private static String[] urlMetaTags;
+ private Configuration conf;
+
+ /**
+ * This will take the metatags that you have listed in your "urlmeta.tags"
+ * property, and looks for them inside the parseData object. If they exist,
+ * this will be propagated into your 'targets' Collection's ["outlinks"]
+ * attributes.
+ *
+ * @see ScoringFilter#distributeScoreToOutlinks
+ */
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) throws ScoringFilterException {
+ if (urlMetaTags == null || targets == null || parseData == null)
+ return adjust;
+
+ Iterator<Entry<Text, CrawlDatum>> targetIterator = targets.iterator();
+
+ while (targetIterator.hasNext()) {
+ Entry<Text, CrawlDatum> nextTarget = targetIterator.next();
+
+ for (String metatag : urlMetaTags) {
+ String metaFromParse = parseData.getMeta(metatag);
+
+ if (metaFromParse == null)
+ continue;
+
+ nextTarget.getValue().getMetaData()
+ .put(new Text(metatag), new Text(metaFromParse));
+ }
+ }
+ return adjust;
+ }
+
+ /**
+ * Takes the metadata, specified in your "urlmeta.tags" property, from the
+ * datum object and injects it into the content. This is transfered to the
+ * parseData object.
+ *
+ * @see ScoringFilter#passScoreBeforeParsing
+ * @see URLMetaScoringFilter#passScoreAfterParsing
+ */
+ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
+ if (urlMetaTags == null || content == null || datum == null)
+ return;
+
+ for (String metatag : urlMetaTags) {
+ Text metaFromDatum = (Text) datum.getMetaData().get(new Text(metatag));
+
+ if (metaFromDatum == null)
+ continue;
+
+ content.getMetadata().set(metatag, metaFromDatum.toString());
+ }
+ }
+
+ /**
+ * Takes the metadata, which was lumped inside the content, and replicates it
+ * within your parse data.
+ *
+ * @see URLMetaScoringFilter#passScoreBeforeParsing
+ * @see ScoringFilter#passScoreAfterParsing
+ */
+ public void passScoreAfterParsing(Text url, Content content, Parse parse) {
+ if (urlMetaTags == null || content == null || parse == null)
+ return;
+
+ for (String metatag : urlMetaTags) {
+ String metaFromContent = content.getMetadata().get(metatag);
+
+ if (metaFromContent == null)
+ continue;
+
+ parse.getData().getParseMeta().set(metatag, metaFromContent);
+ }
+ }
+
+ /** Boilerplate */
+ public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+ throws ScoringFilterException {
+ return initSort;
+ }
+
+ /** Boilerplate */
+ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+ CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+ throws ScoringFilterException {
+ return initScore;
+ }
+
+ /** Boilerplate */
+ public void initialScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
+ return;
+ }
+
+ /** Boilerplate */
+ public void injectedScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
+ return;
+ }
+
+ /** Boilerplate */
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+ List<CrawlDatum> inlinked) throws ScoringFilterException {
+ return;
+ }
+
+ /**
+ * handles conf assignment and pulls the value assignment from the
+ * "urlmeta.tags" property
+ */
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+
+ if (conf == null)
+ return;
+
+ urlMetaTags = conf.getStrings(CONF_PROPERTY);
+ }
+
+ /** Boilerplate */
+ public Configuration getConf() {
+ return conf;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html
new file mode 100644
index 0000000..5bba7a8
--- /dev/null
+++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html
@@ -0,0 +1,11 @@
+<html>
+ <body>
+ <p>
+ URL Meta Tag Scoring Plugin
+ </p>
+ <p>
+ Propagates Meta Tags, injected alongside a URL (see NUTCH-655) and specified in the "urlmeta.tags" property,
+ along to their outlinks. This does not actually perform scoring.
+ </p>
+ </body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/build.xml b/nutch-plugins/urlnormalizer-ajax/build.xml
new file mode 100644
index 0000000..e100f8a
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-ajax" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/ivy.xml b/nutch-plugins/urlnormalizer-ajax/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/plugin.xml b/nutch-plugins/urlnormalizer-ajax/plugin.xml
new file mode 100644
index 0000000..ad8c72c
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlnormalizer-ajax"
+ name="AJAX URL Normalizer"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlnormalizer-ajax.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlnormalizer.ajax"
+ name="Nutch AJAX URL Normalizer"
+ point="org.apache.nutch.net.URLNormalizer">
+ <implementation id="AjaxURLNormalizer"
+ class="org.apache.nutch.net.urlnormalizer.ajax.AjaxURLNormalizer"/>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/pom.xml b/nutch-plugins/urlnormalizer-ajax/pom.xml
new file mode 100644
index 0000000..e32d952
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlnormalizer-ajax</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlnormalizer-ajax</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
new file mode 100644
index 0000000..5286f6f
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
@@ -0,0 +1,236 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.ajax;
+
+import java.net.URL;
+import java.net.URI;
+import java.net.URLEncoder;
+import java.net.URLDecoder;
+import java.net.MalformedURLException;
+import java.nio.charset.Charset;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * URLNormalizer capable of dealing with AJAX URL's.
+ *
+ * Use the following regex filter to prevent escaped fragments from being fetched.
+ * ^(.*)\?.*_escaped_fragment_
+ */
+public class AjaxURLNormalizer implements URLNormalizer {
+  public static final Logger LOG = LoggerFactory.getLogger(AjaxURLNormalizer.class);
+
+  public static String AJAX_URL_PART = "#!";
+  public static String ESCAPED_URL_PART = "_escaped_fragment_=";
+
+  /** Upper-case hex digits used when percent-encoding a byte. */
+  private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();
+
+  private Configuration conf;
+  private final Charset utf8;
+
+  /**
+   * Default constructor.
+   */
+  public AjaxURLNormalizer() {
+    utf8 = Charset.forName("UTF-8");
+  }
+
+  /**
+   * Attempts to normalize the input URL string.
+   *
+   * In the indexer scope, a URL carrying an _escaped_fragment_ parameter is
+   * turned back into its #! form. In every other case a URL containing #! is
+   * turned into its _escaped_fragment_ form. Anything else is returned
+   * verbatim.
+   *
+   * @param urlString
+   *          the URL to normalize; null is returned as null
+   * @param scope
+   *          the normalization scope; may be null
+   * @return the normalized URL, or the input when nothing applies
+   * @throws MalformedURLException
+   *           if a URL that needs rewriting cannot be parsed
+   */
+  public String normalize(String urlString, String scope) throws MalformedURLException {
+    if (urlString == null) {
+      return null;
+    }
+
+    // Per-URL tracing belongs at debug level; parameterized so no string is
+    // built when debug is off.
+    LOG.debug("{} // {}", scope, urlString);
+
+    // When indexing, transform _escaped_fragment_ URL's to their #! counterpart.
+    // Constant-first equals keeps a null scope from throwing.
+    if (URLNormalizers.SCOPE_INDEXER.equals(scope) && urlString.contains(ESCAPED_URL_PART)) {
+      return normalizeEscapedFragment(urlString);
+    }
+
+    // Otherwise transform #! URL's to their _escaped_fragment_ counterpart
+    if (urlString.contains(AJAX_URL_PART)) {
+      // Normalize once and reuse the result for both the log and the return.
+      String normalized = normalizeHashedFragment(urlString);
+      LOG.debug("{} // {}", scope, normalized);
+      return normalized;
+    }
+
+    // Nothing to normalize here, return verbatim
+    return urlString;
+  }
+
+  /**
+   * Returns a normalized input URL. #! querystrings are transformed
+   * to a _escaped_fragment_ form.
+   *
+   * @param urlString
+   *          the URL to rewrite
+   * @return the URL with everything after the first #! moved into an
+   *         _escaped_fragment_ query parameter, or the input unchanged when
+   *         it contains no #!
+   * @throws MalformedURLException
+   *           if the URL cannot be parsed
+   */
+  protected String normalizeHashedFragment(String urlString) throws MalformedURLException {
+    int pos = urlString.indexOf(AJAX_URL_PART);
+    if (pos < 0) {
+      return urlString;
+    }
+
+    URL u = new URL(urlString);
+    StringBuilder sb = new StringBuilder(urlString.substring(0, pos));
+
+    // Get the escaped fragment
+    String escapedFragment = escape(urlString.substring(pos + AJAX_URL_PART.length()));
+
+    // Check if we already have a query in the URL
+    sb.append(u.getQuery() == null ? '?' : '&');
+
+    // Append the escaped fragment key and the value
+    sb.append(ESCAPED_URL_PART);
+    sb.append(escapedFragment);
+
+    return sb.toString();
+  }
+
+  /**
+   * Returns a normalized input URL. _escaped_fragment_ querystrings are
+   * transformed to a #! form.
+   *
+   * The _escaped_fragment_ parameter is located by position inside the query
+   * string (at its start or right after an &amp;), its value runs up to the
+   * next &amp; or the end of the query, and all other parameters are kept in
+   * their original order. No regular expressions are involved, so characters
+   * such as ( + ? in the URL cannot break the rewrite.
+   *
+   * @param urlString
+   *          the URL to rewrite
+   * @return the URL in #! form, or the input unchanged when its query string
+   *         holds no _escaped_fragment_ parameter
+   * @throws MalformedURLException
+   *           if the URL cannot be parsed
+   */
+  protected String normalizeEscapedFragment(String urlString) throws MalformedURLException {
+    URL u = new URL(urlString);
+
+    // Get the query string; null when the URL has none
+    String queryString = u.getQuery();
+
+    int keyStart = indexOfEscapedFragmentKey(queryString);
+    if (keyStart < 0) {
+      // The marker text is somewhere else in the URL (path, fragment, part of
+      // another parameter name): nothing to rewrite.
+      return urlString;
+    }
+
+    int valueStart = keyStart + ESCAPED_URL_PART.length();
+    int valueEnd = queryString.indexOf('&', valueStart);
+    if (valueEnd < 0) {
+      valueEnd = queryString.length();
+    }
+
+    // Rebuild the query string without the escaped fragment pair
+    StringBuilder remainingQuery = new StringBuilder();
+    if (keyStart > 0) {
+      // keyStart - 1 is the '&' that separated the pair from what precedes it
+      remainingQuery.append(queryString, 0, keyStart - 1);
+    }
+    if (valueEnd < queryString.length()) {
+      if (remainingQuery.length() > 0) {
+        remainingQuery.append('&');
+      }
+      remainingQuery.append(queryString, valueEnd + 1, queryString.length());
+    }
+
+    // Get the fragment unescaped
+    String unescapedFragment = unescape(queryString.substring(valueStart, valueEnd));
+
+    // Write the URL without query string, we'll handle that next
+    StringBuilder sb = new StringBuilder();
+    sb.append(u.getProtocol());
+    sb.append("://");
+    sb.append(u.getHost());
+    if (u.getPort() != -1) {
+      sb.append(":");
+      sb.append(u.getPort());
+    }
+    sb.append(u.getPath());
+
+    // Append a possible query string, without original escaped fragment
+    if (remainingQuery.length() > 0) {
+      sb.append("?");
+      sb.append(remainingQuery);
+    }
+
+    // Append the fragment delimiter and the unescaped fragment
+    sb.append(AJAX_URL_PART);
+    sb.append(unescapedFragment);
+
+    return sb.toString();
+  }
+
+  /**
+   * Finds where the _escaped_fragment_ key starts as a real query parameter,
+   * i.e. at index 0 or directly after an &amp;.
+   *
+   * @param queryString
+   *          the query string to search; may be null
+   * @return the start index of the key, or -1 when it is not a parameter
+   */
+  private static int indexOfEscapedFragmentKey(String queryString) {
+    if (queryString == null) {
+      return -1;
+    }
+
+    int idx = queryString.indexOf(ESCAPED_URL_PART);
+    // Skip hits that are merely the tail of some other parameter name/value
+    while (idx > 0 && queryString.charAt(idx - 1) != '&') {
+      idx = queryString.indexOf(ESCAPED_URL_PART, idx + 1);
+    }
+    return idx;
+  }
+
+  /**
+   * Unescape some exotic characters in the fragment part
+   *
+   * @param fragmentPart
+   *          the percent-encoded fragment; null is returned as null
+   * @return the decoded fragment, or the input unchanged when it cannot be
+   *         decoded
+   */
+  protected String unescape(String fragmentPart) {
+    if (fragmentPart == null) {
+      return null;
+    }
+
+    try {
+      return URLDecoder.decode(fragmentPart, "UTF-8");
+    } catch (java.io.UnsupportedEncodingException e) {
+      // UTF-8 is a charset every JVM must provide, so this is not expected.
+      LOG.warn("UTF-8 not supported, leaving fragment undecoded", e);
+    } catch (IllegalArgumentException e) {
+      // Malformed %-sequence: best effort, keep the fragment as it is.
+      LOG.debug("Cannot decode fragment '{}': {}", fragmentPart, e.getMessage());
+    }
+
+    return fragmentPart;
+  }
+
+  /**
+   * Escape some exotic characters in the fragment part
+   *
+   * Works on the UTF-8 bytes of the input. Bytes 0x00-0x20 and 0x7F-0xFF as
+   * well as '#', '%', '&amp;' and '+' are written as %XX with upper-case hex
+   * digits; every other byte is copied as the ASCII character it represents.
+   *
+   * @param fragmentPart
+   *          the raw fragment, must not be null
+   * @return the escaped fragment
+   */
+  protected String escape(String fragmentPart) {
+    StringBuilder sb = new StringBuilder(fragmentPart.length());
+
+    for (byte b : fragmentPart.getBytes(utf8)) {
+      // Work on the unsigned value so the range checks read naturally
+      int c = b & 0xFF;
+
+      if (c <= 0x20 || c >= 0x7F || c == '#' || c == '%' || c == '&' || c == '+') {
+        sb.append('%');
+        sb.append(HEX_DIGITS[c >> 4]);
+        sb.append(HEX_DIGITS[c & 0x0F]);
+      } else {
+        sb.append((char) c);
+      }
+    }
+
+    return sb.toString();
+  }
+
+  /**
+   * @param conf
+   *          the configuration to store
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * @return the configuration last passed to {@link #setConf(Configuration)}
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}
\ No newline at end of file