You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/12 12:33:19 UTC
svn commit: r1349236 - in /nutch/trunk: ./ conf/
src/plugin/urlnormalizer-host/ src/plugin/urlnormalizer-host/data/
src/plugin/urlnormalizer-host/src/ src/plugin/urlnormalizer-host/src/java/
src/plugin/urlnormalizer-host/src/java/org/ src/plugin/urlnor...
Author: markus
Date: Tue Jun 12 10:33:18 2012
New Revision: 1349236
URL: http://svn.apache.org/viewvc?rev=1349236&view=rev
Log:
NUTCH-1319 HostNormalizer plugin
Added:
nutch/trunk/conf/host-urlnormalizer.txt
nutch/trunk/src/plugin/urlnormalizer-host/
nutch/trunk/src/plugin/urlnormalizer-host/build.xml
nutch/trunk/src/plugin/urlnormalizer-host/data/
nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt
nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml
nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml
nutch/trunk/src/plugin/urlnormalizer-host/src/
nutch/trunk/src/plugin/urlnormalizer-host/src/java/
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-host/src/test/
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
Modified:
nutch/trunk/CHANGES.txt
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1349236&r1=1349235&r2=1349236&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun 12 10:33:18 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1319 HostNormalizer plugin (markus)
+
* NUTCH-1386 Headings filter not to add empty values (markus)
* NUTCH-1356 ParseUtil use ExecutorService instead of manually thread handling (ferdy via markus)
Added: nutch/trunk/conf/host-urlnormalizer.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/host-urlnormalizer.txt?rev=1349236&view=auto
==============================================================================
--- nutch/trunk/conf/host-urlnormalizer.txt (added)
+++ nutch/trunk/conf/host-urlnormalizer.txt Tue Jun 12 10:33:18 2012
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Newline-separated list of hosts mapped to their desired targets.
+# Wildcard hosts are supported. Format: host target
+
+# Map www.apache.org to apache.org
+www.apache.org apache.org
+
+# Map all example.org subdomains to example.org
+*.example.org example.org
Added: nutch/trunk/src/plugin/urlnormalizer-host/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/build.xml?rev=1349236&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-host/build.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-host/build.xml Tue Jun 12 10:33:18 2012
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-host" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="data" />
+ </copy>
+</project>
Added: nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt?rev=1349236&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt (added)
+++ nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt Tue Jun 12 10:33:18 2012
@@ -0,0 +1,8 @@
+# Map all sub domains of example.com to example.com
+*.example.com example.com
+
+# Strip the www. sub domain from www.example.net URLs
+www.example.net example.net
+
+# Force www. sub domain when hitting link without sub domain
+example.org www.example.org
\ No newline at end of file
Added: nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml?rev=1349236&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml Tue Jun 12 10:33:18 2012
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml?rev=1349236&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml Tue Jun 12 10:33:18 2012
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlnormalizer-host"
+ name="Host URL Normalizer"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlnormalizer-host.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlnormalizer.host"
+ name="Nutch Host URL Normalizer"
+ point="org.apache.nutch.net.URLNormalizer">
+ <implementation id="HostURLNormalizer"
+ class="org.apache.nutch.net.urlnormalizer.host.HostURLNormalizer">
+ <parameter name="file" value="host-urlnormalizer.txt"/>
+ </implementation>
+ </extension>
+
+</plugin>
Added: nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java?rev=1349236&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java (added)
+++ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java Tue Jun 12 10:33:18 2012
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.host;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashMap;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * URL normalizer for mapping hosts to their desired form. It takes
+ * a simple text file as source in the format:
+ *
+ * example.org www.example.org
+ *
+ * mapping all URL's of example.org the the www sub-domain. It also
+ * allows for wildcards to be used to map all sub-domains to another
+ * host:
+ *
+ * *.example.org www.example.org
+ */
+public class HostURLNormalizer implements URLNormalizer {
+
+ private Configuration conf;
+
+ private static final Logger LOG = LoggerFactory.getLogger(HostURLNormalizer.class);
+
+ private static String attributeFile = null;
+ private String hostsFile = null;
+ private static final HashMap<String,String> hostsMap = new HashMap<String,String>();
+
+ public HostURLNormalizer() {}
+
+ public HostURLNormalizer(String hostsFile) {
+ this.hostsFile = hostsFile;
+ }
+
+ private synchronized void readConfiguration(Reader configReader) throws IOException {
+ if (hostsMap.size() > 0) {
+ return;
+ }
+
+ BufferedReader reader = new BufferedReader(configReader);
+ String line, host, target;
+ int delimiterIndex;
+
+ while ((line = reader.readLine()) != null) {
+ if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+ line.trim();
+ delimiterIndex = line.indexOf(" ");
+
+ host = line.substring(0, delimiterIndex);
+ target = line.substring(delimiterIndex + 1);
+ hostsMap.put(host, target);
+ }
+ }
+ }
+
+ public Configuration getConf() {
+ return conf;
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+
+ // get the extensions for domain urlfilter
+ String pluginName = "urlnormalizer-host";
+ Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+ URLNormalizer.class.getName()).getExtensions();
+ for (int i = 0; i < extensions.length; i++) {
+ Extension extension = extensions[i];
+ if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+ attributeFile = extension.getAttribute("file");
+ break;
+ }
+ }
+
+ // handle blank non empty input
+ if (attributeFile != null && attributeFile.trim().equals("")) {
+ attributeFile = null;
+ }
+
+ if (attributeFile != null) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ + " as " + attributeFile);
+ }
+ }
+ else {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+ + pluginName);
+ }
+ }
+
+ // domain file and attribute "file" take precedence if defined
+ String file = conf.get("urlnormalizer.hosts.file");
+ String stringRules = conf.get("urlnormalizer.hosts.rules");
+ if (hostsFile != null) {
+ file = hostsFile;
+ }
+ else if (attributeFile != null) {
+ file = attributeFile;
+ }
+ Reader reader = null;
+ if (stringRules != null) { // takes precedence over files
+ reader = new StringReader(stringRules);
+ } else {
+ reader = conf.getConfResourceAsReader(file);
+ }
+ try {
+ if (reader == null) {
+ reader = new FileReader(file);
+ }
+ readConfiguration(reader);
+ }
+ catch (IOException e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ }
+
+ public String normalize(String urlString, String scope) throws MalformedURLException {
+ String host = new URL(urlString).getHost();
+
+ // Test static hosts
+ if (hostsMap.containsKey(host)) {
+ return replaceHost(urlString, host, hostsMap.get(host));
+ }
+
+ // Test for wildcard in reverse order
+ String[] hostParts = host.split("\\.");
+
+ // Use a buffer for our host parts
+ StringBuilder hostBuffer = new StringBuilder();
+
+ // This is our temp buffer keeping host parts with a wildcard
+ String wildCardHost = new String();
+
+ // Add the tld to the buffer
+ hostBuffer.append(hostParts[hostParts.length -1]);
+
+ for (int i = hostParts.length - 2; i > 0; i--) {
+ // Prepend another sub domain
+ hostBuffer.insert(0, hostParts[i] + ".");
+
+ // Make a wildcarded sub domain
+ wildCardHost = "*." + hostBuffer.toString();
+
+ // Check if this wildcard sub domain exists
+ if (hostsMap.containsKey(wildCardHost)) {
+ // Replace the original input host with the wildard replaced
+ return replaceHost(urlString, host, hostsMap.get(wildCardHost));
+ }
+ }
+
+ return urlString;
+ }
+
+ protected String replaceHost(String urlString, String host, String target) {
+ int hostIndex = urlString.indexOf(host);
+
+ StringBuilder buffer = new StringBuilder();
+
+ buffer.append(urlString.substring(0, hostIndex));
+ buffer.append(target);
+ buffer.append(urlString.substring(hostIndex + host.length()));
+
+ return buffer.toString();
+ }
+
+}
Added: nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java?rev=1349236&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java (added)
+++ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java Tue Jun 12 10:33:18 2012
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.host;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestHostURLNormalizer extends TestCase {
+
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ public void testHostURLNormalizer() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+
+ String hostsFile = SAMPLES + SEPARATOR + "hosts.txt";
+ HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile);
+ normalizer.setConf(conf);
+
+ // Force www. sub domain when hitting link without sub domain
+ assertEquals("http://www.example.org/page.html", normalizer.normalize("http://example.org/page.html", URLNormalizers.SCOPE_DEFAULT));
+
+ // Force no sub domain to www. URL's
+ assertEquals("http://example.net/path/to/something.html", normalizer.normalize("http://www.example.net/path/to/something.html", URLNormalizers.SCOPE_DEFAULT));
+
+ // Force all sub domains to www.
+ assertEquals("http://example.com/?does=it&still=work", normalizer.normalize("http://example.com/?does=it&still=work", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com/buh", normalizer.normalize("http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com/blaat", normalizer.normalize("http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT));
+ }
+}