You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:11 UTC
[27/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
new file mode 100644
index 0000000..13064eb
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLFilter;
+import org.apache.xerces.util.DOMUtil;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * A Subcollection represents a subset of the index. Url patterns, kept in a
+ * whitelist and a blacklist, indicate whether a particular page (url) is part
+ * of the Subcollection.
+ */
+public class Subcollection extends Configured implements URLFilter {
+
+  public static final String TAG_COLLECTIONS = "subcollections";
+  public static final String TAG_COLLECTION = "subcollection";
+  public static final String TAG_WHITELIST = "whitelist";
+  public static final String TAG_BLACKLIST = "blacklist";
+  public static final String TAG_NAME = "name";
+  public static final String TAG_KEY = "key";
+  public static final String TAG_ID = "id";
+
+  /** Patterns that reject a url */
+  List<String> blackList = new ArrayList<String>();
+  /** Patterns that accept a url */
+  List<String> whiteList = new ArrayList<String>();
+
+  /** SubCollection identifier */
+  String id;
+
+  /** SubCollection key */
+  String key;
+
+  /** SubCollection name */
+  String name;
+
+  /** SubCollection whitelist as String */
+  String wlString;
+
+  /** SubCollection blacklist as String */
+  String blString;
+
+  /**
+   * Creates a subcollection that has no key; delegates to the four-argument
+   * constructor with a null key.
+   *
+   * @param id id of the subcollection
+   * @param name name of the subcollection
+   * @param conf configuration handed to the {@link Configured} base class
+   */
+  public Subcollection(String id, String name, Configuration conf) {
+    this(id, name, null, conf);
+  }
+
+  /**
+   * Creates a subcollection with id, name and key.
+   *
+   * @param id id of the subcollection
+   * @param name name of the subcollection
+   * @param key key of the subcollection, may be null
+   * @param conf configuration handed to the {@link Configured} base class
+   */
+  public Subcollection(String id, String name, String key, Configuration conf) {
+    this(conf);
+    this.id = id;
+    this.key = key;
+    this.name = name;
+  }
+
+  /** Creates an empty subcollection, e.g. to be filled by initialize. */
+  public Subcollection(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * @return Returns the name
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @return Returns the key
+   */
+  public String getKey() {
+    return key;
+  }
+
+  /**
+   * @return Returns the id
+   */
+  public String getId() {
+    return id;
+  }
+
+  /**
+   * Returns whitelist
+   *
+   * @return Whitelist entries (the internal list, not a copy)
+   */
+  public List<String> getWhiteList() {
+    return whiteList;
+  }
+
+  /**
+   * Returns whitelist String
+   *
+   * @return Whitelist String
+   */
+  public String getWhiteListString() {
+    return wlString;
+  }
+
+  /**
+   * Returns blacklist String
+   *
+   * @return Blacklist String
+   */
+  public String getBlackListString() {
+    return blString;
+  }
+
+  /**
+   * Replaces the whitelist entries; the whitelist String is left as it is.
+   *
+   * @param whiteList The whiteList to set.
+   */
+  public void setWhiteList(ArrayList<String> whiteList) {
+    this.whiteList = whiteList;
+  }
+
+  /**
+   * Simple substring filter for matching patterns.
+   *
+   * <pre>
+   * rules for evaluation are as follows:
+   * 1. if a blacklist pattern occurs in the url, the url is rejected
+   * 2. if a whitelist pattern occurs in the url, the url is allowed
+   * 3. otherwise the url is rejected
+   * </pre>
+   *
+   * @param urlString url to test, must not be null
+   * @return urlString when it is allowed, null when it is rejected
+   * @see org.apache.nutch.net.URLFilter#filter(java.lang.String)
+   */
+  public String filter(String urlString) {
+    // the blacklist wins: any hit rejects the url
+    for (String pattern : blackList) {
+      if (urlString.contains(pattern)) {
+        return null;
+      }
+    }
+
+    // otherwise a whitelist hit accepts the url
+    for (String pattern : whiteList) {
+      if (urlString.contains(pattern)) {
+        return urlString;
+      }
+    }
+
+    // matched neither list
+    return null;
+  }
+
+  /**
+   * Initializes this subcollection from a DOM element. The first id, name and
+   * whitelist elements below it are read; blacklist and key are optional.
+   *
+   * @param collection the subcollection element to read from
+   */
+  public void initialize(Element collection) {
+    this.id = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_ID).item(0)).trim();
+    this.name = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_NAME).item(0)).trim();
+    this.wlString = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim();
+
+    parseList(this.whiteList, wlString);
+
+    // the blacklist is optional
+    NodeList nodeList = collection.getElementsByTagName(TAG_BLACKLIST);
+    if (nodeList.getLength() > 0) {
+      this.blString = DOMUtil.getChildText(nodeList.item(0)).trim();
+      parseList(this.blackList, blString);
+    }
+
+    // the key is optional; it is only taken when given exactly once
+    nodeList = collection.getElementsByTagName(TAG_KEY);
+    if (nodeList.getLength() == 1) {
+      this.key = DOMUtil.getChildText(nodeList.item(0)).trim();
+    }
+  }
+
+  /**
+   * Fills a list with the patterns found in a chunk of text. Patterns are
+   * separated by newlines; each one is trimmed and blank lines are skipped.
+   *
+   * @param list list to fill, its previous content is dropped
+   * @param text newline separated patterns, null is treated as no patterns
+   */
+  protected void parseList(List<String> list, String text) {
+    list.clear();
+    if (text == null) {
+      return;
+    }
+    StringTokenizer st = new StringTokenizer(text, "\n\r");
+    while (st.hasMoreTokens()) {
+      String line = st.nextToken().trim();
+      // an empty pattern is contained in every url and would match them all
+      if (!line.isEmpty()) {
+        list.add(line);
+      }
+    }
+  }
+
+  /**
+   * Sets the blacklist from a String and keeps the String itself as well.
+   *
+   * @param list newline separated blacklist patterns
+   */
+  public void setBlackList(String list) {
+    this.blString = list;
+    parseList(blackList, list);
+  }
+
+  /**
+   * Sets the whitelist from a String and keeps the String itself as well.
+   *
+   * @param list newline separated whitelist patterns
+   */
+  public void setWhiteList(String list) {
+    this.wlString = list;
+    parseList(whiteList, list);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
new file mode 100644
index 0000000..be08d1c
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
@@ -0,0 +1,36 @@
+<html>
+<body>
+<p>
+Subcollection is a subset of an index. Subcollections are defined
+by urlpatterns in form of white/blacklist. So to get the page into
+subcollection it must match the whitelist and not the blacklist.
+</p>
+<p>
+Subcollection definitions are read from a file subcollections.xml
+and the format is as follows (imagine here that you are crawling all
+the virtualhosts from apache.org and you want to tag pages with
+url pattern "http://lucene.apache.org/nutch" and http://wiki.apache.org/nutch/
+to be part of subcollection "nutch", this allows you to later search
+specifically from this subcollection)
+</p>
+<p/>
+<p/>
+<pre>
+&lt;?xml version="1.0" encoding="UTF-8"?&gt;
+&lt;subcollections&gt;
+ &lt;subcollection&gt;
+ &lt;name&gt;nutch&lt;/name&gt;
+ &lt;id&gt;lucene&lt;/id&gt;
+ &lt;whitelist&gt;http://lucene.apache.org/nutch
+http://wiki.apache.org/nutch/&lt;/whitelist&gt;
+ &lt;blacklist /&gt;
+ &lt;/subcollection&gt;
+&lt;/subcollections&gt;
+</pre>
+</p>
+<p>Despite this configuration you can still crawl any urls
+as long as they pass through your global url filters. (note that
+you must also seed your urls in normal nutch way)
+</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
new file mode 100644
index 0000000..2946d9e
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.subcollection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.collection.CollectionManager;
+import org.apache.nutch.collection.Subcollection;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+/**
+ * Indexing filter that marks a document with the name of every subcollection
+ * its url belongs to.
+ */
+public class SubcollectionIndexingFilter extends Configured implements
+    IndexingFilter {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SubcollectionIndexingFilter.class);
+
+  /** Doc field name, used for subcollections that define no key */
+  public static String fieldName = "subcollection";
+
+  private Configuration conf;
+
+  public SubcollectionIndexingFilter() {
+    super(NutchConfiguration.create());
+  }
+
+  public SubcollectionIndexingFilter(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * Stores the configuration and takes the default field name from the
+   * property "subcollection.default.fieldname".
+   *
+   * @param conf configuration to use; with null the field name is kept
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    if (conf != null) {
+      fieldName = conf.get("subcollection.default.fieldname", "subcollection");
+    }
+  }
+
+  /**
+   * @return the configuration given to {@link #setConf(Configuration)}
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * "Mark" document to be a part of subcollection
+   *
+   * @param doc document to add the fields to
+   * @param url url of the document
+   */
+  private void addSubCollectionField(NutchDocument doc, String url) {
+    for (Subcollection coll : CollectionManager.getCollectionManager(getConf())
+        .getSubCollections(url)) {
+      // a subcollection without its own key goes into the default field
+      String field = coll.getKey() == null ? fieldName : coll.getKey();
+      doc.add(field, coll.getName());
+    }
+  }
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    addSubCollectionField(doc, url.toString());
+    return doc;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
new file mode 100644
index 0000000..1c6ba72
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to assign documents to subcollections.
+ * The field "subcollection" is added and filled with a collection name
+ * defined in a configuration file and selected by pattern, see
+ * {@link org.apache.nutch.collection}.
+ */
+package org.apache.nutch.indexer.subcollection;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java b/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
new file mode 100644
index 0000000..a2d2772
--- /dev/null
+++ b/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Collection;
+
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestSubcollection {
+
+  /**
+   * Test filtering logic
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testFilter() throws Exception {
+    Subcollection sc = new Subcollection(NutchConfiguration.create());
+    sc.setWhiteList("www.nutch.org\nwww.apache.org");
+    sc.setBlackList("jpg\nwww.apache.org/zecret/");
+
+    // matches whitelist
+    Assert.assertEquals("http://www.apache.org/index.html",
+        sc.filter("http://www.apache.org/index.html"));
+
+    // matches blacklist
+    Assert.assertNull(sc.filter("http://www.apache.org/zecret/index.html"));
+    Assert.assertNull(sc.filter("http://www.apache.org/img/image.jpg"));
+
+    // no match
+    Assert.assertNull(sc.filter("http://www.google.com/"));
+  }
+
+  /** Test reading a subcollection definition from XML */
+  @Test
+  public void testInput() throws Exception {
+    StringBuilder xml = new StringBuilder();
+    xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
+    xml.append("<!-- just a comment -->");
+    xml.append("<subcollections>");
+    xml.append("<subcollection>");
+    xml.append("<name>nutch collection</name>");
+    xml.append("<id>nutch</id>");
+    xml.append("<whitelist>");
+    xml.append("http://lucene.apache.org/nutch/\n");
+    xml.append("http://wiki.apache.org/nutch/\n");
+    xml.append("</whitelist>");
+    xml.append("<blacklist>");
+    xml.append("http://www.xxx.yyy\n");
+    xml.append("</blacklist>");
+    xml.append("</subcollection>");
+    xml.append("</subcollections>");
+
+    // encode as UTF-8, the encoding the XML declaration above announces
+    InputStream is = new ByteArrayInputStream(
+        xml.toString().getBytes("UTF-8"));
+
+    CollectionManager cm = new CollectionManager();
+    cm.parse(is);
+
+    Collection<?> c = cm.getAll();
+
+    // test that size matches
+    Assert.assertEquals(1, c.size());
+
+    Subcollection collection = (Subcollection) c.toArray()[0];
+
+    // test collection id
+    Assert.assertEquals("nutch", collection.getId());
+
+    // test collection name
+    Assert.assertEquals("nutch collection", collection.getName());
+
+    // test whitelist
+    Assert.assertEquals(2, collection.whiteList.size());
+
+    Assert.assertEquals("http://lucene.apache.org/nutch/",
+        collection.whiteList.get(0));
+    Assert.assertEquals("http://wiki.apache.org/nutch/",
+        collection.whiteList.get(1));
+
+    // matches whitelist
+    Assert.assertEquals("http://lucene.apache.org/nutch/",
+        collection.filter("http://lucene.apache.org/nutch/"));
+
+    // test blacklist
+    Assert.assertEquals(1, collection.blackList.size());
+
+    Assert.assertEquals("http://www.xxx.yyy", collection.blackList.get(0));
+
+    // no match
+    Assert.assertNull(collection.filter("http://www.google.com/"));
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/build.xml b/nutch-plugins/tld/build.xml
new file mode 100644
index 0000000..f46c8e6
--- /dev/null
+++ b/nutch-plugins/tld/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="tld" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/ivy.xml b/nutch-plugins/tld/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/tld/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/plugin.xml b/nutch-plugins/tld/plugin.xml
new file mode 100644
index 0000000..712a34a
--- /dev/null
+++ b/nutch-plugins/tld/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="tld"
+ name="Top Level Domain Plugin"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="tld.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.tld"
+ name="Top Level Domain Indexing Filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="TLDIndexingFilter"
+ class="org.apache.nutch.indexer.tld.TLDIndexingFilter"/>
+ </extension>
+
+ <extension id="org.apache.nutch.scoring.tld"
+ name="Top Level Domain Scoring Filter"
+ point="org.apache.nutch.scoring.ScoringFilter">
+
+ <implementation id="org.apache.nutch.scoring.tld.TLDScoringFilter"
+ class="org.apache.nutch.scoring.tld.TLDScoringFilter" />
+ </extension>
+
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/pom.xml b/nutch-plugins/tld/pom.xml
new file mode 100644
index 0000000..95039bd
--- /dev/null
+++ b/nutch-plugins/tld/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>tld</artifactId>
+ <packaging>jar</packaging>
+
+ <name>tld</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
new file mode 100644
index 0000000..cd7e194
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.tld;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * Adds the Top level domain extensions to the index
+ *
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDIndexingFilter implements IndexingFilter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TLDIndexingFilter.class);
+
+  private Configuration conf;
+
+  /** Adds the domain of the url's DomainSuffix to the document as "tld". */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    try {
+      DomainSuffix d = URLUtil.getDomainSuffix(new URL(urlText.toString()));
+      // NOTE(review): guards a possibly missing suffix; confirm URLUtil
+      if (d != null) {
+        doc.add("tld", d.getDomain());
+      }
+    } catch (Exception ex) {
+      // best effort: the document is returned anyway, just without a tld
+      LOG.warn("Could not add tld for {}: {}", urlText, ex.toString());
+    }
+    return doc;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
new file mode 100644
index 0000000..75841d9
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Indexing plugin.</p><p></p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
new file mode 100644
index 0000000..b7f4963
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.tld;
+
+import java.util.List;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+/**
+ * Scoring filter to boost tlds; only the indexer score is changed by it.
+ *
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDScoringFilter implements ScoringFilter {
+
+  private Configuration conf;
+  private DomainSuffixes tldEntries;
+
+  public TLDScoringFilter() {
+    tldEntries = DomainSuffixes.getInstance();
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /** Multiplies initScore with the boost of every known "tld" field value. */
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    float boost = 1.0f;
+    NutchField tldField = doc.getField("tld");
+    if (tldField == null) {
+      return initScore * boost;
+    }
+    for (Object value : tldField.getValues()) {
+      DomainSuffix suffix = tldEntries.get(value.toString());
+      if (suffix != null) {
+        boost *= suffix.getBoost();
+      }
+    }
+    return initScore * boost;
+  }
+
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return initSort;
+  }
+
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
+      ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount,
+      int validCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+  }
+
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+  }
+
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
new file mode 100644
index 0000000..d05e4b8
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Scoring plugin.</p><p></p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/build.xml b/nutch-plugins/urlfilter-automaton/build.xml
new file mode 100644
index 0000000..78557fc
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-automaton" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-regex-filter/*.jar" />
+ </fileset>
+ <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+ </path>
+
+ <!-- Compile test classes for dependencies -->
+ <target name="deps-test-compile">
+ <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+ </target>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+ </copy>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/ivy.xml b/nutch-plugins/urlfilter-automaton/ivy.xml
new file mode 100644
index 0000000..7c1968f
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <dependency org="dk.brics.automaton" name="automaton" rev="1.11-8" conf="*->default" />
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/plugin.xml b/nutch-plugins/urlfilter-automaton/plugin.xml
new file mode 100644
index 0000000..d0cc1ef
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-automaton"
+ name="Automaton URL Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-automaton.jar">
+ <export name="*"/>
+ </library>
+ <library name="automaton-1.11-8.jar"/>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-regex-filter"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.automaton"
+ name="Nutch Automaton URL Filter"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="AutomatonURLFilter"
+ class="org.apache.nutch.urlfilter.automaton.AutomatonURLFilter"/>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/pom.xml b/nutch-plugins/urlfilter-automaton/pom.xml
new file mode 100644
index 0000000..14a2d07
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/pom.xml
@@ -0,0 +1,50 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlfilter-automaton</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlfilter-automaton</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+ <dependencies>
+ <dependency>
+ <groupId>dk.brics.automaton</groupId>
+ <artifactId>automaton</artifactId>
+ <version>1.11-8</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>lib-regex-filter</artifactId>
+ <version>${project.parent.version}</version>
+ </dependency>
+ </dependencies>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules
new file mode 100644
index 0000000..a2f6da0
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'. The first matching pattern in the file
+# determines whether a URL is included or ignored. If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# skip .fr .org and .net domains
+-.*//.*\.fr/.*
+-.*//.*\.org/.*
+-.*//.*\.net/.*
+
+# accept everything else
++.*
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls
new file mode 100644
index 0000000..40bf4ee
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&twin=n&ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules
new file mode 100644
index 0000000..8966183
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules
@@ -0,0 +1,24 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'. The first matching pattern in the file
+# determines whether a URL is included or ignored. If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# accept hosts in MY.DOMAIN.NAME
++http://([a-z0-9]*\.)*MY.DOMAIN.NAME/.*
+
+# skip everything else
+-.*
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls
new file mode 100644
index 0000000..b1ad9b7
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls
@@ -0,0 +1,8 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
+-news://any.news.server/comp.lang.java
+-whois:/nutch.org
++http://MY.DOMAIN.NAME/
++http://MY.DOMAIN.NAME/nutch
++http://www.MY.DOMAIN.NAME/
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules
new file mode 100644
index 0000000..dfae8b0
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules
@@ -0,0 +1,19 @@
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'. The first matching pattern in the file
+# determines whether a URL is included or ignored. If no pattern
+# matches, the URL is ignored.
+
+# skip file: ftp: and mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# accept anything else
++.*
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls
new file mode 100644
index 0000000..d3b1bf3
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls
@@ -0,0 +1,11 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
++news://any.news.server/comp.lang.java
++whois:/nutch.org
+-http://www.nutch.org/nutch.gif
+-http://www.nutch.org/nutch.eps
+-http://www.nutch.org/nutch?q=nutch
++http://www.nutch.org/
++http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
++http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
new file mode 100644
index 0000000..ae4896d
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Automaton imports
+import dk.brics.automaton.RegExp;
+import dk.brics.automaton.RunAutomaton;
+import org.apache.nutch.net.*;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+
+/**
+ * RegexURLFilterBase implementation based on the <a
+ * href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+ * Automata for Java<sup>TM</sup>.
+ *
+ * @author Jérôme Charron
+ * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ */
+public class AutomatonURLFilter extends RegexURLFilterBase {
+  /** Configuration property naming the resource from which rules are read. */
+  public static final String URLFILTER_AUTOMATON_FILE = "urlfilter.automaton.file";
+  /** Configuration property holding the rules themselves as a string. */
+  public static final String URLFILTER_AUTOMATON_RULES = "urlfilter.automaton.rules";
+
+  /** Creates a filter through the no-argument base-class constructor. */
+  public AutomatonURLFilter() {
+    super();
+  }
+
+  /**
+   * Creates a filter, handing {@code filename} to the base-class constructor.
+   *
+   * @param filename
+   *          passed unchanged to the {@code RegexURLFilterBase} constructor
+   */
+  public AutomatonURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
+    super(filename);
+  }
+
+  /**
+   * Creates a filter, handing {@code reader} to the base-class constructor.
+   * Package-private.
+   *
+   * @param reader
+   *          passed unchanged to the {@code RegexURLFilterBase} constructor
+   */
+  AutomatonURLFilter(Reader reader) throws IOException,
+      IllegalArgumentException {
+    super(reader);
+  }
+
+  /*
+   * ----------------------------------- *
+   * <implementation:RegexURLFilterBase> *
+   * -----------------------------------
+   */
+
+  /**
+   * Returns a reader over the filter rules. Rules specified as a config
+   * property ({@link #URLFILTER_AUTOMATON_RULES}) will override rules
+   * specified as a config file ({@link #URLFILTER_AUTOMATON_FILE}).
+   *
+   * @param conf
+   *          configuration both properties are looked up in
+   * @return a {@link StringReader} over the inline rules when that property is
+   *         set, otherwise whatever
+   *         {@code Configuration.getConfResourceAsReader} returns for the
+   *         configured file name
+   */
+  protected Reader getRulesReader(Configuration conf) throws IOException {
+    String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    String fileRules = conf.get(URLFILTER_AUTOMATON_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+
+  /*
+   * ------------------------------------ *
+   * </implementation:RegexURLFilterBase> *
+   * ------------------------------------
+   */
+
+  /**
+   * Command-line entry point: forwards a new {@code AutomatonURLFilter} and the
+   * arguments to the inherited static {@code main} helper.
+   */
+  public static void main(String args[]) throws IOException {
+    main(new AutomatonURLFilter(), args);
+  }
+
+  /**
+   * Rule whose regular expression is compiled once, at construction time, into
+   * a dk.brics {@link RunAutomaton}.
+   *
+   * Declared {@code static}: it uses no state of the enclosing filter, so rule
+   * instances need not carry a hidden reference to it.
+   */
+  private static class Rule extends RegexRule {
+
+    /** Compiled form of the rule's regular expression; never reassigned. */
+    private final RunAutomaton automaton;
+
+    Rule(boolean sign, String regex) {
+      super(sign, regex);
+      automaton = compile(regex);
+    }
+
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      automaton = compile(regex);
+    }
+
+    /** Builds the run-time automaton for {@code regex} with all syntax flags. */
+    private static RunAutomaton compile(String regex) {
+      return new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+    }
+
+    /** Returns the result of running the compiled automaton on {@code url}. */
+    protected boolean match(String url) {
+      return automaton.run(url);
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
new file mode 100644
index 0000000..42533f7
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>
+URL filter plugin based on
+<a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+Automata for Java<sup>TM</sup>.
+</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
new file mode 100644
index 0000000..a70a6b6
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.*;
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based test of class <code>AutomatonURLFilter</code>.
+ *
+ * @author Jérôme Charron
+ */
+public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {
+
+ protected URLFilter getURLFilter(Reader rules) {
+ try {
+ return new AutomatonURLFilter(rules);
+ } catch (IOException e) {
+ Assert.fail(e.toString());
+ return null;
+ }
+ }
+
+ @Test
+ public void test() {
+ test("WholeWebCrawling");
+ test("IntranetCrawling");
+ bench(50, "Benchmarks");
+ bench(100, "Benchmarks");
+ bench(200, "Benchmarks");
+ bench(400, "Benchmarks");
+ bench(800, "Benchmarks");
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/build.xml b/nutch-plugins/urlfilter-domain/build.xml
new file mode 100644
index 0000000..4af55ac
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domain" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="data" />
+ </copy>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/data/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/data/hosts.txt b/nutch-plugins/urlfilter-domain/data/hosts.txt
new file mode 100644
index 0000000..2b88c3b
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/data/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/ivy.xml b/nutch-plugins/urlfilter-domain/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/plugin.xml b/nutch-plugins/urlfilter-domain/plugin.xml
new file mode 100644
index 0000000..1452d58
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-domain"
+ name="Domain URL Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-domain.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.domain"
+ name="Nutch Domain URL Filter"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="DomainURLFilter"
+ class="org.apache.nutch.urlfilter.domain.DomainURLFilter">
+ <parameter name="file" value="domain-urlfilter.txt"/>
+ </implementation>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/pom.xml b/nutch-plugins/urlfilter-domain/pom.xml
new file mode 100644
index 0000000..0c9dddd
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>urlfilter-domain</artifactId>
+ <packaging>jar</packaging>
+
+ <name>urlfilter-domain</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
new file mode 100644
index 0000000..821d944
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domain;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. Only a url that matches one of the suffixes, domains, or hosts
+ * present in the file is allowed. If no entries were loaded at all, every url
+ * is allowed.
+ * </p>
+ *
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * </p>
+ *
+ * <pre>
+ * com
+ * apache.org
+ * www.apache.org
+ * </pre>
+ *
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overriding the more specific. Blank lines and lines starting with
+ * <code>#</code> are ignored.
+ * </p>
+ *
+ * <p>
+ * The domain file defaults to domain-urlfilter.txt in the classpath but can be
+ * overridden using the:
+ * </p>
+ *
+ * <ul>
+ * <li>property "urlfilter.domain.file" in ./conf/nutch-*.xml, and</li>
+ * <li>attribute "file" in plugin.xml of this plugin</li>
+ * </ul>
+ *
+ * <p>
+ * The attribute "file" has higher precedence if defined. A file name passed to
+ * the constructor has higher precedence still, and rules given directly in the
+ * property "urlfilter.domain.rules" take precedence over any file.
+ * </p>
+ */
+public class DomainURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainURLFilter.class);
+
+  private Configuration conf;
+  private String domainFile = null;
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  /**
+   * Reads the entries, one per line, from the given reader into the domain
+   * set. Entries are trimmed and lower-cased; blank lines and comment lines
+   * (first non-blank character is <code>#</code>) are skipped. The reader is
+   * always closed before this method returns.
+   *
+   * @param configReader
+   *          source of the entries
+   * @throws IOException
+   *           if reading or closing the reader fails
+   */
+  private void readConfiguration(Reader configReader) throws IOException {
+    BufferedReader reader = new BufferedReader(configReader);
+    try {
+      String line = null;
+      while ((line = reader.readLine()) != null) {
+        if (StringUtils.isNotBlank(line)) {
+          // trim before testing for '#', so indented comments are not stored
+          // as (never matching) entries
+          String entry = line.trim();
+          if (!entry.startsWith("#")) {
+            domainSet.add(StringUtils.lowerCase(entry));
+          }
+        }
+      }
+    } finally {
+      // closing the buffered reader also closes the wrapped reader
+      reader.close();
+    }
+  }
+
+  /**
+   * Looks up the attribute "file" of the first URLFilter extension that
+   * belongs to the given plugin.
+   *
+   * @return the attribute value, or <code>null</code> if no such extension
+   *         exists or the attribute is undefined or blank
+   */
+  private static String getAttributeFile(Configuration conf, String pluginName) {
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    String attributeFile = null;
+    for (Extension extension : extensions) {
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+    return attributeFile;
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   *
+   * @param domainFile
+   *          The domain file, overrides domain-urlfilter.txt default.
+   */
+  public DomainURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration and loads the domain entries.
+   *
+   * The entries come from the property "urlfilter.domain.rules" if it is set;
+   * otherwise from a file whose name is, in order of precedence, the one given
+   * to the constructor, the attribute "file" in plugin.xml, or the property
+   * "urlfilter.domain.file". The file is first looked up as a configuration
+   * resource and, failing that, opened as a plain file. Problems are logged
+   * and leave the set of entries as it was.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // read in attribute "file" of this plugin; kept local so that a value
+    // found for one instance cannot leak into another one
+    String pluginName = "urlfilter-domain";
+    String attributeFile = getAttributeFile(conf, pluginName);
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    } else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+            + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domain.file");
+    String stringRules = conf.get("urlfilter.domain.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    if (stringRules == null && file == null) {
+      // nothing to read; do not pass a null name on to the readers below
+      LOG.error("Neither domain rules nor a domain file are configured for plugin "
+          + pluginName);
+      return;
+    }
+
+    try {
+      Reader reader;
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      } else {
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          // not available as a configuration resource, try the file system
+          reader = new FileReader(file);
+        }
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  /**
+   * @return the configuration last passed to {@link #setConf(Configuration)}
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Applies the filter to a url.
+   *
+   * @param url
+   *          the url to check
+   * @return the url itself if no entries are loaded or if its domain suffix,
+   *         domain name or host is among the entries; <code>null</code>
+   *         otherwise, including when the url cannot be processed
+   */
+  public String filter(String url) {
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    if (domainSet.size() == 0) return url;
+
+    try {
+      // match for suffix, domain, and host in that order. more general will
+      // override more specific
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+          || domainSet.contains(host)) {
+        return url;
+      }
+
+      // doesn't match, don't allow
+      return null;
+    } catch (Exception e) {
+
+      // if an error happens, log it and reject the url
+      LOG.error("Could not apply filter on url: " + url + "\n"
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
new file mode 100644
index 0000000..d2eba1f
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to include only URLs which match an element in a given list of
+ * domain suffixes, domain names, and/or host names.
+ * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart
+ * (exclude URLs by host or domain).
+ */
+package org.apache.nutch.urlfilter.domain;
+