You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:05 UTC
[21/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/ivy.xml b/src/plugin/index-anchor/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/index-anchor/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/plugin.xml b/src/plugin/index-anchor/plugin.xml
deleted file mode 100644
index 208594b..0000000
--- a/src/plugin/index-anchor/plugin.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin id="index-anchor" name="Anchor Indexing Filter" version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="index-anchor.jar">
- <export name="*" />
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints" />
- </requires>
-
- <extension id="org.apache.nutch.indexer.anchor"
- name="Nutch Anchor Indexing Filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="AnchorIndexingFilter"
- class="org.apache.nutch.indexer.anchor.AnchorIndexingFilter" />
- </extension>
-
-</plugin>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
deleted file mode 100644
index 6c9b834..0000000
--- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.anchor;
-
-import java.util.HashSet;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Indexing filter that offers an option to either index all inbound anchor text
- * for a document or deduplicate anchors. Deduplication does have its cons,
- *
- * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
- */
-public class AnchorIndexingFilter implements IndexingFilter {
-
- public static final Logger LOG = LoggerFactory
- .getLogger(AnchorIndexingFilter.class);
- private Configuration conf;
- private boolean deduplicate = false;
-
- /**
- * Set the {@link Configuration} object
- */
- public void setConf(Configuration conf) {
- this.conf = conf;
-
- deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
- LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
- }
-
- /**
- * Get the {@link Configuration} object
- */
- public Configuration getConf() {
- return this.conf;
- }
-
- /**
- * The {@link AnchorIndexingFilter} filter object which supports boolean
- * configuration settings for the deduplication of anchors. See
- * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
- *
- * @param doc
- * The {@link NutchDocument} object
- * @param parse
- * The relevant {@link Parse} object passing through the filter
- * @param url
- * URL to be filtered for anchor text
- * @param datum
- * The {@link CrawlDatum} entry
- * @param inlinks
- * The {@link Inlinks} containing anchor text
- * @return filtered NutchDocument
- */
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
- CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
- String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]);
-
- HashSet<String> set = null;
-
- for (int i = 0; i < anchors.length; i++) {
- if (deduplicate) {
- if (set == null)
- set = new HashSet<String>();
- String lcAnchor = anchors[i].toLowerCase();
-
- // Check if already processed the current anchor
- if (!set.contains(lcAnchor)) {
- doc.add("anchor", anchors[i]);
-
- // Add to map
- set.add(lcAnchor);
- }
- } else {
- doc.add("anchor", anchors[i]);
- }
- }
-
- return doc;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html
deleted file mode 100644
index c255029..0000000
--- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>An indexing plugin for inbound anchor text.</p><p></p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
deleted file mode 100644
index 08a42f3..0000000
--- a/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.anchor;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
- * deduplication functionality is working
- *
- * @author lewismc
- *
- */
-public class TestAnchorIndexingFilter {
-
- @Test
- public void testDeduplicateAnchor() throws Exception {
- Configuration conf = NutchConfiguration.create();
- conf.setBoolean("anchorIndexingFilter.deduplicate", true);
- AnchorIndexingFilter filter = new AnchorIndexingFilter();
- filter.setConf(conf);
- Assert.assertNotNull(filter);
- NutchDocument doc = new NutchDocument();
- ParseImpl parse = new ParseImpl("foo bar", new ParseData());
- Inlinks inlinks = new Inlinks();
- inlinks.add(new Inlink("http://test1.com/", "text1"));
- inlinks.add(new Inlink("http://test2.com/", "text2"));
- inlinks.add(new Inlink("http://test3.com/", "text2"));
- try {
- filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
- new CrawlDatum(), inlinks);
- } catch (Exception e) {
- e.printStackTrace();
- Assert.fail(e.getMessage());
- }
- Assert.assertNotNull(doc);
- Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
- .contains("anchor"));
- Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
- .getValues().size());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/build.xml b/src/plugin/index-basic/build.xml
deleted file mode 100755
index a834290..0000000
--- a/src/plugin/index-basic/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-basic" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/ivy.xml b/src/plugin/index-basic/ivy.xml
deleted file mode 100644
index 848216e..0000000
--- a/src/plugin/index-basic/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/plugin.xml b/src/plugin/index-basic/plugin.xml
deleted file mode 100755
index c5d784d..0000000
--- a/src/plugin/index-basic/plugin.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="index-basic"
- name="Basic Indexing Filter"
- version="1.0.0"
- provider-name="nutch.org">
-
-
- <runtime>
- <library name="index-basic.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.indexer.basic"
- name="Nutch Basic Indexing Filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="BasicIndexingFilter"
- class="org.apache.nutch.indexer.basic.BasicIndexingFilter"/>
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
deleted file mode 100644
index 8584fa8..0000000
--- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.basic;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.URLUtil;
-import org.apache.hadoop.io.Text;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Date;
-
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Adds basic searchable fields to a document. The fields added are: domain,
- * host, url, content, title, cache, tstamp. domain is included depending on
- * {@code indexer.add.domain} in nutch-default.xml. title is truncated as per
- * {@code indexer.max.title.length} in nutch-default.xml (as per NUTCH-1004, a
- * zero-length title is not added). content is truncated as per
- * {@code indexer.max.content.length} in nutch-default.xml.
- */
-public class BasicIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory
- .getLogger(BasicIndexingFilter.class);
-
- private int MAX_TITLE_LENGTH;
- private int MAX_CONTENT_LENGTH;
- private boolean addDomain = false;
- private Configuration conf;
-
- /**
- * The {@link BasicIndexingFilter} filter object which supports a few
- * configuration settings for adding basic searchable fields. See
- * {@code indexer.add.domain}, {@code indexer.max.title.length},
- * {@code indexer.max.content.length} in nutch-default.xml.
- *
- * @param doc
- * The {@link NutchDocument} object
- * @param parse
- * The relevant {@link Parse} object passing through the filter
- * @param url
- * URL to be filtered for anchor text
- * @param datum
- * The {@link CrawlDatum} entry
- * @param inlinks
- * The {@link Inlinks} containing anchor text
- * @return filtered NutchDocument
- */
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
- CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
- Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
- String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
- String urlString = url.toString();
-
- String host = null;
- try {
- URL u;
- if (reprUrlString != null) {
- u = new URL(reprUrlString);
- } else {
- u = new URL(urlString);
- }
-
- if (addDomain) {
- doc.add("domain", URLUtil.getDomainName(u));
- }
-
- host = u.getHost();
- } catch (MalformedURLException e) {
- throw new IndexingException(e);
- }
-
- if (host != null) {
- doc.add("host", host);
- }
-
- doc.add("url", reprUrlString == null ? urlString : reprUrlString);
-
- // content
- String content = parse.getText();
- if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
- content = content.substring(0, MAX_CONTENT_LENGTH);
- }
- doc.add("content", StringUtil.cleanField(content));
-
- // title
- String title = parse.getData().getTitle();
- if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate
- // title
- // if
- // needed
- title = title.substring(0, MAX_TITLE_LENGTH);
- }
-
- if (title.length() > 0) {
- // NUTCH-1004 Do not index empty values for title field
- doc.add("title", StringUtil.cleanField(title));
- }
-
- // add cached content/summary display policy, if available
- String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
- if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
- doc.add("cache", caching);
- }
-
- // add timestamp when fetched, for deduplication
- doc.add("tstamp", new Date(datum.getFetchTime()));
-
- return doc;
- }
-
- /**
- * Set the {@link Configuration} object
- */
- public void setConf(Configuration conf) {
- this.conf = conf;
- this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
- this.addDomain = conf.getBoolean("indexer.add.domain", false);
- this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
- }
-
- /**
- * Get the {@link Configuration} object
- */
- public Configuration getConf() {
- return this.conf;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html
deleted file mode 100644
index 3fae405..0000000
--- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>A basic indexing plugin, adds basic fields: url, host, title, content, etc.</p><p></p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
deleted file mode 100644
index 4bc317e..0000000
--- a/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.basic;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.basic.BasicIndexingFilter;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.util.Date;
-
-/**
- * JUnit test case which tests 1. that basic searchable fields are added to a
- * document 2. that domain is added as per {@code indexer.add.domain} in
- * nutch-default.xml. 3. that title is truncated as per
- * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
- * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
- *
- * @author tejasp
- *
- */
-
-public class TestBasicIndexingFilter {
-
- @Test
- public void testBasicIndexingFilter() throws Exception {
- Configuration conf = NutchConfiguration.create();
- conf.setInt("indexer.max.title.length", 10);
- conf.setBoolean("indexer.add.domain", true);
- conf.setInt("indexer.max.content.length", 20);
-
- BasicIndexingFilter filter = new BasicIndexingFilter();
- filter.setConf(conf);
- Assert.assertNotNull(filter);
-
- NutchDocument doc = new NutchDocument();
-
- String title = "The Foo Page";
- Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
- Metadata metaData = new Metadata();
- metaData.add("Language", "en/us");
- ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
- outlinks, metaData);
- ParseImpl parse = new ParseImpl(
- "this is a sample foo bar page. hope you enjoy it.", parseData);
-
- CrawlDatum crawlDatum = new CrawlDatum();
- crawlDatum.setFetchTime(100L);
-
- Inlinks inlinks = new Inlinks();
-
- try {
- filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
- crawlDatum, inlinks);
- } catch (Exception e) {
- e.printStackTrace();
- Assert.fail(e.getMessage());
- }
- Assert.assertNotNull(doc);
- Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
- .getField("title").getValues().get(0));
- Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
- .getField("domain").getValues().get(0));
- Assert.assertEquals("test host, expect \"nutch.apache.org\"",
- "nutch.apache.org", doc.getField("host").getValues().get(0));
- Assert.assertEquals(
- "test url, expect \"http://nutch.apache.org/index.html\"",
- "http://nutch.apache.org/index.html", doc.getField("url").getValues()
- .get(0));
- Assert.assertEquals("test content", "this is a sample foo",
- doc.getField("content").getValues().get(0));
- Assert.assertEquals("test fetch time", new Date(100L),
- (Date) doc.getField("tstamp").getValues().get(0));
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/build-ivy.xml b/src/plugin/index-geoip/build-ivy.xml
deleted file mode 100644
index 2cda7e9..0000000
--- a/src/plugin/index-geoip/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-geoip" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
- <property name="ivy.install.version" value="2.1.0" />
- <condition property="ivy.home" value="${env.IVY_HOME}">
- <isset property="env.IVY_HOME" />
- </condition>
- <property name="ivy.home" value="${user.home}/.ant" />
- <property name="ivy.checksums" value="" />
- <property name="ivy.jar.dir" value="${ivy.home}/lib" />
- <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
- <target name="download-ivy" unless="offline">
-
- <mkdir dir="${ivy.jar.dir}"/>
- <!-- download Ivy from web site so that it can be used even without any special installation -->
- <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
- dest="${ivy.jar.file}" usetimestamp="true"/>
- </target>
-
- <target name="init-ivy" depends="download-ivy">
- <!-- try to load ivy here from ivy home, in case the user has not already dropped
- it into ant's lib dir (note that the latter copy will always take precedence).
- We will not fail as long as local lib dir exists (it may be empty) and
- ivy is in at least one of ant's lib dir or the local lib dir. -->
- <path id="ivy.lib.path">
- <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
- </path>
- <taskdef resource="org/apache/ivy/ant/antlib.xml"
- uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
- </target>
-
- <target name="deps-jar" depends="init-ivy">
- <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/build.xml b/src/plugin/index-geoip/build.xml
deleted file mode 100644
index 92fda82..0000000
--- a/src/plugin/index-geoip/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-geoip" default="jar-core">
-
- <import file="../build-plugin.xml"/>
- <target name="init-plugin">
- <echo>Copying MaxMind GeoIP .mmdb files to build</echo>
- <copy todir="${build.classes}">
- <fileset dir="${src.dir}" includes="**/*.mmdb" />
- </copy>
- </target>
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml
deleted file mode 100644
index 1b626f0..0000000
--- a/src/plugin/index-geoip/ivy.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" >
- <!-- Exlude due to classpath issues -->
- <exclude org="org.apache.httpcomponents" name="httpclient" />
- <exclude org="org.apache.httpcomponents" name="httpcore" />
- </dependency>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml
deleted file mode 100644
index 214fbd0..0000000
--- a/src/plugin/index-geoip/plugin.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="index-geoip"
- name="GeoIP2 Indexing Filter"
- version="1.0.0"
- provider-name="nutch.org">
-
-
- <runtime>
- <library name="index-geoip.jar">
- <export name="*"/>
- </library>
- <library name="commons-codec-1.6.jar"/>
- <library name="commons-logging-1.1.1.jar"/>
- <library name="geoip2-2.3.1.jar"/>
- <library name="google-http-client-1.20.0.jar"/>
- <library name="jackson-annotations-2.5.0.jar"/>
- <library name="jackson-core-2.5.3.jar"/>
- <library name="jackson-databind-2.5.3.jar"/>
- <library name="jsr305-1.3.9.jar"/>
- <library name="maxmind-db-1.0.0.jar"/>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.indexer.geoip"
- name="Nutch GeoIP2 Indexing Filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="GeoIPIndexingFilter"
- class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/>
- </extension>
-
-</plugin>
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
deleted file mode 100644
index 88d78ef..0000000
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ /dev/null
@@ -1,210 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.geoip;
-
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-
-import org.apache.nutch.indexer.NutchDocument;
-
-import com.maxmind.geoip2.DatabaseReader;
-import com.maxmind.geoip2.WebServiceClient;
-import com.maxmind.geoip2.exception.GeoIp2Exception;
-import com.maxmind.geoip2.model.InsightsResponse;
-import com.maxmind.geoip2.model.CityResponse;
-import com.maxmind.geoip2.model.ConnectionTypeResponse;
-import com.maxmind.geoip2.model.CountryResponse;
-import com.maxmind.geoip2.model.DomainResponse;
-import com.maxmind.geoip2.model.IspResponse;
-import com.maxmind.geoip2.record.City;
-import com.maxmind.geoip2.record.Continent;
-import com.maxmind.geoip2.record.Country;
-import com.maxmind.geoip2.record.Location;
-import com.maxmind.geoip2.record.Postal;
-import com.maxmind.geoip2.record.RepresentedCountry;
-import com.maxmind.geoip2.record.Subdivision;
-import com.maxmind.geoip2.record.Traits;
-
-/**
- * <p>
- * Simple utility class which enables efficient, structured
- * {@link org.apache.nutch.indexer.NutchDocument} building based on input from
- * {@link GeoIPIndexingFilter}, where configuration is also read.
- * </p>
- * <p>
- * Based on the nature of the input, this class wraps factory type
- * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}
- * 's with the correct {@link org.apache.nutch.indexer.NutchField} information.
- *
- */
-public class GeoIPDocumentCreator {
-
- /**
- * Default constructor.
- */
- public GeoIPDocumentCreator() {
- }
-
- public static NutchDocument createDocFromInsightsService(String serverIp,
- NutchDocument doc, WebServiceClient client) throws UnknownHostException,
- IOException, GeoIp2Exception {
- doc.add("ip", serverIp);
- InsightsResponse response = client
- .insights(InetAddress.getByName(serverIp));
- // CityResponse response = client.city(InetAddress.getByName(serverIp));
-
- City city = response.getCity();
- doc.add("cityName", city.getName()); // 'Minneapolis'
- doc.add("cityConfidence", city.getConfidence()); // 50
- doc.add("cityGeoNameId", city.getGeoNameId());
-
- Continent continent = response.getContinent();
- doc.add("continentCode", continent.getCode());
- doc.add("continentGeoNameId", continent.getGeoNameId());
- doc.add("continentName", continent.getName());
-
- Country country = response.getCountry();
- doc.add("countryIsoCode", country.getIsoCode()); // 'US'
- doc.add("countryName", country.getName()); // 'United States'
- doc.add("countryConfidence", country.getConfidence()); // 99
- doc.add("countryGeoName", country.getGeoNameId());
-
- Location location = response.getLocation();
- doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
- // -93.2323
- doc.add("accRadius", location.getAccuracyRadius()); // 3
- doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
- doc.add("metroCode", location.getMetroCode());
-
- Postal postal = response.getPostal();
- doc.add("postalCode", postal.getCode()); // '55455'
- doc.add("postalConfidence", postal.getConfidence()); // 40
-
- RepresentedCountry rCountry = response.getRepresentedCountry();
- doc.add("countryType", rCountry.getType());
-
- Subdivision subdivision = response.getMostSpecificSubdivision();
- doc.add("subDivName", subdivision.getName()); // 'Minnesota'
- doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
- doc.add("subDivConfidence", subdivision.getConfidence()); // 90
- doc.add("subDivGeoNameId", subdivision.getGeoNameId());
-
- Traits traits = response.getTraits();
- doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
- doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
- doc.add("domain", traits.getDomain());
- doc.add("isp", traits.getIsp());
- doc.add("org", traits.getOrganization());
- doc.add("userType", traits.getUserType());
- doc.add("isAnonProxy", traits.isAnonymousProxy());
- doc.add("isSatelliteProv", traits.isSatelliteProvider());
- return doc;
- }
-
- @SuppressWarnings("unused")
- public static NutchDocument createDocFromCityService(String serverIp,
- NutchDocument doc, WebServiceClient client) throws UnknownHostException,
- IOException, GeoIp2Exception {
- CityResponse response = client.city(InetAddress.getByName(serverIp));
- return doc;
- }
-
- @SuppressWarnings("unused")
- public static NutchDocument createDocFromCountryService(String serverIp,
- NutchDocument doc, WebServiceClient client) throws UnknownHostException,
- IOException, GeoIp2Exception {
- CountryResponse response = client.country(InetAddress.getByName(serverIp));
- return doc;
- }
-
- public static NutchDocument createDocFromIspDb(String serverIp,
- NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
- IOException, GeoIp2Exception {
- IspResponse response = reader.isp(InetAddress.getByName(serverIp));
- doc.add("ip", serverIp);
- doc.add("autonSystemNum", response.getAutonomousSystemNumber());
- doc.add("autonSystemOrg", response.getAutonomousSystemOrganization());
- doc.add("isp", response.getIsp());
- doc.add("org", response.getOrganization());
- return doc;
- }
-
- public static NutchDocument createDocFromDomainDb(String serverIp,
- NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
- IOException, GeoIp2Exception {
- DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
- doc.add("ip", serverIp);
- doc.add("domain", response.getDomain());
- return doc;
- }
-
- public static NutchDocument createDocFromConnectionDb(String serverIp,
- NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
- IOException, GeoIp2Exception {
- ConnectionTypeResponse response = reader.connectionType(InetAddress
- .getByName(serverIp));
- doc.add("ip", serverIp);
- doc.add("connType", response.getConnectionType().toString());
- return doc;
- }
-
- public static NutchDocument createDocFromCityDb(String serverIp,
- NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
- IOException, GeoIp2Exception {
- doc.add("ip", serverIp);
- CityResponse response = reader.city(InetAddress.getByName(serverIp));
-
- City city = response.getCity();
- doc.add("cityName", city.getName()); // 'Minneapolis'
- doc.add("cityConfidence", city.getConfidence()); // 50
- doc.add("cityGeoNameId", city.getGeoNameId());
-
- Continent continent = response.getContinent();
- doc.add("continentCode", continent.getCode());
- doc.add("continentGeoNameId", continent.getGeoNameId());
- doc.add("continentName", continent.getName());
-
- Country country = response.getCountry();
- doc.add("countryIsoCode", country.getIsoCode()); // 'US'
- doc.add("countryName", country.getName()); // 'United States'
- doc.add("countryConfidence", country.getConfidence()); // 99
- doc.add("countryGeoName", country.getGeoNameId());
-
- Location location = response.getLocation();
- doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
- // -93.2323
- doc.add("accRadius", location.getAccuracyRadius()); // 3
- doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
- doc.add("metroCode", location.getMetroCode());
-
- Postal postal = response.getPostal();
- doc.add("postalCode", postal.getCode()); // '55455'
- doc.add("postalConfidence", postal.getConfidence()); // 40
-
- RepresentedCountry rCountry = response.getRepresentedCountry();
- doc.add("countryType", rCountry.getType());
-
- Subdivision subdivision = response.getMostSpecificSubdivision();
- doc.add("subDivName", subdivision.getName()); // 'Minnesota'
- doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
- doc.add("subDivConfidence", subdivision.getConfidence()); // 90
- doc.add("subDivGeoNameId", subdivision.getGeoNameId());
- return doc;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
deleted file mode 100644
index f515f1f..0000000
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
+++ /dev/null
@@ -1,241 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.geoip;
-
-import java.io.File;
-import java.io.IOException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.maxmind.geoip2.DatabaseReader;
-import com.maxmind.geoip2.WebServiceClient;
-
-/**
- * <p>
- * This plugin implements an indexing filter which takes advantage of the <a
- * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.
- * </p>
- * <p>
- * The third party library distribution provides an API for the GeoIP2 <a
- * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web
- * services</a> and <a
- * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The
- * API also works with the free <a
- * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
- * </p>
- * <p>
- * Depending on the service level agreement, you have with the GeoIP service
- * provider, the plugin can add a number of the following fields to the index
- * data model:
- * <ol>
- * <li>Continent</li>
- * <li>Country</li>
- * <li>Regional Subdivision</li>
- * <li>City</li>
- * <li>Postal Code</li>
- * <li>Latitude/Longitude</li>
- * <li>ISP/Organization</li>
- * <li>AS Number</li>
- * <li>Confidence Factors</li>
- * <li>Radius</li>
- * <li>User Type</li>
- * </ol>
- * </p>
- *
- * <p>
- * Some of the services are documented at the <a
- * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision
- * Services</a> webpage where more information can be obtained.
- * </p>
- *
- * <p>
- * You should also consult the following three properties in
- * <code>nutch-site.xml</code>
- * </p>
- *
- * <pre>
- * {@code
- * <!-- index-geoip plugin properties -->
- * <property>
- * <name>index.geoip.usage</name>
- * <value>insightsService</value>
- * <description>
- * A string representing the information source to be used for GeoIP information
- * association. Either enter 'cityDatabase', 'connectionTypeDatabase',
- * 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
- * Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
- * GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath
- * and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
- * </description>
- * </property>
- *
- * <property>
- * <name>index.geoip.userid</name>
- * <value></value>
- * <description>
- * The userId associated with the GeoIP2 Precision Services account.
- * </description>
- * </property>
- *
- * <property>
- * <name>index.geoip.licensekey</name>
- * <value></value>
- * <description>
- * The license key associated with the GeoIP2 Precision Services account.
- * </description>
- * </property>
- * }
- * </pre>
- *
- */
-public class GeoIPIndexingFilter implements IndexingFilter {
-
- private static final Logger LOG = LoggerFactory
- .getLogger(GeoIPIndexingFilter.class);
-
- private Configuration conf;
-
- private String usage = null;
-
- private File geoDb = null;
-
- WebServiceClient client = null;
-
- DatabaseReader reader = null;
-
- // private AbstractResponse response = null;
-
- /**
- * Default constructor for this plugin
- */
- public GeoIPIndexingFilter() {
- }
-
- /**
- * @see org.apache.hadoop.conf.Configurable#getConf()
- */
- @Override
- public Configuration getConf() {
- return this.conf;
- }
-
- /**
- * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
- */
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- String use = conf.get("index.geoip.usage", "insightsService");
- LOG.debug("GeoIP usage medium set to: {}", use);
- if (use.equalsIgnoreCase("cityDatabase")) {
- try {
- geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile());
- buildDb();
- } catch (Exception e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- }
- } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
- try {
- geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb")
- .getFile());
- buildDb();
- } catch (Exception e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- }
- } else if (use.equalsIgnoreCase("domainDatabase")) {
- try {
- geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile());
- buildDb();
- } catch (Exception e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- }
- } else if (use.equalsIgnoreCase("ispDatabase")) {
- try {
- geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile());
- buildDb();
- } catch (Exception e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- }
- } else if (use.equalsIgnoreCase("insightsService")) {
- client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid",
- 12345), conf.get("index.geoip.licensekey")).build();
- }
- usage = use;
- }
-
- private void buildDb() {
- try {
- reader = new DatabaseReader.Builder(geoDb).build();
- } catch (IOException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
- }
- }
-
- /**
- *
- * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument,
- * org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text,
- * org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
- */
- @Override
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
- CrawlDatum datum, Inlinks inlinks) throws IndexingException {
- return addServerGeo(doc, parse.getData(), url.toString());
- }
-
- private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
- String url) {
-
- if (conf.getBoolean("store.ip.address", false) == true) {
- try {
- String serverIp = data.getContentMeta().get("_ip_");
- if (serverIp != null) {
- if (usage.equalsIgnoreCase("cityDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
- reader);
- } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
- reader);
- } else if (usage.equalsIgnoreCase("domainDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
- reader);
- } else if (usage.equalsIgnoreCase("ispDatabase")) {
- doc = GeoIPDocumentCreator
- .createDocFromIspDb(serverIp, doc, reader);
- } else if (usage.equalsIgnoreCase("insightsService")) {
- doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
- doc, client);
- }
- }
- } catch (Exception e) {
- LOG.error(e.getMessage());
- e.printStackTrace();
- }
- }
- return doc;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
deleted file mode 100644
index ba62519..0000000
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * <p>This plugin implements an indexing filter which takes
- * advantage of the
- * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
- * <p>The third party library distribution provides an API for the GeoIP2
- * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a>
- * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>.
- * The API also works with the free
- * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
- *
- */
-package org.apache.nutch.indexer.geoip;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/build.xml b/src/plugin/index-links/build.xml
deleted file mode 100644
index b853ccf..0000000
--- a/src/plugin/index-links/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-links" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/ivy.xml b/src/plugin/index-links/ivy.xml
deleted file mode 100644
index 0a363f7..0000000
--- a/src/plugin/index-links/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/plugin.xml b/src/plugin/index-links/plugin.xml
deleted file mode 100644
index dfdc5d2..0000000
--- a/src/plugin/index-links/plugin.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="index-links"
- name="Index inlinks and outlinks"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="index-links.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter"
- name="Links indexing filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter"
- class="org.apache.nutch.indexer.links.LinksIndexingFilter"/>
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
deleted file mode 100644
index 975df66..0000000
--- a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.links;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.slf4j.LoggerFactory;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-/**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
- * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
- *
- * In case that you want to ignore the outlinks that point to the same host
- * as the URL being indexed use the following settings in your configuration
- * file:
- *
- * <property>
- * <name>index.links.outlinks.host.ignore</name>
- * <value>true</value>
- * </property>
- *
- * The same configuration is available for inlinks:
- *
- * <property>
- * <name>index.links.inlinks.host.ignore</name>
- * <value>true</value>
- * </property>
- *
- * To store only the host portion of each inlink URL or outlink URL add the
- * following to your configuration file.
- *
- * <property>
- * <name>index.links.hosts.only</name>
- * <value>false</value>
- * </property>
- *
- */
-public class LinksIndexingFilter implements IndexingFilter {
-
- public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore";
- public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore";
- public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only";
-
- public final static org.slf4j.Logger LOG = LoggerFactory
- .getLogger(LinksIndexingFilter.class);
-
- private Configuration conf;
- private boolean filterOutlinks;
- private boolean filterInlinks;
- private boolean indexHost;
-
- @Override
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
- CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
- // Add the outlinks
- Outlink[] outlinks = parse.getData().getOutlinks();
-
- if (outlinks != null) {
- Set<String> hosts = new HashSet<String>();
-
- for (Outlink outlink : outlinks) {
- try {
- String linkUrl = outlink.getToUrl();
- String outHost = new URL(linkUrl).getHost().toLowerCase();
-
- if (indexHost) {
- linkUrl = outHost;
-
- if (hosts.contains(linkUrl))
- continue;
-
- hosts.add(linkUrl);
- }
-
- addFilteredLink("outlinks", url.toString(), linkUrl, outHost,
- filterOutlinks, doc);
- } catch (MalformedURLException e) {
- LOG.error("Malformed URL in {}: {}", url, e.getMessage());
- }
- }
- }
-
- // Add the inlinks
- if (null != inlinks) {
- Iterator<Inlink> iterator = inlinks.iterator();
- Set<String> inlinkHosts = new HashSet<String>();
-
- while (iterator.hasNext()) {
- try {
- Inlink link = iterator.next();
- String linkUrl = link.getFromUrl();
- String inHost = new URL(linkUrl).getHost().toLowerCase();
-
- if (indexHost) {
- linkUrl = inHost;
-
- if (inlinkHosts.contains(linkUrl))
- continue;
-
- inlinkHosts.add(linkUrl);
- }
-
- addFilteredLink("inlinks", url.toString(), linkUrl, inHost,
- filterInlinks, doc);
- } catch (MalformedURLException e) {
- LOG.error("Malformed URL in {}: {}", url, e.getMessage());
- }
- }
- }
-
- return doc;
- }
-
- private void addFilteredLink(String fieldName, String url, String linkUrl,
- String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException {
- if (filter) {
- String host = new URL(url.toString()).getHost().toLowerCase();
-
- if (!host.equalsIgnoreCase(urlHost)) {
- doc.add(fieldName, linkUrl);
- }
- } else {
- doc.add(fieldName, linkUrl);
- }
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
- filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false);
-
- indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
deleted file mode 100644
index c490d1f..0000000
--- a/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.links;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.NutchField;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.net.URL;
-import java.util.Iterator;
-
-/**
- * Tests for {@link LinksIndexingFilter}: filtering of inlinks and outlinks
- * that share the host of the indexed page, and indexing of the host part only.
- */
-public class TestLinksIndexingFilter {
-
-  /** URL of the page that every test indexes. */
-  private static final String PAGE_URL = "http://www.example.com/";
-
-  Configuration conf = NutchConfiguration.create();
-  LinksIndexingFilter filter = new LinksIndexingFilter();
-  Metadata metadata = new Metadata();
-
-  @Before
-  public void setUp() throws Exception {
-    metadata.add(Response.CONTENT_TYPE, "text/html");
-  }
-
-  /** Outlinks to one foreign host and to the page's own host, without paths. */
-  private Outlink[] generateOutlinks() throws Exception {
-    return generateOutlinks(false);
-  }
-
-  /**
-   * Builds the two outlinks used by the tests.
-   *
-   * @param parts when {@code true}, a path and query string are appended to
-   *          each target URL and both anchors are "test"
-   */
-  private Outlink[] generateOutlinks(boolean parts) throws Exception {
-    Outlink toTest = new Outlink("http://www.test.com", "test");
-    Outlink toExample = new Outlink("http://www.example.com", "example");
-
-    if (!parts) {
-      return new Outlink[] { toTest, toExample };
-    }
-
-    return new Outlink[] {
-        new Outlink(toTest.getToUrl() + "/index.php?param=1", "test"),
-        new Outlink(toExample.getToUrl() + "/index.php?param=2", "test") };
-  }
-
-  /** Inlinks from one foreign host and from the page's own host. */
-  private Inlinks generateInlinks() {
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com", "test"));
-    inlinks.add(new Inlink("http://www.example.com", "example"));
-    return inlinks;
-  }
-
-  /** Runs the filter under test for {@link #PAGE_URL} with the given links. */
-  private NutchDocument runFilter(Outlink[] outlinks, Inlinks inlinks)
-      throws Exception {
-    ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks,
-        metadata);
-
-    return filter.filter(new NutchDocument(), new ParseImpl("text", parseData),
-        new Text(PAGE_URL), new CrawlDatum(), inlinks);
-  }
-
-  @Test
-  public void testFilterOutlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-    filter.setConf(conf);
-
-    Outlink[] outlinks = generateOutlinks();
-    NutchDocument doc = runFilter(outlinks, new Inlinks());
-
-    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
-
-    Assert.assertEquals("Filter outlinks, allow only those from a different host",
-        outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
-  }
-
-  @Test
-  public void testFilterInlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-    filter.setConf(conf);
-
-    NutchDocument doc = runFilter(new Outlink[0], generateInlinks());
-
-    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals("Filter inlinks, allow only those from a different host",
-        "http://www.test.com", doc.getFieldValue("inlinks"));
-  }
-
-  @Test
-  public void testNoFilterOutlinks() throws Exception {
-    filter.setConf(conf);
-
-    Outlink[] outlinks = generateOutlinks();
-    NutchDocument doc = runFilter(outlinks, new Inlinks());
-
-    Assert.assertEquals("All outlinks must be indexed even those from the same host",
-        outlinks.length, doc.getField("outlinks").getValues().size());
-  }
-
-  @Test
-  public void testNoFilterInlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
-    filter.setConf(conf);
-
-    Inlinks inlinks = generateInlinks();
-    NutchDocument doc = runFilter(new Outlink[0], inlinks);
-
-    Assert.assertEquals("All inlinks must be indexed even those from the same host",
-        inlinks.size(), doc.getField("inlinks").getValues().size());
-  }
-
-  @Test
-  public void testIndexOnlyHostPart() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    filter.setConf(conf);
-
-    // Two inlinks from the same foreign host plus one from the page's own host.
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
-    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
-    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
-        "example"));
-
-    NutchDocument doc = runFilter(generateOutlinks(true), inlinks);
-    String foreignHost = new URL("http://www.test.com").getHost();
-
-    NutchField docOutlinks = doc.getField("outlinks");
-
-    Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
-        foreignHost, docOutlinks.getValues().get(0));
-
-    Assert.assertEquals(
-        "The inlinks coming from the same host must count only once", 1,
-        doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
-        foreignHost, doc.getFieldValue("inlinks"));
-  }
-
-  @Test
-  public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-    filter.setConf(conf);
-
-    NutchDocument doc = runFilter(generateOutlinks(true), new Inlinks());
-
-    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
-
-    Assert.assertEquals(
-        "Index only the host portion of the outlinks after filtering",
-        new URL("http://www.test.com").getHost(),
-        doc.getFieldValue("outlinks"));
-  }
-
-  @Test
-  public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-    filter.setConf(conf);
-
-    NutchDocument doc = runFilter(new Outlink[0], generateInlinks());
-
-    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals(
-        "Index only the host portion of the inlinks after filtering",
-        new URL("http://www.test.com").getHost(),
-        doc.getFieldValue("inlinks"));
-  }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java b/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
deleted file mode 100644
index aaaedbf..0000000
--- a/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse;
-
-import org.junit.Test;
-
-import java.util.HashSet;
-import java.util.Set;
-
-import static org.junit.Assert.*;
-
-/**
- * Checks that {@link Outlink} behaves as a value object inside a
- * {@link HashSet}: equal outlinks must collapse into a single element.
- */
-public class TestOutlinks {
-
-  /** Adding one instance twice must leave a single element in the set. */
-  @Test
-  public void testAddSameObject() throws Exception {
-    Outlink outlink = new Outlink("http://www.example.com", "Example");
-
-    Set<Outlink> outlinks = new HashSet<>();
-    outlinks.add(outlink);
-    outlinks.add(outlink);
-
-    assertEquals("Adding the same Outlink twice", 1, outlinks.size());
-  }
-
-  /** Two distinct instances with identical data must be equal and deduplicated. */
-  @Test
-  public void testAddOtherObjectWithSameData() throws Exception {
-    Outlink first = new Outlink("http://www.example.com", "Example");
-    Outlink second = new Outlink("http://www.example.com", "Example");
-
-    assertTrue("The two Outlink objects are the same", first.equals(second));
-
-    Set<Outlink> outlinks = new HashSet<>();
-    outlinks.add(first);
-    outlinks.add(second);
-
-    assertEquals("The set should contain only 1 Outlink", 1, outlinks.size());
-  }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/build.xml b/src/plugin/index-metadata/build.xml
deleted file mode 100644
index ad96d11..0000000
--- a/src/plugin/index-metadata/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-metadata" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/ivy.xml b/src/plugin/index-metadata/ivy.xml
deleted file mode 100644
index 24d7606..0000000
--- a/src/plugin/index-metadata/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../../ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/plugin.xml b/src/plugin/index-metadata/plugin.xml
deleted file mode 100644
index 4d4c9a7..0000000
--- a/src/plugin/index-metadata/plugin.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="index-metadata"
- name="Index Metadata"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="index-metadata.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
-
- <extension id="org.apache.nutch.indexer.metadata"
- name="Nutch metadata indexer"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="MetadataIndexer"
- class="org.apache.nutch.indexer.metadata.MetadataIndexer"/>
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
deleted file mode 100644
index 78718aa..0000000
--- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.metadata;
-
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-
-/**
- * Indexing filter that copies selected metadata values into the document.
- * The keys to copy are read from three configuration properties whose values
- * are comma-delimited lists, e.g. {@code key1,key2,key3}:
- * <ul>
- * <li>{@code index.db.md} - keys looked up in the metadata of the
- * {@link CrawlDatum}</li>
- * <li>{@code index.parse.md} - keys looked up in the parse metadata</li>
- * <li>{@code index.content.md} - keys looked up in the content metadata</li>
- * </ul>
- * A property that is not set simply contributes no fields.
- */
-public class MetadataIndexer implements IndexingFilter {
-  private Configuration conf;
-  private String[] dbFieldnames;
-  /** Maps the lower-cased parse metadata key to the field name as configured. */
-  private Map<String, String> parseFieldnames;
-  private String[] contentFieldnames;
-  private static final String db_CONF_PROPERTY = "index.db.md";
-  private static final String parse_CONF_PROPERTY = "index.parse.md";
-  private static final String content_CONF_PROPERTY = "index.content.md";
-
-  /**
-   * Adds the configured crawldb, parse and content metadata values to
-   * {@code doc}.
-   *
-   * @param doc document to enrich; when {@code null} nothing is done
-   * @param parse parse result supplying the parse and content metadata
-   * @param url URL of the document (not used by this filter)
-   * @param datum crawl datum supplying the crawldb metadata
-   * @param inlinks inlinks of the document (not used by this filter)
-   * @return the same {@code doc} instance, or {@code null} if {@code doc} was
-   *         {@code null}
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    // just in case
-    if (doc == null) {
-      return null;
-    }
-
-    // add the fields from crawldb
-    if (dbFieldnames != null) {
-      for (String metatag : dbFieldnames) {
-        // Index the string form of whatever value is stored: casting to Text
-        // would fail with a ClassCastException for any other value type.
-        Object metadata = datum.getMetaData().get(new Text(metatag));
-        if (metadata != null) {
-          doc.add(metatag, metadata.toString());
-        }
-      }
-    }
-
-    // add the fields from parsemd: values are looked up under the lower-cased
-    // key and indexed under the field name as it was configured
-    if (parseFieldnames != null) {
-      for (Map.Entry<String, String> field : parseFieldnames.entrySet()) {
-        for (String value : parse.getData().getParseMeta()
-            .getValues(field.getKey())) {
-          if (value != null) {
-            doc.add(field.getValue(), value);
-          }
-        }
-      }
-    }
-
-    // add the fields from contentmd
-    if (contentFieldnames != null) {
-      for (String metatag : contentFieldnames) {
-        for (String value : parse.getData().getContentMeta().getValues(metatag)) {
-          if (value != null) {
-            doc.add(metatag, value);
-          }
-        }
-      }
-    }
-
-    return doc;
-  }
-
-  /**
-   * Stores the configuration and reads the three lists of metadata keys from
-   * it. A property that is not set leaves the corresponding source empty.
-   *
-   * @param conf configuration to read the key lists from
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
-    contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
-
-    parseFieldnames = new HashMap<String, String>();
-    // getStrings() yields null when the property is not set; iterating over
-    // it directly would throw a NullPointerException.
-    String[] parseTags = conf.getStrings(parse_CONF_PROPERTY);
-    if (parseTags != null) {
-      for (String metatag : parseTags) {
-        parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag);
-      }
-    }
-
-    // TODO check conflict between field names e.g. could have same label
-    // from different sources
-  }
-
-  /** Returns the configuration most recently passed to {@link #setConf}. */
-  public Configuration getConf() {
-    return this.conf;
-  }
-}