You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:50 UTC
[66/69] [abbrv] nutch git commit: Moved test sources to maven standard directory
Moved test sources to maven standard directory
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/20d28406
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/20d28406
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/20d28406
Branch: refs/heads/NUTCH-2292
Commit: 20d284068bdb918b0eea1a614644c5773bb42a12
Parents: ffa1678
Author: Thamme Gowda <th...@apache.org>
Authored: Tue Jul 5 15:21:52 2016 -0700
Committer: Thamme Gowda <th...@apache.org>
Committed: Tue Jul 5 15:21:52 2016 -0700
----------------------------------------------------------------------
.../nutch/TestCCParseFilter.java | 73 +++
.../nutch/TestCCParseFilter.java | 73 ---
.../apache/nutch/parse/feed/TestFeedParser.java | 124 +++++
.../apache/nutch/parse/feed/TestFeedParser.java | 124 -----
.../anchor/TestAnchorIndexingFilter.java | 67 +++
.../anchor/TestAnchorIndexingFilter.java | 67 ---
.../indexer/basic/TestBasicIndexingFilter.java | 99 ++++
.../indexer/basic/TestBasicIndexingFilter.java | 99 ----
.../indexer/links/TestLinksIndexingFilter.java | 218 +++++++++
.../org/apache/nutch/parse/TestOutlinks.java | 54 +++
.../indexer/links/TestLinksIndexingFilter.java | 218 ---------
.../org/apache/nutch/parse/TestOutlinks.java | 54 ---
.../indexer/more/TestMoreIndexingFilter.java | 123 +++++
.../indexer/more/TestMoreIndexingFilter.java | 123 -----
.../nutch/indexer/replace/TestIndexReplace.java | 456 +++++++++++++++++++
.../nutch/indexer/replace/TestIndexReplace.java | 456 -------------------
.../staticfield/TestStaticFieldIndexerTest.java | 194 ++++++++
.../staticfield/TestStaticFieldIndexerTest.java | 194 --------
.../analysis/lang/TestHTMLLanguageParser.java | 149 ++++++
.../java/org/apache/nutch/analysis/lang/da.test | 108 +++++
.../java/org/apache/nutch/analysis/lang/de.test | 104 +++++
.../java/org/apache/nutch/analysis/lang/el.test | 109 +++++
.../java/org/apache/nutch/analysis/lang/en.test | 105 +++++
.../java/org/apache/nutch/analysis/lang/es.test | 107 +++++
.../java/org/apache/nutch/analysis/lang/fi.test | 106 +++++
.../java/org/apache/nutch/analysis/lang/fr.test | 105 +++++
.../java/org/apache/nutch/analysis/lang/it.test | 109 +++++
.../java/org/apache/nutch/analysis/lang/nl.test | 105 +++++
.../java/org/apache/nutch/analysis/lang/pt.test | 105 +++++
.../java/org/apache/nutch/analysis/lang/sv.test | 108 +++++
.../nutch/analysis/lang/test-referencial.txt | 10 +
.../analysis/lang/TestHTMLLanguageParser.java | 149 ------
.../test/org/apache/nutch/analysis/lang/da.test | 108 -----
.../test/org/apache/nutch/analysis/lang/de.test | 104 -----
.../test/org/apache/nutch/analysis/lang/el.test | 109 -----
.../test/org/apache/nutch/analysis/lang/en.test | 105 -----
.../test/org/apache/nutch/analysis/lang/es.test | 107 -----
.../test/org/apache/nutch/analysis/lang/fi.test | 106 -----
.../test/org/apache/nutch/analysis/lang/fr.test | 105 -----
.../test/org/apache/nutch/analysis/lang/it.test | 109 -----
.../test/org/apache/nutch/analysis/lang/nl.test | 105 -----
.../test/org/apache/nutch/analysis/lang/pt.test | 105 -----
.../test/org/apache/nutch/analysis/lang/sv.test | 108 -----
.../nutch/analysis/lang/test-referencial.txt | 10 -
.../protocol/http/api/TestRobotRulesParser.java | 123 +++++
.../protocol/http/api/TestRobotRulesParser.java | 123 -----
.../filter/MimeTypeIndexingFilterTest.java | 114 +++++
.../filter/MimeTypeIndexingFilterTest.java | 114 -----
.../apache/nutch/parse/ext/TestExtParser.java | 130 ++++++
.../apache/nutch/parse/ext/TestExtParser.java | 130 ------
.../nutch/parse/html/TestDOMContentUtils.java | 347 ++++++++++++++
.../apache/nutch/parse/html/TestHtmlParser.java | 122 +++++
.../parse/html/TestRobotsMetaProcessor.java | 155 +++++++
.../nutch/parse/html/TestDOMContentUtils.java | 347 --------------
.../apache/nutch/parse/html/TestHtmlParser.java | 122 -----
.../parse/html/TestRobotsMetaProcessor.java | 155 -------
.../nutch/parse/metatags/TestMetatagParser.java | 104 +++++
.../nutch/parse/metatags/TestMetatagParser.java | 104 -----
.../nutch/parse/replace/TestParseReplace.java | 68 +++
.../nutch/parse/replace/TestParseReplace.java | 68 ---
.../apache/nutch/parse/swf/TestSWFParser.java | 94 ++++
.../apache/nutch/parse/swf/TestSWFParser.java | 94 ----
.../apache/nutch/tika/TestDOMContentUtils.java | 337 ++++++++++++++
.../org/apache/nutch/tika/TestFeedParser.java | 121 +++++
.../apache/nutch/tika/TestImageMetadata.java | 67 +++
.../org/apache/nutch/tika/TestMSWordParser.java | 92 ++++
.../org/apache/nutch/tika/TestOOParser.java | 107 +++++
.../org/apache/nutch/tika/TestPdfParser.java | 73 +++
.../org/apache/nutch/tika/TestRTFParser.java | 81 ++++
.../nutch/tika/TestRobotsMetaProcessor.java | 156 +++++++
.../apache/nutch/tika/TestDOMContentUtils.java | 337 --------------
.../org/apache/nutch/tika/TestFeedParser.java | 121 -----
.../apache/nutch/tika/TestImageMetadata.java | 67 ---
.../org/apache/nutch/tika/TestMSWordParser.java | 92 ----
.../org/apache/nutch/tika/TestOOParser.java | 107 -----
.../org/apache/nutch/tika/TestPdfParser.java | 73 ---
.../org/apache/nutch/tika/TestRTFParser.java | 81 ----
.../nutch/tika/TestRobotsMetaProcessor.java | 156 -------
.../apache/nutch/parse/zip/TestZipParser.java | 71 +++
.../apache/nutch/parse/zip/TestZipParser.java | 71 ---
.../parsefilter/regex/TestRegexParseFilter.java | 77 ++++
.../parsefilter/regex/TestRegexParseFilter.java | 77 ----
.../nutch/protocol/file/TestProtocolFile.java | 99 ++++
.../nutch/protocol/file/TestProtocolFile.java | 99 ----
.../nutch/protocol/http/TestProtocolHttp.java | 140 ++++++
.../nutch/protocol/http/TestProtocolHttp.java | 140 ------
.../httpclient/TestProtocolHttpClient.java | 217 +++++++++
.../httpclient/TestProtocolHttpClient.java | 217 ---------
.../nutch/collection/TestSubcollection.java | 112 +++++
.../nutch/collection/TestSubcollection.java | 112 -----
.../automaton/TestAutomatonURLFilter.java | 56 +++
.../automaton/TestAutomatonURLFilter.java | 56 ---
.../urlfilter/domain/TestDomainURLFilter.java | 67 +++
.../urlfilter/domain/TestDomainURLFilter.java | 67 ---
.../TestDomainBlacklistURLFilter.java | 49 ++
.../TestDomainBlacklistURLFilter.java | 49 --
.../urlfilter/prefix/TestPrefixURLFilter.java | 79 ++++
.../urlfilter/prefix/TestPrefixURLFilter.java | 79 ----
.../urlfilter/regex/TestRegexURLFilter.java | 61 +++
.../urlfilter/regex/TestRegexURLFilter.java | 61 ---
.../urlfilter/suffix/TestSuffixURLFilter.java | 123 +++++
.../urlfilter/suffix/TestSuffixURLFilter.java | 123 -----
.../urlfilter/validator/TestUrlValidator.java | 79 ++++
.../urlfilter/validator/TestUrlValidator.java | 79 ----
.../ajax/TestAjaxURLNormalizer.java | 67 +++
.../ajax/TestAjaxURLNormalizer.java | 67 ---
.../basic/TestBasicURLNormalizer.java | 175 +++++++
.../basic/TestBasicURLNormalizer.java | 175 -------
.../host/TestHostURLNormalizer.java | 57 +++
.../host/TestHostURLNormalizer.java | 57 ---
.../pass/TestPassURLNormalizer.java | 45 ++
.../pass/TestPassURLNormalizer.java | 45 --
.../protocol/TestProtocolURLNormalizer.java | 55 +++
.../protocol/TestProtocolURLNormalizer.java | 55 ---
.../TestQuerystringURLNormalizer.java | 49 ++
.../TestQuerystringURLNormalizer.java | 49 --
.../regex/TestRegexURLNormalizer.java | 186 ++++++++
.../regex/TestRegexURLNormalizer.java | 186 --------
.../slash/TestSlashURLNormalizer.java | 73 +++
.../slash/TestSlashURLNormalizer.java | 73 ---
120 files changed, 6966 insertions(+), 6966 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java b/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
new file mode 100755
index 0000000..41be9ed
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.*;
+
+public class TestCCParseFilter {
+
+ private static final File testDir = new File(System.getProperty("test.input"));
+
+ @Test
+ public void testPages() throws Exception {
+ pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+ // Tika returns <a> whereas parse-html returns <rel>
+ // check later
+ pageTest(new File(testDir, "rel.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+ // Tika returns <a> whereas parse-html returns <rdf>
+ // check later
+ pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+ }
+
+ public void pageTest(File file, String url, String license, String location,
+ String type) throws Exception {
+
+ String contentType = "text/html";
+ InputStream in = new FileInputStream(file);
+ ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
+ byte[] buffer = new byte[1024];
+ int i;
+ while ((i = in.read(buffer)) != -1) {
+ out.write(buffer, 0, i);
+ }
+ in.close();
+ byte[] bytes = out.toByteArray();
+ Configuration conf = NutchConfiguration.create();
+
+ Content content = new Content(url, url, bytes, contentType, new Metadata(),
+ conf);
+ Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
+ Metadata metadata = parse.getData().getParseMeta();
+ Assert.assertEquals(license, metadata.get("License-Url"));
+ Assert.assertEquals(location, metadata.get("License-Location"));
+ Assert.assertEquals(type, metadata.get("Work-Type"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java b/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
deleted file mode 100755
index 41be9ed..0000000
--- a/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.io.*;
-
-public class TestCCParseFilter {
-
- private static final File testDir = new File(System.getProperty("test.input"));
-
- @Test
- public void testPages() throws Exception {
- pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
- // Tika returns <a> whereas parse-html returns <rel>
- // check later
- pageTest(new File(testDir, "rel.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
- // Tika returns <a> whereas parse-html returns <rdf>
- // check later
- pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
- }
-
- public void pageTest(File file, String url, String license, String location,
- String type) throws Exception {
-
- String contentType = "text/html";
- InputStream in = new FileInputStream(file);
- ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
- byte[] buffer = new byte[1024];
- int i;
- while ((i = in.read(buffer)) != -1) {
- out.write(buffer, 0, i);
- }
- in.close();
- byte[] bytes = out.toByteArray();
- Configuration conf = NutchConfiguration.create();
-
- Content content = new Content(url, url, bytes, contentType, new Metadata(),
- conf);
- Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
- Metadata metadata = parse.getData().getParseMeta();
- Assert.assertEquals(license, metadata.get("License-Url"));
- Assert.assertEquals(location, metadata.get("License-Location"));
- Assert.assertEquals(type, metadata.get("Work-Type"));
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
new file mode 100644
index 0000000..36c8739
--- /dev/null
+++ b/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.Map;
+
+import org.junit.Assert;
+import org.junit.Test;
+// APACHE imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ *
+ * @author mattmann
+ *
+ * Test Suite for the {@link FeedParser}.
+ *
+ */
+public class TestFeedParser {
+
+ private String fileSeparator = System.getProperty("file.separator");
+
+ // This system property is defined in ./src/plugin/build-plugin.xml
+ private String sampleDir = System.getProperty("test.data", ".");
+
+ // Make sure sample files are copied to "test.data" as specified in
+ // ./src/plugin/feed/build.xml during plugin compilation.
+
+ private String[] sampleFiles = { "rsstest.rss" };
+
+ public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
+ .getName());
+
+ /**
+ * Calls the {@link FeedParser} on a sample RSS file and checks that there are
+ * 3 {@link ParseResult} entries including the below 2 links:
+ * <ul>
+ * <li>http://www-scf.usc.edu/~mattmann/</li>
+ * <li>http://www.nutch.org</li>
+ * </ul>
+ *
+ *
+ * @throws ProtocolNotFound
+ * If the {@link Protocol}Layer cannot be loaded (required to fetch
+ * the {@link Content} for the RSS file).
+ * @throws ParseException
+ * If the {@link Parser}Layer cannot be loaded.
+ */
+ @Test
+ public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
+ String urlString;
+ Protocol protocol;
+ Content content;
+ ParseResult parseResult;
+
+ Configuration conf = NutchConfiguration.create();
+ for (int i = 0; i < sampleFiles.length; i++) {
+ urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+ urlString = urlString.replace('\\', '/');
+
+ protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ content = protocol.getProtocolOutput(new Text(urlString),
+ new CrawlDatum()).getContent();
+
+ parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
+
+ Assert.assertEquals(3, parseResult.size());
+
+ boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
+
+ for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
+ .hasNext();) {
+ Map.Entry<Text, Parse> entry = j.next();
+ if (entry.getKey().toString()
+ .equals("http://www-scf.usc.edu/~mattmann/")) {
+ hasLink1 = true;
+ } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
+ hasLink2 = true;
+ } else if (entry.getKey().toString().equals(urlString)) {
+ hasLink3 = true;
+ }
+
+ Assert.assertNotNull(entry.getValue());
+ Assert.assertNotNull(entry.getValue().getData());
+ }
+
+ if (!hasLink1 || !hasLink2 || !hasLink3) {
+ Assert.fail("Outlinks read from sample rss file are not correct!");
+ }
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
deleted file mode 100644
index 36c8739..0000000
--- a/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.feed;
-
-// JDK imports
-import java.util.Iterator;
-import java.util.Map;
-
-import org.junit.Assert;
-import org.junit.Test;
-// APACHE imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.ProtocolNotFound;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- *
- * @author mattmann
- *
- * Test Suite for the {@link FeedParser}.
- *
- */
-public class TestFeedParser {
-
- private String fileSeparator = System.getProperty("file.separator");
-
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
-
- // Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/feed/build.xml during plugin compilation.
-
- private String[] sampleFiles = { "rsstest.rss" };
-
- public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
- .getName());
-
- /**
- * Calls the {@link FeedParser} on a sample RSS file and checks that there are
- * 3 {@link ParseResult} entries including the below 2 links:
- * <ul>
- * <li>http://www-scf.usc.edu/~mattmann/</li>
- * <li>http://www.nutch.org</li>
- * </ul>
- *
- *
- * @throws ProtocolNotFound
- * If the {@link Protocol}Layer cannot be loaded (required to fetch
- * the {@link Content} for the RSS file).
- * @throws ParseException
- * If the {@link Parser}Layer cannot be loaded.
- */
- @Test
- public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
- String urlString;
- Protocol protocol;
- Content content;
- ParseResult parseResult;
-
- Configuration conf = NutchConfiguration.create();
- for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
- urlString = urlString.replace('\\', '/');
-
- protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
-
- parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
-
- Assert.assertEquals(3, parseResult.size());
-
- boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
-
- for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
- .hasNext();) {
- Map.Entry<Text, Parse> entry = j.next();
- if (entry.getKey().toString()
- .equals("http://www-scf.usc.edu/~mattmann/")) {
- hasLink1 = true;
- } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
- hasLink2 = true;
- } else if (entry.getKey().toString().equals(urlString)) {
- hasLink3 = true;
- }
-
- Assert.assertNotNull(entry.getValue());
- Assert.assertNotNull(entry.getValue().getData());
- }
-
- if (!hasLink1 || !hasLink2 || !hasLink3) {
- Assert.fail("Outlinks read from sample rss file are not correct!");
- }
- }
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
new file mode 100644
index 0000000..08a42f3
--- /dev/null
+++ b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.anchor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
+ * deduplication functionality is working
+ *
+ * @author lewismc
+ *
+ */
+public class TestAnchorIndexingFilter {
+
+ @Test
+ public void testDeduplicateAnchor() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.setBoolean("anchorIndexingFilter.deduplicate", true);
+ AnchorIndexingFilter filter = new AnchorIndexingFilter();
+ filter.setConf(conf);
+ Assert.assertNotNull(filter);
+ NutchDocument doc = new NutchDocument();
+ ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://test1.com/", "text1"));
+ inlinks.add(new Inlink("http://test2.com/", "text2"));
+ inlinks.add(new Inlink("http://test3.com/", "text2"));
+ try {
+ filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+ new CrawlDatum(), inlinks);
+ } catch (Exception e) {
+ e.printStackTrace();
+ Assert.fail(e.getMessage());
+ }
+ Assert.assertNotNull(doc);
+ Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
+ .contains("anchor"));
+ Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
+ .getValues().size());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
deleted file mode 100644
index 08a42f3..0000000
--- a/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.anchor;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
- * deduplication functionality is working
- *
- * @author lewismc
- *
- */
-public class TestAnchorIndexingFilter {
-
- @Test
- public void testDeduplicateAnchor() throws Exception {
- Configuration conf = NutchConfiguration.create();
- conf.setBoolean("anchorIndexingFilter.deduplicate", true);
- AnchorIndexingFilter filter = new AnchorIndexingFilter();
- filter.setConf(conf);
- Assert.assertNotNull(filter);
- NutchDocument doc = new NutchDocument();
- ParseImpl parse = new ParseImpl("foo bar", new ParseData());
- Inlinks inlinks = new Inlinks();
- inlinks.add(new Inlink("http://test1.com/", "text1"));
- inlinks.add(new Inlink("http://test2.com/", "text2"));
- inlinks.add(new Inlink("http://test3.com/", "text2"));
- try {
- filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
- new CrawlDatum(), inlinks);
- } catch (Exception e) {
- e.printStackTrace();
- Assert.fail(e.getMessage());
- }
- Assert.assertNotNull(doc);
- Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
- .contains("anchor"));
- Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
- .getValues().size());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
new file mode 100644
index 0000000..4bc317e
--- /dev/null
+++ b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.basic.BasicIndexingFilter;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Date;
+
+/**
+ * JUnit test case which tests 1. that basic searchable fields are added to a
+ * document 2. that domain is added as per {@code indexer.add.domain} in
+ * nutch-default.xml. 3. that title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
+ * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+ *
+ * @author tejasp
+ *
+ */
+
+public class TestBasicIndexingFilter {
+
+ @Test
+ public void testBasicIndexingFilter() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.setInt("indexer.max.title.length", 10);
+ conf.setBoolean("indexer.add.domain", true);
+ conf.setInt("indexer.max.content.length", 20);
+
+ BasicIndexingFilter filter = new BasicIndexingFilter();
+ filter.setConf(conf);
+ Assert.assertNotNull(filter);
+
+ NutchDocument doc = new NutchDocument();
+
+ String title = "The Foo Page";
+ Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
+ Metadata metaData = new Metadata();
+ metaData.add("Language", "en/us");
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+ outlinks, metaData);
+ ParseImpl parse = new ParseImpl(
+ "this is a sample foo bar page. hope you enjoy it.", parseData);
+
+ CrawlDatum crawlDatum = new CrawlDatum();
+ crawlDatum.setFetchTime(100L);
+
+ Inlinks inlinks = new Inlinks();
+
+ try {
+ filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+ crawlDatum, inlinks);
+ } catch (Exception e) {
+ e.printStackTrace();
+ Assert.fail(e.getMessage());
+ }
+ Assert.assertNotNull(doc);
+ Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
+ .getField("title").getValues().get(0));
+ Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
+ .getField("domain").getValues().get(0));
+ Assert.assertEquals("test host, expect \"nutch.apache.org\"",
+ "nutch.apache.org", doc.getField("host").getValues().get(0));
+ Assert.assertEquals(
+ "test url, expect \"http://nutch.apache.org/index.html\"",
+ "http://nutch.apache.org/index.html", doc.getField("url").getValues()
+ .get(0));
+ Assert.assertEquals("test content", "this is a sample foo",
+ doc.getField("content").getValues().get(0));
+ Assert.assertEquals("test fetch time", new Date(100L),
+ (Date) doc.getField("tstamp").getValues().get(0));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
deleted file mode 100644
index 4bc317e..0000000
--- a/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.basic;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.basic.BasicIndexingFilter;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.util.Date;
-
-/**
- * JUnit test case which tests 1. that basic searchable fields are added to a
- * document 2. that domain is added as per {@code indexer.add.domain} in
- * nutch-default.xml. 3. that title is truncated as per
- * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
- * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
- *
- * @author tejasp
- *
- */
-
-public class TestBasicIndexingFilter {
-
- @Test
- public void testBasicIndexingFilter() throws Exception {
- Configuration conf = NutchConfiguration.create();
- conf.setInt("indexer.max.title.length", 10);
- conf.setBoolean("indexer.add.domain", true);
- conf.setInt("indexer.max.content.length", 20);
-
- BasicIndexingFilter filter = new BasicIndexingFilter();
- filter.setConf(conf);
- Assert.assertNotNull(filter);
-
- NutchDocument doc = new NutchDocument();
-
- String title = "The Foo Page";
- Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
- Metadata metaData = new Metadata();
- metaData.add("Language", "en/us");
- ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
- outlinks, metaData);
- ParseImpl parse = new ParseImpl(
- "this is a sample foo bar page. hope you enjoy it.", parseData);
-
- CrawlDatum crawlDatum = new CrawlDatum();
- crawlDatum.setFetchTime(100L);
-
- Inlinks inlinks = new Inlinks();
-
- try {
- filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
- crawlDatum, inlinks);
- } catch (Exception e) {
- e.printStackTrace();
- Assert.fail(e.getMessage());
- }
- Assert.assertNotNull(doc);
- Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
- .getField("title").getValues().get(0));
- Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
- .getField("domain").getValues().get(0));
- Assert.assertEquals("test host, expect \"nutch.apache.org\"",
- "nutch.apache.org", doc.getField("host").getValues().get(0));
- Assert.assertEquals(
- "test url, expect \"http://nutch.apache.org/index.html\"",
- "http://nutch.apache.org/index.html", doc.getField("url").getValues()
- .get(0));
- Assert.assertEquals("test content", "this is a sample foo",
- doc.getField("content").getValues().get(0));
- Assert.assertEquals("test fetch time", new Date(100L),
- (Date) doc.getField("tstamp").getValues().get(0));
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
new file mode 100644
index 0000000..c490d1f
--- /dev/null
+++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.net.URL;
+import java.util.Iterator;
+
+public class TestLinksIndexingFilter {
+
+ Configuration conf = NutchConfiguration.create();
+ LinksIndexingFilter filter = new LinksIndexingFilter();
+ Metadata metadata = new Metadata();
+
+ @Before
+ public void setUp() throws Exception {
+ metadata.add(Response.CONTENT_TYPE, "text/html");
+ }
+
+ private Outlink[] generateOutlinks() throws Exception {
+ return generateOutlinks(false);
+ }
+
+ private Outlink[] generateOutlinks(boolean parts) throws Exception {
+ Outlink[] outlinks = new Outlink[2];
+
+ outlinks[0] = new Outlink("http://www.test.com", "test");
+ outlinks[1] = new Outlink("http://www.example.com", "example");
+
+ if (parts) {
+ outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1",
+ "test");
+ outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2",
+ "test");
+ }
+
+ return outlinks;
+ }
+
+ @Test
+ public void testFilterOutlinks() throws Exception {
+ conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+ filter.setConf(conf);
+
+ Outlink[] outlinks = generateOutlinks();
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+ Assert.assertEquals("Filter outlinks, allow only those from a different host",
+ outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
+ }
+
+ @Test
+ public void testFilterInlinks() throws Exception {
+ conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+ filter.setConf(conf);
+
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://www.test.com", "test"));
+ inlinks.add(new Inlink("http://www.example.com", "example"));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+ Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+ Assert.assertEquals("Filter inlinks, allow only those from a different host",
+ "http://www.test.com", doc.getFieldValue("inlinks"));
+ }
+
+ @Test
+ public void testNoFilterOutlinks() throws Exception {
+ filter.setConf(conf);
+
+ Outlink[] outlinks = generateOutlinks();
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ Assert.assertEquals("All outlinks must be indexed even those from the same host",
+ outlinks.length, doc.getField("outlinks").getValues().size());
+ }
+
+ @Test
+ public void testNoFilterInlinks() throws Exception {
+ conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
+ filter.setConf(conf);
+
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://www.test.com", "test"));
+ inlinks.add(new Inlink("http://www.example.com", "example"));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+ Assert.assertEquals("All inlinks must be indexed even those from the same host",
+ inlinks.size(), doc.getField("inlinks").getValues().size());
+ }
+
+ @Test
+ public void testIndexOnlyHostPart() throws Exception {
+ conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+ conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+ conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+ filter.setConf(conf);
+
+ Outlink[] outlinks = generateOutlinks(true);
+
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
+ inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
+ inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
+ "example"));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+ NutchField docOutlinks = doc.getField("outlinks");
+
+ Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
+ new URL("http://www.test.com").getHost(),
+ docOutlinks.getValues().get(0));
+
+ Assert.assertEquals(
+ "The inlinks coming from the same host must count only once", 1,
+ doc.getField("inlinks").getValues().size());
+
+ Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
+ new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
+ }
+
+ @Test
+ public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+ conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+
+ Outlink[] outlinks = generateOutlinks(true);
+
+ filter.setConf(conf);
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+ Assert.assertEquals(
+ "Index only the host portion of the outlinks after filtering",
+ new URL("http://www.test.com").getHost(),
+ doc.getFieldValue("outlinks"));
+ }
+
+ @Test
+ public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+ conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+
+ filter.setConf(conf);
+
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://www.test.com", "test"));
+ inlinks.add(new Inlink("http://www.example.com", "example"));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+ Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+ Assert.assertEquals(
+ "Index only the host portion of the inlinks after filtering",
+ new URL("http://www.test.com").getHost(),
+ doc.getFieldValue("inlinks"));
+
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
new file mode 100644
index 0000000..aaaedbf
--- /dev/null
+++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.junit.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.*;
+
+public class TestOutlinks {
+
+ @Test
+ public void testAddSameObject() throws Exception {
+ Set<Outlink> set = new HashSet<>();
+
+ Outlink o = new Outlink("http://www.example.com", "Example");
+ set.add(o);
+ set.add(o);
+
+ assertEquals("Adding the same Outlink twice", 1, set.size());
+ }
+
+ @Test
+ public void testAddOtherObjectWithSameData() throws Exception {
+ Set<Outlink> set = new HashSet<>();
+
+ Outlink o = new Outlink("http://www.example.com", "Example");
+ Outlink o1 = new Outlink("http://www.example.com", "Example");
+
+ assertTrue("The two Outlink objects are the same", o.equals(o1));
+
+ set.add(o);
+ set.add(o1);
+
+ assertEquals("The set should contain only 1 Outlink", 1, set.size());
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
deleted file mode 100644
index c490d1f..0000000
--- a/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.links;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.NutchField;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.net.URL;
-import java.util.Iterator;
-
-public class TestLinksIndexingFilter {
-
- Configuration conf = NutchConfiguration.create();
- LinksIndexingFilter filter = new LinksIndexingFilter();
- Metadata metadata = new Metadata();
-
- @Before
- public void setUp() throws Exception {
- metadata.add(Response.CONTENT_TYPE, "text/html");
- }
-
- private Outlink[] generateOutlinks() throws Exception {
- return generateOutlinks(false);
- }
-
- private Outlink[] generateOutlinks(boolean parts) throws Exception {
- Outlink[] outlinks = new Outlink[2];
-
- outlinks[0] = new Outlink("http://www.test.com", "test");
- outlinks[1] = new Outlink("http://www.example.com", "example");
-
- if (parts) {
- outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1",
- "test");
- outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2",
- "test");
- }
-
- return outlinks;
- }
-
- @Test
- public void testFilterOutlinks() throws Exception {
- conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
- filter.setConf(conf);
-
- Outlink[] outlinks = generateOutlinks();
-
- NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", outlinks, metadata)),
- new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
- Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
-
- Assert.assertEquals("Filter outlinks, allow only those from a different host",
- outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
- }
-
- @Test
- public void testFilterInlinks() throws Exception {
- conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
- filter.setConf(conf);
-
- Inlinks inlinks = new Inlinks();
- inlinks.add(new Inlink("http://www.test.com", "test"));
- inlinks.add(new Inlink("http://www.example.com", "example"));
-
- NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
- new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
- Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
-
- Assert.assertEquals("Filter inlinks, allow only those from a different host",
- "http://www.test.com", doc.getFieldValue("inlinks"));
- }
-
- @Test
- public void testNoFilterOutlinks() throws Exception {
- filter.setConf(conf);
-
- Outlink[] outlinks = generateOutlinks();
-
- NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", outlinks, metadata)),
- new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
- Assert.assertEquals("All outlinks must be indexed even those from the same host",
- outlinks.length, doc.getField("outlinks").getValues().size());
- }
-
- @Test
- public void testNoFilterInlinks() throws Exception {
- conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
- filter.setConf(conf);
-
- Inlinks inlinks = new Inlinks();
- inlinks.add(new Inlink("http://www.test.com", "test"));
- inlinks.add(new Inlink("http://www.example.com", "example"));
-
- NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
- new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
- Assert.assertEquals("All inlinks must be indexed even those from the same host",
- inlinks.size(), doc.getField("inlinks").getValues().size());
- }
-
- @Test
- public void testIndexOnlyHostPart() throws Exception {
- conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
- conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
- conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
- filter.setConf(conf);
-
- Outlink[] outlinks = generateOutlinks(true);
-
- Inlinks inlinks = new Inlinks();
- inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
- inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
- inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
- "example"));
-
- NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", outlinks, metadata)),
- new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
- NutchField docOutlinks = doc.getField("outlinks");
-
- Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
- new URL("http://www.test.com").getHost(),
- docOutlinks.getValues().get(0));
-
- Assert.assertEquals(
- "The inlinks coming from the same host must count only once", 1,
- doc.getField("inlinks").getValues().size());
-
- Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
- new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
- }
-
- @Test
- public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
- conf = NutchConfiguration.create();
- conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
- conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-
- Outlink[] outlinks = generateOutlinks(true);
-
- filter.setConf(conf);
-
- NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", outlinks, metadata)),
- new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
- Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
-
- Assert.assertEquals(
- "Index only the host portion of the outlinks after filtering",
- new URL("http://www.test.com").getHost(),
- doc.getFieldValue("outlinks"));
- }
-
- @Test
- public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
- conf = NutchConfiguration.create();
- conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
- conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-
- filter.setConf(conf);
-
- Inlinks inlinks = new Inlinks();
- inlinks.add(new Inlink("http://www.test.com", "test"));
- inlinks.add(new Inlink("http://www.example.com", "example"));
-
- NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
- new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
- Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
-
- Assert.assertEquals(
- "Index only the host portion of the inlinks after filtering",
- new URL("http://www.test.com").getHost(),
- doc.getFieldValue("inlinks"));
-
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java b/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
deleted file mode 100644
index aaaedbf..0000000
--- a/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse;
-
-import org.junit.Test;
-
-import java.util.HashSet;
-import java.util.Set;
-
-import static org.junit.Assert.*;
-
-public class TestOutlinks {
-
- @Test
- public void testAddSameObject() throws Exception {
- Set<Outlink> set = new HashSet<>();
-
- Outlink o = new Outlink("http://www.example.com", "Example");
- set.add(o);
- set.add(o);
-
- assertEquals("Adding the same Outlink twice", 1, set.size());
- }
-
- @Test
- public void testAddOtherObjectWithSameData() throws Exception {
- Set<Outlink> set = new HashSet<>();
-
- Outlink o = new Outlink("http://www.example.com", "Example");
- Outlink o1 = new Outlink("http://www.example.com", "Example");
-
- assertTrue("The two Outlink objects are the same", o.equals(o1));
-
- set.add(o);
- set.add(o1);
-
- assertEquals("The set should contain only 1 Outlink", 1, set.size());
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
new file mode 100644
index 0000000..f918dde
--- /dev/null
+++ b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.more;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link MoreIndexingFilter}: the "type" field produced from
+ * {@link Response#CONTENT_TYPE} metadata, the splitting of a MIME type into
+ * parts, and the "title" field produced from
+ * {@link Response#CONTENT_DISPOSITION} metadata.
+ */
+public class TestMoreIndexingFilter {
+
+  /** A charset parameter must not leak into the "type" field. */
+  @Test
+  public void testContentType() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    assertContentType(conf, "text/html", "text/html");
+    assertContentType(conf, "text/html; charset=UTF-8", "text/html");
+  }
+
+  /** "text/html" must be split into the two parts "text" and "html". */
+  @Test
+  public void testGetParts() {
+    String[] parts = MoreIndexingFilter.getParts("text/html");
+    assertParts(parts, 2, "text", "html");
+  }
+
+  /**
+   * With <code>moreIndexingFilter.indexMimeTypeParts</code> set to false the
+   * "type" field must carry exactly one value, the complete MIME type.
+   *
+   * @throws IndexingException
+   *           if the filter fails; left to propagate so that JUnit reports the
+   *           full stack trace instead of only the exception message
+   * @since NUTCH-901
+   */
+  @Test
+  public void testNoParts() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+
+    // Check the document handed back by the filter, as the other tests do; a
+    // null check on a locally constructed object could never fail.
+    NutchDocument doc = filter.filter(new NutchDocument(), parse, new Text(
+        "http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
+
+    Assert.assertNotNull("filter returned no document", doc);
+    Assert.assertTrue(doc.getFieldNames().contains("type"));
+    Assert.assertEquals(1, doc.getField("type").getValues().size());
+    Assert.assertEquals("text/html", doc.getFieldValue("type"));
+  }
+
+  /**
+   * The file name given in the content-disposition metadata becomes the
+   * title, unless the document already has a title (NUTCH-1140).
+   */
+  @Test
+  public void testContentDispositionTitle() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+
+    Metadata metadata = new Metadata();
+    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+
+    Text url = new Text("http://www.example.com/");
+    ParseImpl parseImpl = new ParseImpl("text", new ParseData(
+        new ParseStatus(), "title", new Outlink[0], metadata));
+
+    NutchDocument doc = new NutchDocument();
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals("content-disposition not detected", "filename.ext",
+        doc.getFieldValue("title"));
+
+    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
+    doc = new NutchDocument();
+    doc.add("title", "title");
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+    Assert.assertEquals("do not add second title by content-disposition",
+        "title", doc.getFieldValue("title"));
+  }
+
+  /**
+   * Asserts that <code>parts</code> has <code>count</code> entries and that
+   * its leading entries equal <code>expected</code>, in order.
+   */
+  private void assertParts(String[] parts, int count, String... expected) {
+    Assert.assertEquals(count, parts.length);
+    for (int i = 0; i < expected.length; i++) {
+      Assert.assertEquals(expected[i], parts[i]);
+    }
+  }
+
+  /**
+   * Runs a freshly configured filter over a parse whose only metadata entry is
+   * {@link Response#CONTENT_TYPE} = <code>source</code> and asserts that the
+   * "type" field of the resulting document equals <code>expected</code>.
+   */
+  private void assertContentType(Configuration conf, String source,
+      String expected) throws IndexingException {
+    Metadata metadata = new Metadata();
+    metadata.add(Response.CONTENT_TYPE, source);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(
+        "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
+            metadata)), new Text("http://www.example.com/"), new CrawlDatum(),
+        new Inlinks());
+    Assert.assertEquals("mime type not detected", expected,
+        doc.getFieldValue("type"));
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
deleted file mode 100644
index f918dde..0000000
--- a/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.more;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestMoreIndexingFilter { // JUnit tests for MoreIndexingFilter
-
- @Test
- public void testContentType() throws IndexingException { // "type" must hold the MIME type without the charset parameter
- Configuration conf = NutchConfiguration.create();
- assertContentType(conf, "text/html", "text/html");
- assertContentType(conf, "text/html; charset=UTF-8", "text/html");
- }
-
- @Test
- public void testGetParts() { // "text/html" must split into "text" and "html"
- String[] parts = MoreIndexingFilter.getParts("text/html");
- assertParts(parts, 2, "text", "html");
- }
-
- /**
- * @since NUTCH-901
- */
- @Test
- public void testNoParts() {
- Configuration conf = NutchConfiguration.create();
- conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false); // switch off indexing of the separate MIME type parts
- MoreIndexingFilter filter = new MoreIndexingFilter();
- filter.setConf(conf);
- Assert.assertNotNull(filter); // cannot fail: filter was just created with new
- NutchDocument doc = new NutchDocument();
- ParseImpl parse = new ParseImpl("foo bar", new ParseData());
-
- try {
- filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
- new CrawlDatum(), new Inlinks());
- } catch (Exception e) { // any exception fails the test; only its message is passed to Assert.fail
- e.printStackTrace();
- Assert.fail(e.getMessage());
- }
- Assert.assertNotNull(doc); // cannot fail: doc is the local instance created above
- Assert.assertTrue(doc.getFieldNames().contains("type"));
- Assert.assertEquals(1, doc.getField("type").getValues().size()); // exactly one value ...
- Assert.assertEquals("text/html", doc.getFieldValue("type")); // ... and it is the full MIME type
- }
-
- @Test
- public void testContentDispositionTitle() throws IndexingException { // title taken from the content-disposition filename
- Configuration conf = NutchConfiguration.create();
-
- Metadata metadata = new Metadata();
- metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
- MoreIndexingFilter filter = new MoreIndexingFilter();
- filter.setConf(conf);
-
- Text url = new Text("http://www.example.com/");
- ParseImpl parseImpl = new ParseImpl("text", new ParseData(
- new ParseStatus(), "title", new Outlink[0], metadata));
-
- NutchDocument doc = new NutchDocument();
- doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
-
- Assert.assertEquals("content-disposition not detected", "filename.ext",
- doc.getFieldValue("title"));
-
- /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
- doc = new NutchDocument();
- doc.add("title", "title"); // the document already carries a title before filtering
- doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
- Assert.assertEquals("do not add second title by content-disposition",
- "title", doc.getFieldValue("title"));
- }
-
- private void assertParts(String[] parts, int count, String... expected) { // checks the length, then the leading entries in order
- Assert.assertEquals(count, parts.length);
- for (int i = 0; i < expected.length; i++) {
- Assert.assertEquals(expected[i], parts[i]);
- }
- }
-
- private void assertContentType(Configuration conf, String source,
- String expected) throws IndexingException { // filters a parse whose metadata holds CONTENT_TYPE = source, then checks "type"
- Metadata metadata = new Metadata();
- metadata.add(Response.CONTENT_TYPE, source);
- MoreIndexingFilter filter = new MoreIndexingFilter();
- filter.setConf(conf);
- NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(
- "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
- metadata)), new Text("http://www.example.com/"), new CrawlDatum(),
- new Inlinks());
- Assert.assertEquals("mime type not detected", expected,
- doc.getFieldValue("type"));
- }
-}