You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jo...@apache.org on 2015/02/23 03:53:25 UTC
svn commit: r1661600 - in /nutch/trunk: ./ conf/ src/plugin/
src/plugin/mimetype-filter/ src/plugin/mimetype-filter/sample/
src/plugin/mimetype-filter/src/ src/plugin/mimetype-filter/src/java/
src/plugin/mimetype-filter/src/java/org/ src/plugin/mimetyp...
Author: jorgelbg
Date: Mon Feb 23 02:53:24 2015
New Revision: 1661600
URL: http://svn.apache.org/r1661600
Log:
NUTCH-1928 Indexing filter of documents by the MIME type
Added:
nutch/trunk/src/plugin/mimetype-filter/
nutch/trunk/src/plugin/mimetype-filter/build.xml
nutch/trunk/src/plugin/mimetype-filter/ivy.xml
nutch/trunk/src/plugin/mimetype-filter/plugin.xml
nutch/trunk/src/plugin/mimetype-filter/sample/
nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt
nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt
nutch/trunk/src/plugin/mimetype-filter/src/
nutch/trunk/src/plugin/mimetype-filter/src/java/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
nutch/trunk/src/plugin/mimetype-filter/src/test/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
Modified:
nutch/trunk/build.xml
nutch/trunk/conf/nutch-default.xml
nutch/trunk/default.properties
nutch/trunk/src/plugin/build.xml
Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Mon Feb 23 02:53:24 2015
@@ -178,6 +178,7 @@
<packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-static/src/java"/>
+ <packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
<packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
@@ -584,6 +585,7 @@
<packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/index-static/src/java"/>
+ <packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
<packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
@@ -969,6 +971,8 @@
<source path="${plugins.dir}/index-basic/src/test/" />
<source path="${plugins.dir}/index-geoip/src/java/" />
<source path="${plugins.dir}/index-geoip/src/test/" />
+ <source path="${plugins.dir}/mimetype-filter/src/java/" />
+ <source path="${plugins.dir}/mimetype-filter/src/test/" />
<source path="${plugins.dir}/indexer-dummy/src/java/" />
<source path="${plugins.dir}/indexer-solr/src/java/" />
<source path="${plugins.dir}/indexer-elastic/src/java/" />
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Feb 23 02:53:24 2015
@@ -1602,4 +1602,15 @@
<description>Whether to support multivalued headings.</description>
</property>
+<!-- mimetype-filter plugin properties -->
+
+<property>
+ <name>mimetype.filter.file</name>
+ <value>mimetype-filter.txt</value>
+ <description>
+ The configuration file for the mimetype-filter plugin. This file contains
+ the rules used to allow or deny the indexing of certain documents.
+ </description>
+</property>
+
</configuration>
Modified: nutch/trunk/default.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1661600&r1=1661599&r2=1661600&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Mon Feb 23 02:53:24 2015
@@ -148,6 +148,7 @@ plugins.index=\
org.apache.nutch.indexer.basic*:\
org.apache.nutch.indexer.feed*:\
org.apache.nutch.indexer.geoip*:\
+ org.apache.nutch.indexer.filter*:\
org.apache.nutch.indexer.metadata*:\
org.apache.nutch.indexer.more*:\
org.apache.nutch.indexer.static*:\
Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Mon Feb 23 02:53:24 2015
@@ -35,6 +35,7 @@
<ant dir="index-more" target="deploy"/>
<ant dir="index-static" target="deploy"/>
<ant dir="index-metadata" target="deploy"/>
+ <ant dir="mimetype-filter" target="deploy"/>
<ant dir="indexer-dummy" target="deploy"/>
<ant dir="indexer-elastic" target="deploy"/>
<ant dir="indexer-solr" target="deploy"/>
@@ -88,6 +89,7 @@
<ant dir="index-geoip" target="test"/>
<ant dir="index-more" target="test"/>
<ant dir="index-static" target="test"/>
+ <ant dir="mimetype-filter" target="test"/>
<ant dir="language-identifier" target="test"/>
<ant dir="lib-http" target="test"/>
<ant dir="protocol-file" target="test"/>
@@ -126,10 +128,11 @@
<ant dir="headings" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-anchor" target="clean"/>
- <ant dir="index-geoip" target="clean"/>
+ <ant dir="index-geoip" target="clean"/>
<ant dir="index-more" target="clean"/>
<ant dir="index-static" target="clean"/>
<ant dir="index-metadata" target="clean"/>
+ <ant dir="mimetype-filter" target="clean"/>
<ant dir="indexer-dummy" target="clean"/>
<ant dir="indexer-elastic" target="clean"/>
<ant dir="indexer-solr" target="clean"/>
Added: nutch/trunk/src/plugin/mimetype-filter/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/build.xml?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/build.xml (added)
+++ nutch/trunk/src/plugin/mimetype-filter/build.xml Mon Feb 23 02:53:24 2015
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="mimetype-filter" default="jar-core">
+
+ <!-- pull in the build logic shared by the plugins (one directory up) -->
+ <import file="../build-plugin.xml" />
+
+ <!--
+ for junit test: copy the sample rule files (sample/**/*.txt) into the
+ test data directory. Note that these two tasks are declared directly
+ under the project element, not inside a target.
+ -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample" includes="**/*.txt"/>
+ </copy>
+
+</project>
Added: nutch/trunk/src/plugin/mimetype-filter/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/ivy.xml?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/ivy.xml (added)
+++ nutch/trunk/src/plugin/mimetype-filter/ivy.xml Mon Feb 23 02:53:24 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Ivy module descriptor of the plugin; the module name is taken from the Ant project name -->
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <!-- configurations are shared: they come from the file under ${nutch.root}/ivy -->
+ <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <!--
+ No plugin specific dependencies are declared. NOTE(review): the filter
+ imports org.apache.tika.Tika, which is presumably provided by the core
+ classpath; confirm.
+ -->
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/mimetype-filter/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/plugin.xml?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/plugin.xml (added)
+++ nutch/trunk/src/plugin/mimetype-filter/plugin.xml Mon Feb 23 02:53:24 2015
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ Plugin descriptor of the mimetype-filter plugin: registers
+ MimeTypeIndexingFilter as an implementation of the IndexingFilter
+ extension point.
+-->
+<plugin
+   id="mimetype-filter"
+   name="Filter indexed documents by the detected MIME"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="mimetype-filter.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <!--
+    The extension below plugs into an extension point that is declared by
+    the nutch-extensionpoints plugin, so that dependency is stated
+    explicitly.
+   -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.filter"
+              name="Nutch MIME filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="MimeTypeIndexingFilter"
+                      class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/>
+   </extension>
+
+</plugin>
Added: nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt (added)
+++ nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt Mon Feb 23 02:53:24 2015
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype per line, with no preceding
+# whitespace. The order in which mimetypes are specified doesn't matter. Blank
+# lines and comments (#) are allowed.
+#
+
+-
+
+image
Added: nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt (added)
+++ nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt Mon Feb 23 02:53:24 2015
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype per line, with no preceding
+# whitespace. The order in which mimetypes are specified doesn't matter. Blank
+# lines and comments (#) are allowed.
+#
+
++
+
+text/html
\ No newline at end of file
Added: nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java (added)
+++ nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java Mon Feb 23 02:53:24 2015
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+import org.apache.tika.Tika;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
+ * of documents based on the MIME Type detected by Tika
+ *
+ */
+public class MimeTypeIndexingFilter implements IndexingFilter {
+
+ public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MimeTypeIndexingFilter.class);
+
+ private MimeUtil MIME;
+ private Tika tika = new Tika();
+
+ private TrieStringMatcher trie;
+
+ private Configuration conf;
+
+ private boolean acceptMode = true;
+
+ // Inherited JavaDoc
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+ String mimeType;
+ String contentType;
+
+ Writable tcontentType = datum.getMetaData()
+ .get(new Text(Response.CONTENT_TYPE));
+
+ if (tcontentType != null) {
+ contentType = tcontentType.toString();
+ } else {
+ contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
+ }
+
+ if (contentType == null) {
+ mimeType = tika.detect(url.toString());
+ } else {
+ mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
+ }
+
+ contentType = mimeType;
+
+ if (LOG.isInfoEnabled()) {
+ LOG.info(String.format("[%s] %s", contentType, url));
+ }
+
+ if (null != trie) {
+ if (trie.shortestMatch(contentType) == null) {
+ // no match, but
+ if (acceptMode) {
+ return doc;
+ }
+ return null;
+ } else {
+ // matched, but we are blocking
+ if (acceptMode) {
+ return null;
+ }
+ }
+ }
+
+ return doc;
+ }
+
+ /*
+ * -----------------------------
+ * <implementation:Configurable> *
+ * -----------------------------
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ MIME = new MimeUtil(conf);
+
+ // load the file of the values
+ String file = conf.get(MIMEFILTER_REGEX_FILE, "");
+
+ if (file != null) {
+ if (file.isEmpty()) {
+ LOG.warn(String
+ .format("Missing %s property, ALL mimetypes will be allowed",
+ MIMEFILTER_REGEX_FILE));
+ } else {
+ Reader reader = conf.getConfResourceAsReader(file);
+
+ try {
+ readConfiguration(reader);
+ } catch (IOException e) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
+
+ throw new RuntimeException(e.getMessage(), e);
+ }
+ }
+ }
+ }
+
+ private void readConfiguration(Reader reader) throws IOException {
+ BufferedReader in = new BufferedReader(reader);
+ String line;
+ List rules = new ArrayList();
+
+ while (null != (line = in.readLine())) {
+ if (line.length() == 0) {
+ continue;
+ }
+
+ char first = line.charAt(0);
+ switch (first) {
+ case ' ':
+ case '\n':
+ case '#': // skip blank & comment lines
+ break;
+ case '+':
+ acceptMode = true;
+ break;
+ case '-':
+ acceptMode = false;
+ break;
+ default:
+ rules.add(line);
+ break;
+ }
+ }
+
+ trie = new PrefixStringMatcher(rules);
+ }
+
+ @Override
+ public Configuration getConf() {
+ return this.conf;
+ }
+ /*
+ * ------------------------------ * </implementation:Configurable> *
+ * ------------------------------
+ */
+}
+
Added: nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java (added)
+++ nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java Mon Feb 23 02:53:24 2015
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter}
+ *
+ * <p>
+ * Every test runs the filter over one parse per entry of
+ * <code>MIME_TYPES</code> and checks which documents survive.
+ * </p>
+ */
+public class MimeTypeIndexingFilterTest {
+
+  /** Content types of the documents that are pushed through the filter. */
+  private static final String[] MIME_TYPES = { "text/html", "image/png",
+      "application/pdf" };
+
+  private Configuration conf = NutchConfiguration.create();
+  private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+  private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length];
+
+  /** Builds one parse per MIME type, carrying the type in its metadata. */
+  @Before
+  public void setUp() throws Exception {
+    for (int i = 0; i < MIME_TYPES.length; i++) {
+      Metadata metadata = new Metadata();
+      metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]);
+
+      parses[i] = new ParseImpl("text",
+          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+    }
+  }
+
+  /**
+   * Runs the filter on a fresh document with an empty crawl datum, so the
+   * content type is taken from the metadata of the given parse.
+   */
+  private NutchDocument applyFilter(ParseImpl parse) throws Exception {
+    return filter.filter(new NutchDocument(), parse,
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+  }
+
+  @Test
+  public void testMissingConfigFile() throws Exception {
+    String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
+    Assert.assertEquals(String
+        .format("Property %s must not be present in the configuration file",
+            MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
+
+    filter.setConf(conf);
+
+    // property not set, so in this case all documents must pass the filter
+    for (int i = 0; i < parses.length; i++) {
+      Assert.assertNotNull("All documents must be allowed by default",
+          applyFilter(parses[i]));
+    }
+  }
+
+  @Test
+  public void testAllowOnlyImages() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = applyFilter(parses[i]);
+
+      if (MIME_TYPES[i].contains("image")) {
+        Assert.assertNotNull("Allow only images", doc);
+      } else {
+        Assert.assertNull("Block everything else", doc);
+      }
+    }
+  }
+
+  @Test
+  public void testBlockHTML() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = applyFilter(parses[i]);
+
+      if (MIME_TYPES[i].contains("html")) {
+        Assert.assertNull("Block only HTML documents", doc);
+      } else {
+        Assert.assertNotNull("Allow everything else", doc);
+      }
+    }
+  }
+}