You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jo...@apache.org on 2015/02/23 03:53:25 UTC

svn commit: r1661600 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/mimetype-filter/ src/plugin/mimetype-filter/sample/ src/plugin/mimetype-filter/src/ src/plugin/mimetype-filter/src/java/ src/plugin/mimetype-filter/src/java/org/ src/plugin/mimetyp...

Author: jorgelbg
Date: Mon Feb 23 02:53:24 2015
New Revision: 1661600

URL: http://svn.apache.org/r1661600
Log:
NUTCH-1928 Indexing filter of documents by the MIME type


Added:
    nutch/trunk/src/plugin/mimetype-filter/
    nutch/trunk/src/plugin/mimetype-filter/build.xml
    nutch/trunk/src/plugin/mimetype-filter/ivy.xml
    nutch/trunk/src/plugin/mimetype-filter/plugin.xml
    nutch/trunk/src/plugin/mimetype-filter/sample/
    nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt
    nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt
    nutch/trunk/src/plugin/mimetype-filter/src/
    nutch/trunk/src/plugin/mimetype-filter/src/java/
    nutch/trunk/src/plugin/mimetype-filter/src/java/org/
    nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/
    nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/
    nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
    nutch/trunk/src/plugin/mimetype-filter/src/test/
    nutch/trunk/src/plugin/mimetype-filter/src/test/org/
    nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/
    nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/
    nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
Modified:
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/default.properties
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Mon Feb 23 02:53:24 2015
@@ -178,6 +178,7 @@
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
       <packageset dir="${plugins.dir}/index-static/src/java"/>
+      <packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
@@ -584,6 +585,7 @@
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/index-static/src/java"/>
+      <packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
@@ -969,6 +971,8 @@
         <source path="${plugins.dir}/index-basic/src/test/" />
         <source path="${plugins.dir}/index-geoip/src/java/" />
         <source path="${plugins.dir}/index-geoip/src/test/" />
+        <source path="${plugins.dir}/mimetype-filter/src/java/" />
+        <source path="${plugins.dir}/mimetype-filter/src/test/" />
         <source path="${plugins.dir}/indexer-dummy/src/java/" />
         <source path="${plugins.dir}/indexer-solr/src/java/" />
         <source path="${plugins.dir}/indexer-elastic/src/java/" />

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Feb 23 02:53:24 2015
@@ -1602,4 +1602,15 @@
   <description>Whether to support multivalued headings.</description>
 </property>
 
+<!-- mimetype-filter plugin properties -->
+
+<property>
+  <name>mimetype.filter.file</name>
+  <value>mimetype-filter.txt</value>
+  <description>
+    The configuration file for the mimetype-filter plugin. This file contains
+    the rules used to allow or deny the indexing of certain documents.
+  </description>
+</property>
+
 </configuration>

Modified: nutch/trunk/default.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1661600&r1=1661599&r2=1661600&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Mon Feb 23 02:53:24 2015
@@ -148,6 +148,7 @@ plugins.index=\
    org.apache.nutch.indexer.basic*:\
    org.apache.nutch.indexer.feed*:\
    org.apache.nutch.indexer.geoip*:\
+   org.apache.nutch.indexer.filter*:\
    org.apache.nutch.indexer.metadata*:\
    org.apache.nutch.indexer.more*:\
    org.apache.nutch.indexer.static*:\

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Mon Feb 23 02:53:24 2015
@@ -35,6 +35,7 @@
      <ant dir="index-more" target="deploy"/>
      <ant dir="index-static" target="deploy"/>
      <ant dir="index-metadata" target="deploy"/>
+     <ant dir="mimetype-filter" target="deploy"/>
      <ant dir="indexer-dummy" target="deploy"/>
      <ant dir="indexer-elastic" target="deploy"/>
      <ant dir="indexer-solr" target="deploy"/>
@@ -88,6 +89,7 @@
      <ant dir="index-geoip" target="test"/>
      <ant dir="index-more" target="test"/>
      <ant dir="index-static" target="test"/>
+     <ant dir="mimetype-filter" target="test"/>
      <ant dir="language-identifier" target="test"/>
      <ant dir="lib-http" target="test"/>
      <ant dir="protocol-file" target="test"/>
@@ -126,10 +128,11 @@
     <ant dir="headings" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-anchor" target="clean"/>
-     <ant dir="index-geoip" target="clean"/>
+    <ant dir="index-geoip" target="clean"/>
     <ant dir="index-more" target="clean"/>
     <ant dir="index-static" target="clean"/>
     <ant dir="index-metadata" target="clean"/>
+    <ant dir="mimetype-filter" target="clean"/>
     <ant dir="indexer-dummy" target="clean"/>
     <ant dir="indexer-elastic" target="clean"/>
     <ant dir="indexer-solr" target="clean"/>

Added: nutch/trunk/src/plugin/mimetype-filter/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/build.xml?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/build.xml (added)
+++ nutch/trunk/src/plugin/mimetype-filter/build.xml Mon Feb 23 02:53:24 2015
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="mimetype-filter" default="jar-core">
+
+    <import file="../build-plugin.xml" />
+
+    <!-- for junit test -->
+    <mkdir dir="${build.test}/data"/>
+    <copy todir="${build.test}/data">
+        <fileset dir="sample" includes="**/*.txt"/>
+    </copy>
+
+</project>

Added: nutch/trunk/src/plugin/mimetype-filter/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/ivy.xml?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/ivy.xml (added)
+++ nutch/trunk/src/plugin/mimetype-filter/ivy.xml Mon Feb 23 02:53:24 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/mimetype-filter/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/plugin.xml?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/plugin.xml (added)
+++ nutch/trunk/src/plugin/mimetype-filter/plugin.xml Mon Feb 23 02:53:24 2015
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="mimetype-filter"
+   name="Filter indexed documents by the detected MIME"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="mimetype-filter.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+    <extension id="org.apache.nutch.indexer.filter"
+               name="Nutch MIME filter"
+               point="org.apache.nutch.indexer.IndexingFilter">
+        <implementation id="MimeTypeIndexingFilter"
+                        class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/>
+    </extension>
+
+</plugin>

Added: nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt (added)
+++ nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt Mon Feb 23 02:53:24 2015
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype, or mimetype prefix such as
+# 'image', per line, with no preceding whitespace. The order in which they
+# are specified doesn't matter. Blank lines and comments (#) are allowed.
+#
+
+-
+
+image

Added: nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt (added)
+++ nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt Mon Feb 23 02:53:24 2015
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype, or mimetype prefix such as
+# 'image', per line, with no preceding whitespace. The order in which they
+# are specified doesn't matter. Blank lines and comments (#) are allowed.
+#
+
++
+
+text/html
\ No newline at end of file

Added: nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java (added)
+++ nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java Mon Feb 23 02:53:24 2015
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+import org.apache.tika.Tika;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
+ * of documents based on the MIME Type detected by Tika
+ *
+ */
+public class MimeTypeIndexingFilter implements IndexingFilter {
+
+  public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MimeTypeIndexingFilter.class);
+
+  private MimeUtil MIME;
+  private Tika tika = new Tika();
+
+  private TrieStringMatcher trie;
+
+  private Configuration conf;
+
+  private boolean acceptMode = true;
+
+  // Inherited JavaDoc
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    String mimeType;
+    String contentType;
+
+    Writable tcontentType = datum.getMetaData()
+        .get(new Text(Response.CONTENT_TYPE));
+
+    if (tcontentType != null) {
+      contentType = tcontentType.toString();
+    } else {
+      contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
+    }
+
+    if (contentType == null) {
+      mimeType = tika.detect(url.toString());
+    } else {
+      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
+    }
+
+    contentType = mimeType;
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info(String.format("[%s] %s", contentType, url));
+    }
+
+    if (null != trie) {
+      if (trie.shortestMatch(contentType) == null) {
+        // no match, but
+        if (acceptMode) {
+          return doc;
+        }
+        return null;
+      } else {
+        // matched, but we are blocking
+        if (acceptMode) {
+          return null;
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  /*
+   * -----------------------------
+   * <implementation:Configurable> *
+   * -----------------------------
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    MIME = new MimeUtil(conf);
+
+    // load the file of the values
+    String file = conf.get(MIMEFILTER_REGEX_FILE, "");
+
+    if (file != null) {
+      if (file.isEmpty()) {
+        LOG.warn(String
+            .format("Missing %s property, ALL mimetypes will be allowed",
+                MIMEFILTER_REGEX_FILE));
+      } else {
+        Reader reader = conf.getConfResourceAsReader(file);
+
+        try {
+          readConfiguration(reader);
+        } catch (IOException e) {
+          if (LOG.isErrorEnabled()) {
+            LOG.error(e.getMessage());
+          }
+
+          throw new RuntimeException(e.getMessage(), e);
+        }
+      }
+    }
+  }
+
+  private void readConfiguration(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    String line;
+    List rules = new ArrayList();
+
+    while (null != (line = in.readLine())) {
+      if (line.length() == 0) {
+        continue;
+      }
+
+      char first = line.charAt(0);
+      switch (first) {
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
+        break;
+      case '+':
+        acceptMode = true;
+        break;
+      case '-':
+        acceptMode = false;
+        break;
+      default:
+        rules.add(line);
+        break;
+      }
+    }
+
+    trie = new PrefixStringMatcher(rules);
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+    /*
+     * ------------------------------ * </implementation:Configurable> *
+     * ------------------------------
+     */
+}
+

Added: nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java?rev=1661600&view=auto
==============================================================================
--- nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java (added)
+++ nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java Mon Feb 23 02:53:24 2015
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter}
+ *
+ */
+public class MimeTypeIndexingFilterTest {
+
+  private static final String[] MIME_TYPES = { "text/html", "image/png",
+      "application/pdf" };
+
+  private Configuration conf = NutchConfiguration.create();
+  private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+  // one parse per entry of MIME_TYPES, carrying it as its content type
+  private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length];
+
+  @Before
+  public void setUp() throws Exception {
+    for (int i = 0; i < MIME_TYPES.length; i++) {
+      Metadata metadata = new Metadata();
+      metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]);
+
+      parses[i] = new ParseImpl("text",
+          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+    }
+  }
+
+  /** Runs the configured filter on the i-th sample parse. */
+  private NutchDocument filterSample(int i) throws Exception {
+    return filter.filter(new NutchDocument(), parses[i],
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+  }
+
+  @Test
+  public void testMissingConfigFile() throws Exception {
+    // clear the property explicitly instead of relying on the ambient
+    // configuration, whose defaults may name a rules file
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
+    filter.setConf(conf);
+
+    // no rules file set, so in this case all documents must pass the filter
+    for (int i = 0; i < parses.length; i++) {
+      Assert.assertNotNull("All documents must be allowed by default",
+          filterSample(i));
+    }
+  }
+
+  @Test
+  public void testAllowOnlyImages() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = filterSample(i);
+
+      if (MIME_TYPES[i].contains("image")) {
+        Assert.assertNotNull("Allow only images", doc);
+      } else {
+        Assert.assertNull("Block everything else", doc);
+      }
+    }
+  }
+
+  @Test
+  public void testBlockHTML() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = filterSample(i);
+
+      if (MIME_TYPES[i].contains("html")) {
+        Assert.assertNull("Block only HTML documents", doc);
+      } else {
+        Assert.assertNotNull("Allow everything else", doc);
+      }
+    }
+  }
+}