You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/19 11:25:47 UTC
svn commit: r745808 - in /lucene/nutch/trunk: CHANGES.txt
src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
Author: siren
Date: Thu Feb 19 10:25:47 2009
New Revision: 745808
URL: http://svn.apache.org/viewvc?rev=745808&view=rev
Log:
NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin, contributed by Dmitry Lihachev
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745808&r1=745807&r2=745808&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Feb 19 10:25:47 2009
@@ -348,6 +348,9 @@
130. NUTCH-563 - Include custom fields in BasicQueryFilter
(Julien Nioche via siren)
+
+131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin
+ (Dmitry Lihachev via siren)
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Feb 19 10:25:47 2009
@@ -199,20 +199,20 @@
MimeType mimeType = null;
String contentType = data.getMeta(Response.CONTENT_TYPE);
if (contentType == null) {
- // Note by Jerome Charron on 20050415:
- // Content Type not solved by a previous plugin
- // Or unable to solve it... Trying to find it
- // Should be better to use the doc content too
- // (using MimeTypes.getMimeType(byte[], String), but I don't know
- // which field it is?
- // if (MAGIC) {
- // contentType = MIME.getMimeType(url, content);
- // } else {
- // contentType = MIME.getMimeType(url);
- // }
- mimeType = MIME.getMimeType(url);
+ // Note by Jerome Charron on 20050415:
+ // Content Type not solved by a previous plugin
+ // Or unable to solve it... Trying to find it
+ // Should be better to use the doc content too
+ // (using MimeTypes.getMimeType(byte[], String), but I don't know
+ // which field it is?
+ // if (MAGIC) {
+ // contentType = MIME.getMimeType(url, content);
+ // } else {
+ // contentType = MIME.getMimeType(url);
+ // }
+ mimeType = MIME.getMimeType(url);
} else {
- mimeType = MIME.forName(contentType);
+ mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
}
// Checks if we solved the content-type.
Modified: lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=745808&r1=745807&r2=745808&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Thu Feb 19 10:25:47 2009
@@ -16,10 +16,30 @@
*/
package org.apache.nutch.indexer.more;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
import junit.framework.TestCase;
public class TestMoreIndexingFilter extends TestCase {
+ public void testContentType() throws IndexingException {
+ Configuration conf = NutchConfiguration.create();
+ assertContentType(conf, "text/html", "text/html");
+ assertContentType(conf, "text/html; charset=UTF-8", "text/html");
+ }
+
public void testGetParts() {
String[] parts = MoreIndexingFilter.getParts("text/html");
assertParts(parts, 2, "text", "html");
@@ -32,4 +52,15 @@
assertEquals(expected[i], parts[i]);
}
}
+
+ private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
+ Metadata metadata = new Metadata();
+ metadata.add(Response.CONTENT_TYPE, source);
+ MoreIndexingFilter filter = new MoreIndexingFilter();
+ filter.setConf(conf);
+ NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
+ new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
+ "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+ assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
+ }
}