You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2007/03/10 07:52:34 UTC

svn commit: r516660 - in /lucene/nutch/trunk: ./ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/ src/plugin/protocol-file/src/test/ src/plugin/protocol-file/src/test/org/ src/plugin/protocol-file/src/test/org/apache/ src/plugin/protoc...

Author: mattmann
Date: Fri Mar  9 22:52:31 2007
New Revision: 516660

URL: http://svn.apache.org/viewvc?view=rev&rev=516660
Log:
fix for NUTCH-384 (contributed by Heiko Dietze)

Added:
    lucene/nutch/trunk/src/plugin/protocol-file/src/test/
    lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/
    lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/
    lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/
    lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
    lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/testprotocolfile.txt
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
    lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516660&r1=516659&r2=516660
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Mar  9 22:52:31 2007
@@ -155,6 +155,9 @@
 
 52. NUTCH-167 - Observation of robots "noarchive" directive. (ab)
 
+53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins
+    framework to operate properly (Heiko Dietze via mattmann)
+
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?view=diff&rev=516660&r1=516659&r2=516660
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Fri Mar  9 22:52:31 2007
@@ -34,6 +34,7 @@
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.protocol.RobotRules;
+import org.apache.nutch.util.NutchConfiguration;
 
 import java.net.URL;
 
@@ -100,6 +101,7 @@
         }
       } 
     } catch (Exception e) {
+      e.printStackTrace();
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }
@@ -137,6 +139,7 @@
     }
 
     File file = new File();
+    file.setConf(NutchConfiguration.create());
 
     if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
       file.setMaxContentLength(maxContentLength);

Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?view=diff&rev=516660&r1=516659&r2=516660
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Fri Mar  9 22:52:31 2007
@@ -26,6 +26,8 @@
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 
@@ -191,9 +193,12 @@
 
     // set headers
     headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
-    headers.set(Response.LAST_MODIFIED,
-      this.file.httpDateFormat.toString(f.lastModified()));
-    headers.set(Response.CONTENT_TYPE, "");   // No Content-Type at file protocol level
+    headers.set(Response.LAST_MODIFIED, this.file.httpDateFormat.toString(f
+        .lastModified()));
+    MimeTypes mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
+    MimeType mimeType = mimeTypes.getMimeType(f);
+    String mimeTypeString = mimeType != null ? mimeType.getName() : "";
+    headers.set(Response.CONTENT_TYPE, mimeTypeString);
 
     // response code
     this.code = 200; // http OK

Added: lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java?view=auto&rev=516660
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java (added)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java Fri Mar  9 22:52:31 2007
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+// Hadoop imports
+import org.apache.hadoop.io.Text;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Junit imports
+import junit.framework.TestCase;
+
+/**
+ * <p>
+ * Unit tests for the {@link File} protocol plugin.
+ * </p>
+ * 
+ * @author mattmann
+ * @version $Revision$
+ */
+public class TestProtocolFile extends TestCase {
+
+  private static final org.apache.nutch.protocol.file.File fileProtocol = 
+    new org.apache.nutch.protocol.file.File();
+
+  private static final String testTextFile = "testprotocolfile.txt";
+
+  private static final CrawlDatum datum = new CrawlDatum();
+
+  private static final String expectedMimeType = "text/plain";
+
+  static {
+    fileProtocol.setConf(NutchConfiguration.create());
+  }
+
+  /**
+   * Fetches the bundled text file via the file protocol and verifies that
+   * the status is SUCCESS and that the content type and the
+   * <code>Response.CONTENT_TYPE</code> metadata are <code>text/plain</code>.
+   * 
+   * @since NUTCH-384
+   */
+  public void testSetContentType() {
+    Text fileUrl = new Text(this.getClass().getResource(testTextFile)
+        .toString());
+    assertNotNull(fileUrl);
+    ProtocolOutput output = fileProtocol.getProtocolOutput(fileUrl, datum);
+    assertNotNull(output);
+    assertEquals("Status code: [" + output.getStatus().getCode()
+        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
+        + output.getStatus().getArgs() + "]", ProtocolStatus.SUCCESS, output
+        .getStatus().getCode());
+    assertNotNull(output.getContent());
+    assertNotNull(output.getContent().getContentType());
+    assertEquals(expectedMimeType, output.getContent().getContentType());
+    assertNotNull(output.getContent().getMetadata());
+    assertEquals(expectedMimeType, output.getContent().getMetadata().get(
+        Response.CONTENT_TYPE));
+
+  }
+
+}

Added: lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/testprotocolfile.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/testprotocolfile.txt?view=auto&rev=516660
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/testprotocolfile.txt (added)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/testprotocolfile.txt Fri Mar  9 22:52:31 2007
@@ -0,0 +1 @@
+Protocol File Test