You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by su...@apache.org on 2016/10/25 16:02:22 UTC

nutch git commit: Fix for NUTCH-2327: Seeds injected in REST must be ingested into HDFS; this closes #155

Repository: nutch
Updated Branches:
  refs/heads/master 9092e233f -> 24cc2aa9c


Fix for NUTCH-2327: Seeds injected in REST must be ingested into HDFS; this closes #155



Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/24cc2aa9
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/24cc2aa9
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/24cc2aa9

Branch: refs/heads/master
Commit: 24cc2aa9c68fa356e4e926b6bf86bac99d52e38c
Parents: 9092e23
Author: Sujen Shah <su...@gmail.com>
Authored: Tue Oct 18 21:36:27 2016 -0700
Committer: Sujen Shah <su...@apache.org>
Committed: Tue Oct 25 09:02:00 2016 -0700

----------------------------------------------------------------------
 .../nutch/service/resources/SeedResource.java   | 75 +++++++-------------
 1 file changed, 27 insertions(+), 48 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/24cc2aa9/src/java/org/apache/nutch/service/resources/SeedResource.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/service/resources/SeedResource.java b/src/java/org/apache/nutch/service/resources/SeedResource.java
index 638af33..61a0526 100644
--- a/src/java/org/apache/nutch/service/resources/SeedResource.java
+++ b/src/java/org/apache/nutch/service/resources/SeedResource.java
@@ -16,13 +16,7 @@
  */
 package org.apache.nutch.service.resources;
 
-import static javax.ws.rs.core.Response.status;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.IOException;
+import java.io.OutputStream;
 import java.util.Collection;
 import java.util.Map;
 
@@ -31,19 +25,19 @@ import javax.ws.rs.GET;
 import javax.ws.rs.POST;
 import javax.ws.rs.Path;
 import javax.ws.rs.Produces;
-import javax.ws.rs.WebApplicationException;
 import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 import javax.ws.rs.core.Response.Status;
 
 import org.apache.commons.collections.CollectionUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.service.NutchServer;
 import org.apache.nutch.service.model.request.SeedList;
 import org.apache.nutch.service.model.request.SeedUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.io.Files;
 
 @Path("/seed")
 public class SeedResource extends AbstractResource {
@@ -77,58 +71,43 @@ public class SeedResource extends AbstractResource {
   @Consumes(MediaType.APPLICATION_JSON)
   @Produces(MediaType.TEXT_PLAIN)
   public Response createSeedFile(SeedList seedList) {
+    try {
     if (seedList == null) {
       return Response.status(Status.BAD_REQUEST)
           .entity("Seed list cannot be empty!").build();
     }
-    File seedFile = createSeedFile();
-    BufferedWriter writer = getWriter(seedFile);
-
     Collection<SeedUrl> seedUrls = seedList.getSeedUrls();
-    if (CollectionUtils.isNotEmpty(seedUrls)) {
-      for (SeedUrl seedUrl : seedUrls) {
-        writeUrl(writer, seedUrl);
-      }
-    }
-    String seedFilePath = seedFile.getParent();
+    
+    String seedFilePath = writeToSeedFile(seedUrls);
     seedList.setSeedFilePath(seedFilePath);
     NutchServer.getInstance().getSeedManager().
           setSeedList(seedList.getName(), seedList);
     return Response.ok().entity(seedFilePath).build();
-  }
-
-  private void writeUrl(BufferedWriter writer, SeedUrl seedUrl) {
-    try {
-      writer.write(seedUrl.getUrl());
-      writer.newLine();
-      writer.flush();
-    } catch (IOException e) {
-      throw handleException(e);
+    } catch (Exception e) {
+      log.warn("Error while creating seed : {}", e.getMessage());
     }
+    return Response.serverError().build();
   }
 
-  private BufferedWriter getWriter(File seedFile) {
-    try {
-      return new BufferedWriter(new FileWriter(seedFile));
-    } catch (FileNotFoundException e) {
-      throw handleException(e);
-    } catch (IOException e) {
-      throw handleException(e);
+  private String writeToSeedFile(Collection<SeedUrl> seedUrls) throws Exception {
+    String seedFilePath = "seedFiles/seed-" + System.currentTimeMillis();
+    org.apache.hadoop.fs.Path seedFolder = new org.apache.hadoop.fs.Path(seedFilePath);
+    FileSystem fs = FileSystem.get(new Configuration());
+    if(!fs.exists(seedFolder)) {
+      if(!fs.mkdirs(seedFolder)) {
+        throw new Exception("Could not create seed folder at : " + seedFolder);
+      }
     }
-  }
-
-  private File createSeedFile() {
-    try {
-      return File.createTempFile("seed", ".txt", Files.createTempDir());
-    } catch (IOException e) {
-      throw handleException(e);
+    String filename = seedFilePath + System.getProperty("file.separator") + "urls";
+    org.apache.hadoop.fs.Path seedPath = new org.apache.hadoop.fs.Path(filename);
+    OutputStream os = fs.create(seedPath);
+    if (CollectionUtils.isNotEmpty(seedUrls)) {
+      for (SeedUrl seedUrl : seedUrls) {
+        os.write(seedUrl.getUrl().getBytes());
+        os.write("\n".getBytes());
+      }
     }
+    os.close();
+    return seedPath.getParent().toString();
   }
-
-  private RuntimeException handleException(Exception e) {
-    log.error("Cannot create seed file!", e);
-    return new WebApplicationException(status(Status.INTERNAL_SERVER_ERROR)
-        .entity("Cannot create seed file!").build());
-  }
-
 }