You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@nutch.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/07/19 12:54:00 UTC

[jira] [Commented] (NUTCH-2152) CommonCrawl dump via Service endpoint

    [ https://issues.apache.org/jira/browse/NUTCH-2152?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16549229#comment-16549229 ] 

ASF GitHub Bot commented on NUTCH-2152:
---------------------------------------

sebastian-nagel closed pull request #355: NUTCH-2152 CommonCrawl dump via Service endpoint
URL: https://github.com/apache/nutch/pull/355
 
 
   

This PR was merged from a forked repository. Because GitHub hides the
original diff of a foreign (fork) pull request once it is merged, the
diff is reproduced below for the sake of provenance:

diff --git a/src/java/org/apache/nutch/service/NutchServer.java b/src/java/org/apache/nutch/service/NutchServer.java
index 335977ec5..be5653b47 100644
--- a/src/java/org/apache/nutch/service/NutchServer.java
+++ b/src/java/org/apache/nutch/service/NutchServer.java
@@ -52,6 +52,7 @@
 import org.apache.nutch.service.resources.JobResource;
 import org.apache.nutch.service.resources.ReaderResouce;
 import org.apache.nutch.service.resources.SeedResource;
+import org.apache.nutch.service.resources.ServicesResource;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -138,6 +139,7 @@ private void start() {
     resources.add(AdminResource.class);
     resources.add(SeedResource.class);
     resources.add(ReaderResouce.class);
+    resources.add(ServicesResource.class);
     return resources;
   }
 
diff --git a/src/java/org/apache/nutch/service/impl/ServiceWorker.java b/src/java/org/apache/nutch/service/impl/ServiceWorker.java
new file mode 100644
index 000000000..f63fd414a
--- /dev/null
+++ b/src/java/org/apache/nutch/service/impl/ServiceWorker.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service.impl;
+
+import java.lang.invoke.MethodHandles;
+
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.service.model.request.ServiceConfig;
+import org.apache.nutch.util.NutchTool;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Runs a {@link NutchTool} with the arguments and crawl id carried by a
+ * {@link ServiceConfig}.
+ * <p>
+ * Any exception thrown by the tool is caught and logged, so {@link #run()}
+ * never propagates a failure to its caller.
+ */
+public class ServiceWorker implements Runnable {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  /** Request whose arguments and crawl id are handed to the tool. */
+  private final ServiceConfig serviceConfig;
+
+  /** Tool executed by {@link #run()}. */
+  private final NutchTool tool;
+
+  /**
+   * Creates a worker bound to one request and one tool.
+   *
+   * @param serviceConfig
+   *          request supplying the tool arguments and the crawl id
+   * @param tool
+   *          tool to execute
+   */
+  public ServiceWorker(ServiceConfig serviceConfig, NutchTool tool) {
+    this.serviceConfig = serviceConfig;
+    this.tool = tool;
+  }
+
+  /**
+   * Invokes the tool once with the configured arguments and crawl id. The
+   * tool's return value is discarded.
+   */
+  @Override
+  public void run() {
+    try {
+      tool.run(serviceConfig.getArgs(), serviceConfig.getCrawlId());
+    } catch (Exception e) {
+      // Deliberately logged and not rethrown: Runnable.run() cannot declare
+      // checked exceptions, so a failing tool is only visible in the log.
+      LOG.error("Error running service worker : {}",
+          StringUtils.stringifyException(e));
+    }
+  }
+
+}
diff --git a/src/java/org/apache/nutch/service/model/request/ServiceConfig.java b/src/java/org/apache/nutch/service/model/request/ServiceConfig.java
new file mode 100644
index 000000000..ab88491cc
--- /dev/null
+++ b/src/java/org/apache/nutch/service/model/request/ServiceConfig.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service.model.request;
+
+import java.util.Map;
+
+/**
+ * Mutable holder for the three values of a service request: a crawl id, a
+ * configuration id and a map of named arguments. All values start out as
+ * {@code null} and are stored and returned without copying or validation.
+ */
+public class ServiceConfig {
+
+  private String crawlId;
+  private String confId;
+  private Map<String, Object> args;
+
+  /** @return the crawl id, or {@code null} if none was set */
+  public String getCrawlId() {
+    return this.crawlId;
+  }
+
+  /**
+   * @param crawlId
+   *          the crawl id to store
+   */
+  public void setCrawlId(final String crawlId) {
+    this.crawlId = crawlId;
+  }
+
+  /** @return the configuration id, or {@code null} if none was set */
+  public String getConfId() {
+    return this.confId;
+  }
+
+  /**
+   * @param confId
+   *          the configuration id to store
+   */
+  public void setConfId(final String confId) {
+    this.confId = confId;
+  }
+
+  /**
+   * @return the argument map exactly as it was set (not a copy), or
+   *         {@code null} if none was set
+   */
+  public Map<String, Object> getArgs() {
+    return this.args;
+  }
+
+  /**
+   * @param args
+   *          the argument map to store; the reference is kept as-is
+   */
+  public void setArgs(final Map<String, Object> args) {
+    this.args = args;
+  }
+
+}
diff --git a/src/java/org/apache/nutch/service/model/response/ServiceInfo.java b/src/java/org/apache/nutch/service/model/response/ServiceInfo.java
new file mode 100644
index 000000000..655e3f81b
--- /dev/null
+++ b/src/java/org/apache/nutch/service/model/response/ServiceInfo.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service.model.response;
+
+import java.util.List;
+
+/**
+ * Mutable holder for a list of dump paths. The list starts out as
+ * {@code null} and is stored and returned without copying.
+ */
+public class ServiceInfo {
+
+  private List<String> dumpPaths;
+
+  /**
+   * @return the list of dump paths exactly as it was set (not a copy), or
+   *         {@code null} if none was set
+   */
+  public List<String> getDumpPaths() {
+    return this.dumpPaths;
+  }
+
+  /**
+   * @param dumpPaths
+   *          the list of dump paths to store; the reference is kept as-is
+   */
+  public void setDumpPaths(final List<String> dumpPaths) {
+    this.dumpPaths = dumpPaths;
+  }
+}
diff --git a/src/java/org/apache/nutch/service/resources/ServicesResource.java b/src/java/org/apache/nutch/service/resources/ServicesResource.java
new file mode 100644
index 000000000..e4224f173
--- /dev/null
+++ b/src/java/org/apache/nutch/service/resources/ServicesResource.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service.resources;
+
+import java.io.File;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import javax.ws.rs.GET;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.service.impl.ServiceWorker;
+import org.apache.nutch.service.model.request.ServiceConfig;
+import org.apache.nutch.service.model.response.ServiceInfo;
+import org.apache.nutch.tools.CommonCrawlDataDumper;
+
+/**
+ * The services resource defines an endpoint to enable the user to carry out
+ * Nutch jobs like dump, commoncrawldump, etc.
+ */
+@Path("/services")
+public class ServicesResource {
+
+  private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
+
+  @GET
+  @Path("/commoncrawldump/{crawlId}")
+  public Response listDumpPaths(@PathParam("crawlId") String crawlId) {
+    File dumpFilePath = new File(crawlId + File.separator + "dump/");
+    File dumpFileList[] = dumpFilePath.listFiles();
+    List<String> fileNames = new ArrayList<>();
+    if (dumpFileList != null) {
+      for (File f : dumpFileList) {
+        fileNames.add(f.getPath());
+      }
+    }
+    ServiceInfo info = new ServiceInfo();
+    info.setDumpPaths(fileNames);
+    return Response.ok().entity(info).type(MediaType.APPLICATION_JSON).build();
+  }
+
+  @POST
+  @Path("/commoncrawldump")
+  public Response commoncrawlDump(ServiceConfig serviceConfig) {
+    String crawlId = serviceConfig.getCrawlId();
+    String outputDir = crawlId + File.separator + "dump" + File.separator
+        + "commoncrawl-" + sdf.format(System.currentTimeMillis());
+
+    Map<String, Object> args = serviceConfig.getArgs();
+    args.put("outputDir", outputDir);
+    if (!args.containsKey(Nutch.ARG_SEGMENTDIR)) {
+      args.put("segment", crawlId + File.separator + "segments");
+    }
+    serviceConfig.setArgs(args);
+    ServiceWorker worker = new ServiceWorker(serviceConfig,
+        new CommonCrawlDataDumper());
+    worker.run();
+
+    return Response.ok(outputDir).type(MediaType.TEXT_PLAIN).build();
+  }
+
+}
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index 3fbe2a7a6..80adc05ad 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -53,7 +53,6 @@
 import org.apache.commons.io.FilenameUtils;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
@@ -68,9 +67,12 @@
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.crawl.LinkDbReader;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.DumpFileUtil;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchTool;
+
 import org.apache.tika.Tika;
 
 import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
@@ -173,7 +175,7 @@
  * }
  * </pre>
  */
-public class CommonCrawlDataDumper extends Configured implements Tool {
+public class CommonCrawlDataDumper extends NutchTool implements Tool {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
@@ -709,4 +711,72 @@ public int run(String[] args) throws Exception {
 
     return 0;
   }
+
+  /**
+   * Used by the REST service
+   */
+  @Override
+  public Map<String, Object> run(Map<String, Object> args, String crawlId)
+      throws Exception {
+
+    String keyPrefix = args.containsKey("keyPrefix")
+        ? (String) args.get("keyPrefix")
+        : "";
+
+    File outputDir = new File((String) args.get("outputDir"));
+    File segmentRootDir = new File((String) args.get(Nutch.ARG_SEGMENTDIR));
+    ArrayList<String> mimeTypesList = args.containsKey("mimetypes")
+        ? (ArrayList<String>) args.get("mimetypes")
+        : null;
+    String[] mimeTypes = null;
+    if (mimeTypesList != null) {
+      mimeTypes = new String[mimeTypesList.size()];
+      int i = 0;
+      for (String m : mimeTypesList)
+        mimeTypes[i++] = m;
+    }
+    boolean gzip = args.containsKey("gzip") ? (boolean) args.get("gzip")
+        : false;
+    boolean epochFilename = args.containsKey("epochFilename")
+        ? (boolean) args.get("epochFilename")
+        : false;
+
+    boolean simpleDateFormat = args.containsKey("simpleDateFormat")
+        ? (boolean) args.get("simpleDateFormat")
+        : false;
+    boolean jsonArray = args.containsKey("jsonArray")
+        ? (boolean) args.get("jsonArray")
+        : false;
+    boolean reverseKey = args.containsKey("reverseKey")
+        ? (boolean) args.get("reverseKey")
+        : false;
+    String extension = args.containsKey("extension")
+        ? (String) args.get("extension")
+        : "";
+    boolean warc = args.containsKey("warc") ? (boolean) args.get("warc")
+        : false;
+    long warcSize = args.containsKey("warcSize") ? (Long) args.get("warcSize")
+        : 0;
+
+    CommonCrawlConfig config = new CommonCrawlConfig();
+    config.setKeyPrefix(keyPrefix);
+    config.setSimpleDateFormat(simpleDateFormat);
+    config.setJsonArray(jsonArray);
+    config.setReverseKey(reverseKey);
+    config.setCompressed(gzip);
+    config.setWarcSize(warcSize);
+    config.setOutputDir((String) args.get("outputDir"));
+
+    if (!outputDir.exists()) {
+      if (!outputDir.mkdirs())
+        throw new Exception(
+            "Unable to create: [" + outputDir.getAbsolutePath() + "]");
+    }
+
+    CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);
+
+    dumper.dump(outputDir, segmentRootDir, null, gzip, mimeTypes, epochFilename,
+        extension, warc);
+    return null;
+  }
 }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> CommonCrawl dump via Service endpoint
> -------------------------------------
>
>                 Key: NUTCH-2152
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2152
>             Project: Nutch
>          Issue Type: Sub-task
>          Components: REST_api
>    Affects Versions: 1.12
>            Reporter: Sujen Shah
>            Assignee: Sujen Shah
>            Priority: Major
>              Labels: memex
>             Fix For: 1.15
>
>         Attachments: NUTCH-2152.git.patch
>
>




--
This message was sent by Atlassian JIRA
(v7.6.3#76005)