Posted to dev@drill.apache.org by GitBox <gi...@apache.org> on 2020/05/31 17:34:30 UTC

[GitHub] [drill] dbw9580 opened a new pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

dbw9580 opened a new pull request #2084:
URL: https://github.com/apache/drill/pull/2084


   Add a storage plugin for IPFS. See the detailed introduction [here](https://github.com/bdchain/Minerva).
   
   TODOs:
   
   - [x] Port to Drill 1.18.0
   - [ ] Add more tests
   - [ ] Support more formats 
       - [x] JSON
       - [ ] CSV
       - [ ] ...
   - [ ] Add writer support (`CREATE TABLE` statements)
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r473949613



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       I don't know how to produce a test case that will cause it to fail. I tested in a two-node cluster with queries that involve data from the HTTP storage plugin, the classpath plugin and this plugin, combining join, filter and sort operators and nested subqueries. If @vvysotskyi could provide a test case that shows these dynamically added endpoints can be a problem, I can look into that and see what solutions we can find.
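   
   For reference, the multi-plugin queries I ran were of roughly this shape (a hypothetical sketch: the `http` endpoint path and the column names on that side are placeholders, not the actual test data):
   
   ```
   SELECT t.`name`, emp.`full_name`
   FROM ipfs.`/ipfs/QmcbeavnEofA6NjG7vkpe1yLJo6En6ML4JnDooDn1BbKmR#json` AS t
   INNER JOIN cp.`employee.json` AS emp ON t.`name` = emp.`first_name`
   INNER JOIN (
     SELECT `key`, `val` FROM http.`/api/data` ORDER BY `val` DESC
   ) AS h ON emp.`last_name` = h.`key`
   WHERE h.`val` IS NOT NULL
   ```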







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446247251



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size()>1) { // occasionally this still fails?
+      // incomingEndpoints is already sorted, matching the fragment order
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // if this endpoint has work assigned
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else // if this endpoint has no work assigned, give it an empty work
+        {
+
+        }
+      }
+    }
+    else // if something goes wrong, fall back to the system's default assignment mode?
+    {
+     logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+
+    for (int i = 0; i < incomingEndpoints.size(); i++) {
+      logger.debug("Fragment {} on endpoint {} is assigned with works: {}", i, incomingEndpoints.get(i).getAddress(), assignments.get(i));
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    //FIXME why 100000 * size?
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    //FIXME what does this mean?
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {

Review comment:
       Changed in 2cca7fb.







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r472198914



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       The problem is that, unlike the other types of storage plugins, where the Drillbits are known at the time Drill starts because they reside in the same cluster and are managed by a coordinator, the Drillbits in this plugin are remote IPFS peers associated with a particular query, and thus can only be known when the user runs a query.
   
   The complete workflow of this plugin is:
   
   1. the user inputs an SQL statement that specifies an IPFS path to the target table of the query;
   2. this plugin resolves the path to an IPFS object, finds its "providers", i.e. IPFS nodes which store the target object, and keeps those which are running Drill (Drill-ready);
   3. these nodes are registered as Drillbit endpoints, and the query plan is sent to them;
   4. these nodes execute the query plan and return results.
   
   I made some slides to illustrate the basic idea: <https://www.slideshare.net/BowenDing4/minerva-ipfs-storage-plugin-for-ipfs>, starting on slide 10.
   
   I understand that the way this storage plugin works may break Drill's existing model, but I couldn't find a plugin that works in a similar way, and the internal workings of Drill are too complex to go through. Could you please be more specific about how this plugin can be incompatible with other queries?
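   
   To make step 1 concrete, the table path syntax looks like this (a minimal sketch; the hash is the `simple.json` object referenced elsewhere in this thread):
   
   ```
   SELECT * FROM ipfs.`/ipfs/QmcbeavnEofA6NjG7vkpe1yLJo6En6ML4JnDooDn1BbKmR#json`
   ```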







[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470087869



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with some utilities that are specific to Drill with an IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set maximum number of providers per leaf node. The more providers, the more time it takes to do DHT queries, while
+   * it is more likely we can find an optimal peer.
+   * @param maxPeersPerLeaf max number of providers to search per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers;
+    providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    return providers.stream().map(Multihash::fromBase58).collect(Collectors.toList());
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    return addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new).collect(Collectors.toList());
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(MultiAddress::new)
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();

Review comment:
       Can we throw a `UserException` here as well?
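   
   Something like this, perhaps (a sketch only, assuming Drill's standard `UserException.executionError` builder; not a committed fix):
   
   ```
   } catch (ExecutionException e) {
     // wrap the underlying cause instead of unsafely casting it to E
     throw UserException.executionError(e.getCause())
         .message("IPFS operation failed")
         .build(logger);
   }
   ```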







[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r444246636



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       Yes, adding names to the Jira and PR description sounds good to me. By the way, Jira has the `Assignee` field. I've added your Jira account to the contributor role and assigned this Jira to you.
   
   Glad to hear that such contributions are made as a research project.







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446669892



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Consumer;
+
+/*
+ * Compatibility fixes for java-ipfs-http-client library
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if(ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  public class DHT {
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {
+      BlockingQueue<CompletableFuture<Object>> results = new LinkedBlockingQueue<>();
+      executor.submit(() -> retrieveAndParseStream("dht/findpeer?arg=" + id, results));
+
+      try {
+        long stop = System.currentTimeMillis() + TimeUnit.SECONDS.toMillis(timeout);
+        while(System.currentTimeMillis() < stop) {
+          Map peer = (Map) results.poll(timeout, TimeUnit.SECONDS);
+          if ( peer != null ) {
+            if ( (int) peer.get("Type") == 2 ) {
+              return (List<String>)
+                  ((Map)
+                      ((List) peer.get("Responses")
+                      ).get(0)
+                  ).get("Addrs");
+            }
+            //else: response contains no Addrs, so ignore it.

Review comment:
       I think they are removed in ebc0dc6.







[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-650790085


   @cgivre it turned out that what was blocking the tests was the default number of providers in the test config being too large; as a result, IPFS could not find any other providers in time, hence the `TimeoutException`s. I wish the test logs had included full stack traces, which could have saved me hours of looking into the Drill planner internals... 😓
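   
   For anyone hitting the same issue, the fix amounts to lowering the provider limit and timeouts on the helper (a sketch using the setters introduced in this PR; the concrete values are illustrative, not the actual test config):
   
   ```
   IPFSHelper helper = new IPFSHelper(ipfs, executorService);
   // only one provider exists on a single-node test network, so don't
   // wait around for more -- that is what was timing out
   helper.setMaxPeersPerLeaf(1);
   helper.setTimeouts(ImmutableMap.of(
       IPFSTimeOut.FIND_PROV, 5,       // seconds
       IPFSTimeOut.FIND_PEER_INFO, 5,
       IPFSTimeOut.FETCH_DATA, 10));
   ```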





[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r472228486



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       @cgivre Indeed I hadn't thought about compatibility with other storage plugins. But a quick test on my machine shows there seems to be no problem joining two datasets, one on IPFS and one in the `cp` store:
   
   ```
   SELECT `employee`.`full_name`
   FROM cp.`employee.json` AS `employee`
   INNER JOIN ipfs.`/ipfs/QmcbeavnEofA6NjG7vkpe1yLJo6En6ML4JnDooDn1BbKmR#json` AS `simple`
   ON `employee`.`first_name` = `simple`.`name`
   ```
   
   where `QmcbeavnEofA6NjG7vkpe1yLJo6En6ML4JnDooDn1BbKmR` is the hash of the `simple.json` file from test resources, which can be added to IPFS by running:
   
   ```
   ipfs object patch set-data $(ipfs object new) < simple.json
   ```
   
   I don't know how to do more complex tests, though.
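   
   One possible shape for such a test, assuming the plugin's test suite builds on Drill's `ClusterTest` harness (the class name is illustrative, and a local IPFS daemon would have to serve `simple.json` under the hash above):
   
   ```
   import static org.junit.Assert.assertTrue;
   
   import org.apache.drill.test.ClusterTest;
   import org.apache.drill.test.QueryBuilder;
   import org.junit.Test;
   
   public class TestIPFSJoin extends ClusterTest {
     // Sketch only: the suite would still need a @BeforeClass that calls
     // startCluster(...) with the IPFS plugin registered.
     @Test
     public void testJoinWithCpStore() throws Exception {
       String sql = "SELECT e.`full_name` FROM cp.`employee.json` e "
           + "INNER JOIN ipfs.`/ipfs/QmcbeavnEofA6NjG7vkpe1yLJo6En6ML4JnDooDn1BbKmR#json` s "
           + "ON e.`first_name` = s.`name`";
       QueryBuilder.QuerySummary summary = queryBuilder().sql(sql).run();
       // At minimum, the join should complete and return rows
       assertTrue(summary.recordCount() > 0);
     }
   }
   ```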




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-677673312


   @dbw9580 
   What would happen in the following scenario?  Let's say user A executes a query using IPFS, which spins up new Drillbits.  User B then decides to execute a query that does not use IPFS.  If these two queries are concurrent, could user B's query end up on the Drillbits spun up for IPFS and then either not find data or cause some other problem?
   
   Alternatively, what would happen if user B executes a query while user A's IPFS queries are running?  What would happen if user A's query completes before user B's?  Would it tear down the Drillbits and cause a crash?
   
   I'm asking because I really don't know here.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446669488



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")

Review comment:
       I don't know why, but removing this annotation seems to make the tests hang forever.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446669408



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    //TODO add more specific timeout configs for different operations in IPFS,
+    // e.g. provider resolution, data read, etc.
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        //TODO Jackson failed to deserialize the ipfsTimeouts map causing NPE
+        if (ipfsTimeouts != null) {
+            ipfsTimeoutDefaults.forEach(ipfsTimeouts::putIfAbsent);
+        } else {
+            ipfsTimeouts = ipfsTimeoutDefaults;
+        }
+        this.ipfsTimeouts = ipfsTimeouts;
+        this.numWorkerThreads = numWorkerThreads > 0 ? numWorkerThreads : 1;
+        this.formats = formats;
+    }
+
+    public String getHost() {
+        return host;
+    }
+
+    public int getPort() {
+        return port;
+    }
+
+    public int getMaxNodesPerLeaf() {
+        return maxNodesPerLeaf;
+    }
+
+    public int getIpfsTimeout(IPFSTimeOut which) {
+        return ipfsTimeouts.get(which);
+    }
+
+    public Map<IPFSTimeOut, Integer> getIpfsTimeouts() {
+        return ipfsTimeouts;
+    }
+
+    public int getNumWorkerThreads() {
+        return numWorkerThreads;
+    }
+
+    public Map<String, FormatPluginConfig> getFormats() {
+        return formats;
+    }
+
+    @Override
+    public int hashCode() {
+        String host_port = String.format("%s:%d[%d,%s]", host, port, maxNodesPerLeaf, ipfsTimeouts);
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + ((host_port == null) ? 0 : host_port.hashCode());
+        result = prime * result + ((formats == null) ? 0 : formats.hashCode());
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null) {
+            return false;
+        }
+        if (getClass() != obj.getClass()) {

Review comment:
       Changed in 282a89d.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446667334



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    //TODO add more specific timeout configs for different operations in IPFS,
+    // e.g. provider resolution, data read, etc.
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        //TODO Jackson failed to deserialize the ipfsTimeouts map causing NPE
+        if (ipfsTimeouts != null) {
+            ipfsTimeoutDefaults.forEach(ipfsTimeouts::putIfAbsent);
+        } else {
+            ipfsTimeouts = ipfsTimeoutDefaults;
+        }
+        this.ipfsTimeouts = ipfsTimeouts;
+        this.numWorkerThreads = numWorkerThreads > 0 ? numWorkerThreads : 1;
+        this.formats = formats;
+    }
+
+    public String getHost() {
+        return host;
+    }
+
+    public int getPort() {
+        return port;
+    }
+
+    public int getMaxNodesPerLeaf() {
+        return maxNodesPerLeaf;
+    }
+
+    public int getIpfsTimeout(IPFSTimeOut which) {
+        return ipfsTimeouts.get(which);
+    }
+
+    public Map<IPFSTimeOut, Integer> getIpfsTimeouts() {
+        return ipfsTimeouts;
+    }
+
+    public int getNumWorkerThreads() {
+        return numWorkerThreads;
+    }
+
+    public Map<String, FormatPluginConfig> getFormats() {
+        return formats;
+    }
+
+    @Override
+    public int hashCode() {
+        String host_port = String.format("%s:%d[%d,%s]", host, port, maxNodesPerLeaf, ipfsTimeouts);
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + ((host_port == null) ? 0 : host_port.hashCode());
+        result = prime * result + ((formats == null) ? 0 : formats.hashCode());
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null) {
+            return false;
+        }
+        if (getClass() != obj.getClass()) {
+            return false;
+        }
+        IPFSStoragePluginConfig other = (IPFSStoragePluginConfig) obj;
+        if (formats == null) {
+            if (other.formats != null) {
+                return false;
+            }
+        } else if (!formats.equals(other.formats)) {
+            return false;
+        }
+        if (host == null) {

Review comment:
       What if both `host` and `other.host` are not `null`? The proposed change will return `false` early, but in fact we still need to test whether they are equal.
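   
   A null-safe comparison via `java.util.Objects` would avoid the early-return pitfall altogether. A sketch over this class's fields (assuming `java.util.Objects` is imported):
   
   ```
   @Override
   public boolean equals(Object obj) {
     if (this == obj) {
       return true;
     }
     if (obj == null || getClass() != obj.getClass()) {
       return false;
     }
     IPFSStoragePluginConfig other = (IPFSStoragePluginConfig) obj;
     // Objects.equals returns true when both sides are null and
     // falls back to equals() when both are non-null
     return Objects.equals(host, other.host)
         && port == other.port
         && maxNodesPerLeaf == other.maxNodesPerLeaf
         && Objects.equals(ipfsTimeouts, other.ipfsTimeouts)
         && Objects.equals(formats, other.formats);
   }
   ```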




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432975423



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSJSONRecordReader.java
##########
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.core.JsonParseException;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.ExecConstants;
+import org.apache.drill.exec.ops.FragmentContext;
+import org.apache.drill.exec.ops.OperatorContext;
+import org.apache.drill.exec.physical.impl.OutputMutator;
+import org.apache.drill.exec.store.AbstractRecordReader;
+import org.apache.drill.exec.store.easy.json.JsonProcessor;
+import org.apache.drill.exec.store.easy.json.reader.CountingJsonReader;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.exec.vector.BaseValueVector;
+import org.apache.drill.exec.vector.complex.fn.JsonReader;
+import org.apache.drill.exec.vector.complex.impl.VectorContainerWriter;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+public class IPFSJSONRecordReader extends AbstractRecordReader {
+  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSJSONRecordReader.class);
+
+  public static final long DEFAULT_ROWS_PER_BATCH = BaseValueVector.INITIAL_VALUE_ALLOCATION;
+
+  private FragmentContext fragmentContext;
+  private IPFSContext ipfsContext;
+  private String subScanSpec;
+  private List<SchemaPath> columnList;
+  private JsonProcessor jsonReader;
+  private InputStream stream;
+  private int recordCount;
+  private long runningRecordCount = 0;
+  private final boolean enableAllTextMode;
+  private final boolean enableNanInf;
+  private final boolean enableEscapeAnyChar;
+  private final boolean readNumbersAsDouble;
+  private final boolean unionEnabled;
+  private long parseErrorCount;
+  private final boolean skipMalformedJSONRecords;
+  private final boolean printSkippedMalformedJSONRecordLineNumber;
+  private JsonProcessor.ReadState write = null;
+  private VectorContainerWriter writer;
+
+  public IPFSJSONRecordReader(FragmentContext fragmentContext, IPFSContext ipfsContext, String scanSpec, List<SchemaPath> columns) {
+    this.fragmentContext = fragmentContext;
+    this.ipfsContext = ipfsContext;
+    this.subScanSpec = scanSpec;
+    this.columnList = columns;
+    setColumns(columns);
+    this.fragmentContext = fragmentContext;
+    // only enable all text mode if we aren't using embedded content mode.
+    this.enableAllTextMode = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ALL_TEXT_MODE_VALIDATOR);
+    this.enableEscapeAnyChar = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ESCAPE_ANY_CHAR_VALIDATOR);
+    this.enableNanInf = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_NAN_INF_NUMBERS_VALIDATOR);
+    this.readNumbersAsDouble = fragmentContext.getOptions().getOption(ExecConstants.JSON_READ_NUMBERS_AS_DOUBLE_VALIDATOR);
+    this.unionEnabled = fragmentContext.getOptions().getBoolean(ExecConstants.ENABLE_UNION_TYPE_KEY);
+    this.skipMalformedJSONRecords = fragmentContext.getOptions().getOption(ExecConstants.JSON_SKIP_MALFORMED_RECORDS_VALIDATOR);
+    this.printSkippedMalformedJSONRecordLineNumber = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_PRINT_INVALID_RECORDS_LINE_NOS_FLAG_VALIDATOR);
+
+  }
+
+  @Override
+  public String toString() {
+    return super.toString()
+        + ", recordCount = " + recordCount
+        + ", parseErrorCount = " + parseErrorCount
+        + ", runningRecordCount = " + runningRecordCount + ", ...]";
+  }
+
+  @Override
+  public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {

Review comment:
       @dbw9580 
   Thanks for submitting this.  As a first step, take a look at the HTTP storage plugin.  Your implementation uses the older JSON reader, and the HTTP plugin demonstrates how to use the newer version as well as the EVF framework.  You'll see the code is much simpler.  
   
   https://github.com/apache/drill/blob/d16e3144c4b51dccd322639711ffc8706f4c2e13/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpBatchReader.java#L52-L95.
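   
   For reference, an EVF-based reader has roughly this shape (an illustrative skeleton following the HttpBatchReader pattern, not this PR's actual code):
   
   ```
   import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
   import org.apache.drill.exec.physical.impl.scan.framework.SchemaNegotiator;
   import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
   import org.apache.drill.exec.physical.resultSet.RowSetLoader;
   
   public class IpfsJsonBatchReader implements ManagedReader<SchemaNegotiator> {
     private ResultSetLoader loader;
   
     @Override
     public boolean open(SchemaNegotiator negotiator) {
       // Open the IPFS input stream here, then let the negotiator build
       // the result set loader that manages batches and value vectors.
       loader = negotiator.build();
       return true;
     }
   
     @Override
     public boolean next() {
       RowSetLoader rowWriter = loader.writer();
       while (!rowWriter.isFull()) {
         // Parse one JSON record from the IPFS stream and write it via
         // rowWriter.start() ... rowWriter.save(); return false once the
         // stream is exhausted. No input is wired up in this sketch.
         return false;
       }
       return true;
     }
   
     @Override
     public void close() {
       // Close the IPFS stream here.
     }
   }
   ```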

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+public class IPFSContext {
+  private IPFS ipfsClient;
+  private IPFSHelper ipfsHelper;
+  private IPFSPeer myself;
+  private IPFSStoragePluginConfig storagePluginConfig;
+  private IPFSStoragePlugin storagePlugin;
+  private LoadingCache<Multihash, IPFSPeer> ipfsPeerCache =
+      CacheBuilder.newBuilder()
+                  .maximumSize(1000)
+                  .refreshAfterWrite(10, TimeUnit.MINUTES)
+                  .build(new CacheLoader<Multihash, IPFSPeer>() {
+                    @Override
+                    public IPFSPeer load(Multihash key) {
+                      return new IPFSPeer(getIPFSHelper(), key);
+                    }
+                  });
+
+  public IPFSContext(IPFSStoragePluginConfig config, IPFSStoragePlugin plugin, IPFS client) throws IOException {
+    this.ipfsClient = client;
+    this.ipfsHelper = new IPFSHelper(client);
+    this.storagePlugin = plugin;
+    this.storagePluginConfig = config;
+
+    Map res = ipfsHelper.timedFailure(client::id, config.getIpfsTimeout(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String)res.get("ID"));
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(addr -> new MultiAddress(addr))
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this.ipfsHelper, myID, myAddrs);
+    this.ipfsHelper.setMyself(myself);
+  }
+
+
+  public IPFS getIPFSClient() {
+    return ipfsClient;
+  }
+
+  public IPFSHelper getIPFSHelper() {
+    return ipfsHelper;
+  }
+
+  public IPFSPeer getMyself() {
+    return myself;
+  }
+
+  public IPFSStoragePlugin getStoragePlugin() {
+    return storagePlugin;
+  }
+
+  public IPFSStoragePluginConfig getStoragePluginConfig() {
+    return storagePluginConfig;
+  }
+
+  public LoadingCache<Multihash, IPFSPeer> getIPFSPeerCache() {
+    return ipfsPeerCache;
+  }
+

Review comment:
       Nit: remove space.

##########
File path: contrib/storage-ipfs/pom.xml
##########
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>drill-contrib-parent</artifactId>
+        <groupId>org.apache.drill.contrib</groupId>
+        <version>1.18.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>drill-ipfs-storage</artifactId>
+    <name>contrib/ipfs-storage-plugin</name>
+    <version>0.1.0</version>
+    <properties>
+        <ipfs.TestSuite>**/IPFSTestSuit.class</ipfs.TestSuite>
+    </properties>
+
+    <repositories>

Review comment:
       Is this necessary?

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+public class IPFSContext {
+  private IPFS ipfsClient;
+  private IPFSHelper ipfsHelper;

Review comment:
       Please make as many instance variables `final` as possible. 

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")

Review comment:
       JsonProperty not needed here.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePlugin.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import io.ipfs.api.IPFS;
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.drill.common.JSONOptions;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.server.DrillbitContext;
+import org.apache.drill.exec.store.AbstractStoragePlugin;
+import org.apache.drill.exec.store.SchemaConfig;
+
+import java.io.IOException;
+import java.util.List;
+
+public class IPFSStoragePlugin extends AbstractStoragePlugin {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePlugin.class);
+
+  private final IPFSContext ipfsContext;
+  private final IPFSStoragePluginConfig pluginConfig;
+  private final IPFSSchemaFactory schemaFactory;
+  private final IPFS ipfsClient;
+
+  public IPFSStoragePlugin(IPFSStoragePluginConfig config, DrillbitContext context, String name) throws IOException {
+    super(context, name);
+    this.ipfsClient = new IPFS(config.getHost(), config.getPort());
+    this.ipfsContext = new IPFSContext(config, this, ipfsClient);
+    this.schemaFactory = new IPFSSchemaFactory(this.ipfsContext, name);
+    this.pluginConfig = config;
+  }
+
+  @Override
+  public boolean supportsRead() {
+    return true;
+  }
+
+  @Override
+  public boolean supportsWrite() {
+    return true;
+  }
+
+  @Override
+  public IPFSGroupScan getPhysicalScan(String userName, JSONOptions selection) throws IOException {
+    logger.debug("IPFSStoragePlugin before getPhysicalScan");
+    IPFSScanSpec spec = selection.getListWith(new ObjectMapper(), new TypeReference<IPFSScanSpec>() {});
+    logger.debug("IPFSStoragePlugin getPhysicalScan with selection {}", selection);
+    return new IPFSGroupScan(ipfsContext, spec, null);
+  }
+
+  @Override
+  public IPFSGroupScan getPhysicalScan(String userName, JSONOptions selection, List<SchemaPath> columns) throws IOException {
+    logger.debug("IPFSStoragePlugin before getPhysicalScan");
+    IPFSScanSpec spec = selection.getListWith(new ObjectMapper(), new TypeReference<IPFSScanSpec>() {});
+    logger.debug("IPFSStoragePlugin getPhysicalScan with selection {}, columns {}", selection, columns);
+    return new IPFSGroupScan(ipfsContext, spec, columns);
+  }
+
+  public IPFS getIPFSClient() {
+    return ipfsClient;
+  }
+
+  @Override
+  public void registerSchemas(SchemaConfig schemaConfig, SchemaPlus parent) throws IOException {
+    schemaFactory.registerSchemas(schemaConfig, parent);
+  }
+
+  @Override
+  public IPFSStoragePluginConfig getConfig() {
+    return pluginConfig;
+  }
+
+  public IPFSContext getIPFSContext() {
+    return ipfsContext;
+  }
+

Review comment:
       Nit: remove space

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanSpec.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+@JsonTypeName("IPFSScanSpec")
+public class IPFSScanSpec {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSScanSpec.class);
+
+  public enum Prefix {
+    @JsonProperty("ipfs")
+    IPFS("ipfs"),
+    @JsonProperty("ipns")
+    IPNS("ipns");
+
+    @JsonProperty("prefix")
+    private String name;
+    Prefix(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {

Review comment:
       For any serialized objects, please use the `PlanStringBuilder` utility class.
   
   https://github.com/apache/drill/blob/d16e3144c4b51dccd322639711ffc8706f4c2e13/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpScanSpec.java#L75-L83
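   
   For example, a `toString()` built with it looks roughly like this (the field names below are illustrative, with `org.apache.drill.common.PlanStringBuilder` imported):
   
   ```
   @Override
   public String toString() {
     return new PlanStringBuilder(this)
         .field("prefix", prefix)
         .field("path", path)
         .toString();
   }
   ```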

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanSpec.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+@JsonTypeName("IPFSScanSpec")
+public class IPFSScanSpec {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSScanSpec.class);

Review comment:
       Please make all loggers `private static final` and drop the fully qualified `org.slf4j` class names in favor of imports.   Here and elsewhere.
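   
   For example:
   
   ```
   import org.slf4j.Logger;
   import org.slf4j.LoggerFactory;
   
   private static final Logger logger = LoggerFactory.getLogger(IPFSScanSpec.class);
   ```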

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSJSONRecordReader.java
##########
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.core.JsonParseException;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.ExecConstants;
+import org.apache.drill.exec.ops.FragmentContext;
+import org.apache.drill.exec.ops.OperatorContext;
+import org.apache.drill.exec.physical.impl.OutputMutator;
+import org.apache.drill.exec.store.AbstractRecordReader;
+import org.apache.drill.exec.store.easy.json.JsonProcessor;
+import org.apache.drill.exec.store.easy.json.reader.CountingJsonReader;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.exec.vector.BaseValueVector;
+import org.apache.drill.exec.vector.complex.fn.JsonReader;
+import org.apache.drill.exec.vector.complex.impl.VectorContainerWriter;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+public class IPFSJSONRecordReader extends AbstractRecordReader {
+  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSJSONRecordReader.class);
+
+  public static final long DEFAULT_ROWS_PER_BATCH = BaseValueVector.INITIAL_VALUE_ALLOCATION;
+
+  private FragmentContext fragmentContext;
+  private IPFSContext ipfsContext;
+  private String subScanSpec;
+  private List<SchemaPath> columnList;
+  private JsonProcessor jsonReader;
+  private InputStream stream;
+  private int recordCount;
+  private long runningRecordCount = 0;
+  private final boolean enableAllTextMode;
+  private final boolean enableNanInf;
+  private final boolean enableEscapeAnyChar;
+  private final boolean readNumbersAsDouble;
+  private final boolean unionEnabled;
+  private long parseErrorCount;
+  private final boolean skipMalformedJSONRecords;
+  private final boolean printSkippedMalformedJSONRecordLineNumber;
+  private JsonProcessor.ReadState write = null;
+  private VectorContainerWriter writer;
+
+  public IPFSJSONRecordReader(FragmentContext fragmentContext, IPFSContext ipfsContext, String scanSpec, List<SchemaPath> columns) {
+    this.fragmentContext = fragmentContext;
+    this.ipfsContext = ipfsContext;
+    this.subScanSpec = scanSpec;
+    this.columnList = columns;
+    setColumns(columns);
+    this.fragmentContext = fragmentContext;
+    // only enable all text mode if we aren't using embedded content mode.
+    this.enableAllTextMode = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ALL_TEXT_MODE_VALIDATOR);
+    this.enableEscapeAnyChar = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ESCAPE_ANY_CHAR_VALIDATOR);
+    this.enableNanInf = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_NAN_INF_NUMBERS_VALIDATOR);
+    this.readNumbersAsDouble = fragmentContext.getOptions().getOption(ExecConstants.JSON_READ_NUMBERS_AS_DOUBLE_VALIDATOR);
+    this.unionEnabled = fragmentContext.getOptions().getBoolean(ExecConstants.ENABLE_UNION_TYPE_KEY);
+    this.skipMalformedJSONRecords = fragmentContext.getOptions().getOption(ExecConstants.JSON_SKIP_MALFORMED_RECORDS_VALIDATOR);
+    this.printSkippedMalformedJSONRecordLineNumber = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_PRINT_INVALID_RECORDS_LINE_NOS_FLAG_VALIDATOR);
+
+  }
+
+  @Override
+  public String toString() {
+    return super.toString()
+        + ", recordCount = " + recordCount
+        + ", parseErrorCount = " + parseErrorCount
+        + ", runningRecordCount = " + runningRecordCount + ", ...]";
+  }
+
+  @Override
+  public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {

Review comment:
       You'll also have to modify the `ScanBatchCreator` as well. 

##########
File path: contrib/storage-ipfs/README.md
##########
@@ -0,0 +1,182 @@
+# Drill Storage Plugin for IPFS
+
+[中文](README.zh.md)
+
+## Contents
+
+0. [Introduction](#Introduction)
+1. [Compile](#Compile)
+2. [Install](#Install)
+3. [Configuration](#Configuration)
+4. [Run](#Run)
+
+## Introduction
+
+Minerva is a storage plugin for Drill that connects IPFS's decentralized storage to Drill's flexible query engine. Any data file stored on IPFS can be accessed from Drill's query interface just like a file on a local disk. Moreover, with Drill's distributed execution capability, other instances that are also running Minerva can help accelerate queries: the data stays where it is, and the queries go to the most suitable nodes, the ones that store the data locally, where the operations can be performed most efficiently.
+
+Slides that explain our ideas and the technical details of Minerva: <https://www.slideshare.net/BowenDing4/minerva-ipfs-storage-plugin-for-ipfs>
+
+A live demo: <http://www.datahub.pub/> hosted on a private cluster of Minerva.
+
+Note that it is still in the early stages of development, and the overall stability and performance are not satisfactory. PRs are very much welcome!
+
+## Compile
+
+### Dependencies
+
+This project depends on forks of the following projects:
+
+* IPFS Java API: [java-ipfs-api](https://github.com/bdchain/java-ipfs-api)
+
+* Drill 1.16.0: [Drill-fork](https://github.com/bdchain/Drill-fork) (`1.16.0-fork` branch)
+
+Please clone and build these projects locally, or the compiler will complain about unknown symbols when you compile this project.
+
+### Compile under the Drill source tree
+
+Clone to the `contrib` directory in Drill source tree, e.g. `contrib/storage-ipfs`:
+```
+cd drill/contrib/
+git clone https://github.com/bdchain/Minerva.git storage-ipfs
+```
+
+Edit the parent POM of Drill contrib module (contrib/pom.xml), add this plugin under `<modules>` section:
+
+```
+<modules>
+    <module>storage-hbase</module>
+    <module>format-maprdb</module>
+    .....
+    <module>storage-ipfs</module>
+</modules>
+```
+
+Build from the root directory of Drill source tree:
+
+```
+mvn -T 2C clean install -DskipTests -Dcheckstyle.skip=true
+```
+
+The jars are in the `storage-ipfs/target` directory.
+
+## Install
+
+The executables and configurations are in `distribution/target/apache-drill-1.16.0`. Copy the entire directory to somewhere outside the source tree and give it a name like `drill-run`, for testing later.
+
+Copy the generated jar file `drill-ipfs-storage-{version}.jar` to `drill-run/jars`.
+
+Copy `java-api-ipfs-v1.2.2.jar`, the IPFS Java API, along with its dependencies, provided as jar files:
+
+```
+cid.jar
+junit-4.12.jar
+multiaddr.jar
+multibase.jar
+multihash.jar
+hamcrest-core-1.3.jar
+```
+
+to `drill-run/jars/3rdparty`.
+
+Optionally, copy the configuration override file `storage-plugin-override.conf` to `drill-run/conf` if you want Drill to auto-configure and enable the IPFS storage plugin at every (re)start.
+
+## Configuration
+
+1. Set the Drill hostname to the IP address of the node that will run Drill:
+    
+    Edit file `conf/drill-env.sh` and change the environment variable `DRILL_HOST_NAME` to the IP address of the node. Use private or global addresses, depending on whether you plan to run it on a cluster or the open Internet.
+
+2. Configure the IPFS storage plugin:
+    
+    If you are not using the configuration override file, you will have to manually configure and enable the plugin.
+    
+    Run Drill according to the [Run](#Run) section and go to Drill's web UI (at <http://localhost:8047>). Under the Storage tab, create a new storage plugin named `ipfs` and click the Create button.
+    
+    Copy and paste the default configuration of the IPFS storage plugin located at `storage-ipfs/src/resources/bootstrap-storage-plugins.json`:
+    
+    ```
+    ipfs : {
+        "type":"ipfs",
+        "host": "127.0.0.1",
+        "port": 5001,
+        "max-nodes-per-leaf": 3,
+        "ipfs-timeouts": {
+          "find-provider": 4,
+          "find-peer-info": 4,
+          "fetch-data": 5
+        },
+        "groupscan-worker-threads": 50,
+        "formats": null,
+        "enabled": true
+    }
+    ```
+    
+    where 
+    
+    `host` and `port` are the host and API port where your IPFS daemon will be listening. Change them so that they match the configuration of your IPFS instance.
+
+    `max-nodes-per-leaf` controls how many provider nodes will be considered when the query is being planned. A larger value increases the parallelization width but typically takes longer to find enough providers from DHT resolution. A smaller value does the opposite.
+    
+    `ipfs-timeouts` sets the maximum time in seconds for various time-consuming operations: `find-provider` is the time allowed for DHT queries to find providers, `find-peer-info` is the time allowed to resolve the network addresses of those providers, and `fetch-data` is the time the actual data transmission is allowed to take.
+    
+    `groupscan-worker-threads` limits the number of worker threads used when the planner communicates with the IPFS daemon to resolve providers and peer info.
+    
+    `formats` specifies the formats of the files. It is unimplemented for now and does nothing.
+    
+    Click the Update button when you finish editing. You should see that the IPFS storage plugin is registered with Drill, and you can enable it with the Enable button.
+    
+3. Configure IPFS
+
+    Start the IPFS daemon first. 
+    
+    Set a Drill-ready flag on the node:
+    
+    ```
+    ipfs name publish $(\
+      ipfs object patch add-link $(ipfs object new) "drill-ready" $(\
+        printf "1" | ipfs object patch set-data $(ipfs object new)\
+      )\
+    )
+    ```
+    
+    This flag indicates that an IPFS node is also capable of handling Drill queries, and the planner will consider it when scheduling a query for distributed execution. A node without this flag will be ignored.
+    
+
+## Run
+
+### Embedded mode
+
+Start IPFS daemon:
+
+```
+ipfs daemon &>/dev/null &
+```
+
+Start drill-embedded:
+
+```
+drill-run/bin/drill-embedded
+```
+
+You can now execute queries via the command line as well as the web interface.
+
+### As a background service
+
+You can run drill-embedded as a background process without a controlling terminal. This is done with the help of tmux, which is available in many Linux distributions.
+
+Edit the systemd service file `drill-embedded.service` so that the environment variable `DRILL_HOME` points to where Drill is installed:
+```
+Environment="DRILL_HOME=/home/drill/apache-drill-1.16.0"
+```
+Copy the service file to systemd's configuration directory, e.g. `/usr/lib/systemd/system`:
+```
+cp drill-embedded.service /usr/lib/systemd/system
+```
+Reload the systemd daemon:
+```
+systemctl daemon-reload
+```
+Start the service:
+```
+systemctl start drill-embedded.service
+```

Review comment:
       Please create a separate JIRA to add documentation to the Drill website.  That does not have to be part of this PR, but otherwise nobody will know about this. ;-).

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+public class IPFSContext {
+  private IPFS ipfsClient;
+  private IPFSHelper ipfsHelper;
+  private IPFSPeer myself;
+  private IPFSStoragePluginConfig storagePluginConfig;
+  private IPFSStoragePlugin storagePlugin;
+  private LoadingCache<Multihash, IPFSPeer> ipfsPeerCache =
+      CacheBuilder.newBuilder()
+                  .maximumSize(1000)
+                  .refreshAfterWrite(10, TimeUnit.MINUTES)
+                  .build(new CacheLoader<Multihash, IPFSPeer>() {
+                    @Override
+                    public IPFSPeer load(Multihash key) {
+                      return new IPFSPeer(getIPFSHelper(), key);
+                    }
+                  });
+
+  public IPFSContext(IPFSStoragePluginConfig config, IPFSStoragePlugin plugin, IPFS client) throws IOException {
+    this.ipfsClient = client;
+    this.ipfsHelper = new IPFSHelper(client);
+    this.storagePlugin = plugin;
+    this.storagePluginConfig = config;
+
+    Map res = ipfsHelper.timedFailure(client::id, config.getIpfsTimeout(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String)res.get("ID"));
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(addr -> new MultiAddress(addr))
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this.ipfsHelper, myID, myAddrs);
+    this.ipfsHelper.setMyself(myself);
+  }
+

Review comment:
       Nit: extra space.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePlugin.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import io.ipfs.api.IPFS;
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.drill.common.JSONOptions;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.server.DrillbitContext;
+import org.apache.drill.exec.store.AbstractStoragePlugin;
+import org.apache.drill.exec.store.SchemaConfig;
+
+import java.io.IOException;
+import java.util.List;
+
+public class IPFSStoragePlugin extends AbstractStoragePlugin {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePlugin.class);
+
+  private final IPFSContext ipfsContext;
+  private final IPFSStoragePluginConfig pluginConfig;
+  private final IPFSSchemaFactory schemaFactory;
+  private final IPFS ipfsClient;
+
+  public IPFSStoragePlugin(IPFSStoragePluginConfig config, DrillbitContext context, String name) throws IOException {
+    super(context, name);
+    this.ipfsClient = new IPFS(config.getHost(), config.getPort());
+    this.ipfsContext = new IPFSContext(config, this, ipfsClient);
+    this.schemaFactory = new IPFSSchemaFactory(this.ipfsContext, name);
+    this.pluginConfig = config;
+  }
+
+  @Override
+  public boolean supportsRead() {
+    return true;
+  }
+
+  @Override
+  public boolean supportsWrite() {
+    return true;
+  }
+
+  @Override
+  public IPFSGroupScan getPhysicalScan(String userName, JSONOptions selection) throws IOException {

Review comment:
       I'm not sure that is really necessary to have duplicates of this method. 

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    //TODO add more specific timeout configs for different operations in IPFS,
+    // e.g. provider resolution, data read, etc.
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        //TODO Jackson fails to deserialize the ipfsTimeouts map, causing an NPE
+        if (ipfsTimeouts != null) {

Review comment:
       Actually, upon thinking about this, this is a bit more problematic. You really should only include config options that are standard types like ints, Strings, etc. It is possible to include arrays and more complex objects, but they all ultimately need to break down into collections of primitives and/or Strings.
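
    As a rough, hypothetical illustration of a primitives-only shape for the timeout settings (the class and property names here are invented for the example, not taken from the PR):

    ```java
    import com.fasterxml.jackson.annotation.JsonCreator;
    import com.fasterxml.jackson.annotation.JsonProperty;

    // Every field is a plain int, so Jackson can (de)serialize the whole object
    // without any custom key or value handling.
    public class SimpleIpfsTimeouts {
      private final int findProvider;
      private final int findPeerInfo;
      private final int fetchData;

      @JsonCreator
      public SimpleIpfsTimeouts(@JsonProperty("find-provider") int findProvider,
                                @JsonProperty("find-peer-info") int findPeerInfo,
                                @JsonProperty("fetch-data") int fetchData) {
        this.findProvider = findProvider;
        this.findPeerInfo = findPeerInfo;
        this.fetchData = fetchData;
      }

      @JsonProperty("find-provider")
      public int getFindProvider() { return findProvider; }

      @JsonProperty("find-peer-info")
      public int getFindPeerInfo() { return findPeerInfo; }

      @JsonProperty("fetch-data")
      public int getFetchData() { return fetchData; }
    }
    ```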

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/*import org.apache.drill.common.expression.SchemaPath;*/
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static int IPFS_SUB_SCAN_VALUE = 19155;

Review comment:
       This actually needs to be in the protobuf file.  Don't worry about this until we're ready to commit, but there's a procedure where you add it to `UserBitShared.proto`, then build the protobufs and finally the native C version.  I'd recommend waiting until everything is done before worrying about that.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    //TODO add more specific timeout configs for different operations in IPFS,
+    // e.g. provider resolution, data read, etc.
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        //TODO Jackson fails to deserialize the ipfsTimeouts map, causing an NPE
+        if (ipfsTimeouts != null) {

Review comment:
       I suspect the reason Jackson won't serialize the `ipfsTimeouts` map is that `IPFSTimeOut` is not serializable.  The fix for this is to put a `JacksonInject` annotation over this variable, then it won't get serialized.  
   
   Alternatively, if you can create a wrapper class that is serializable and use that in the Map, that should work.
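
    A minimal sketch of the wrapper-class route, assuming the key travels in its existing string form such as `find-provider` (the class name and single-argument creator are illustrative):

    ```java
    import com.fasterxml.jackson.annotation.JsonCreator;
    import com.fasterxml.jackson.annotation.JsonValue;

    // Hypothetical serializable wrapper used as the map key instead of the raw enum.
    public class IPFSTimeOutKey {
      private final IPFSStoragePluginConfig.IPFSTimeOut timeOut;

      @JsonCreator
      public IPFSTimeOutKey(String key) {
        // Reuses the enum's existing parser, e.g. "find-provider" -> FIND_PROV
        this.timeOut = IPFSStoragePluginConfig.IPFSTimeOut.of(key);
      }

      @JsonValue
      public String getKey() {
        return timeOut.toString();
      }
    }
    ```

    A real key class would also need `equals()` and `hashCode()` to behave correctly as a `Map` key.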




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r475253606



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       @dbw9580, the case I mentioned can be reproduced in a cluster with several nodes, when an endpoint for this plugin gets registered for a node that has no running drillbit belonging to the current Drill cluster. Suppose another query is submitted at that moment: Drill will try to send a plan fragment to that node, which will cause problems since no Drillbit is actually running there.
   Please take a look at the logic in `BlockMapBuilder`, where it decides which drillbit will execute a specific `CompleteWork` instance.
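
    A sketch of the kind of guard this implies, under the assumption that the group scan keeps a reference to the cluster coordinator; `getOnEndpoint()` is a hypothetical accessor for the endpoint a work unit was pinned to:

    ```java
    // Keep only work units whose endpoint corresponds to a drillbit that is
    // actually registered and alive in the current cluster.
    Set<DrillbitEndpoint> active = new HashSet<>(coordinator.getAvailableEndpoints());
    List<IPFSWork> safeWorkList = ipfsWorkList.stream()
        .filter(work -> active.contains(work.getOnEndpoint()))  // assumed accessor
        .collect(Collectors.toList());
    ```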




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r471131616



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static final int IPFS_SUB_SCAN_VALUE = 19155;

Review comment:
       I think it's time to update the protobufs to include this value.  
   You'll need to add the `IPFS_SUB_SCAN_VALUE` here:
   https://github.com/apache/drill/blob/0726b83d9347cbb8bd1bc64a8d10c12c1125549a/protocol/src/main/protobuf/UserBitShared.proto#L383-L385
   
   Then update the protobufs.  You can find the instructions here:
   -- https://github.com/apache/drill/tree/master/protocol
   and here for the native client.
   -- https://github.com/apache/drill/tree/master/contrib/native/client
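
    For illustration only, the addition would land in the operator-type enum of `UserBitShared.proto` along these lines (66 is a placeholder; the real tag must be the next free value in the enum at commit time):

    ```protobuf
    enum CoreOperatorType {
      // ... existing operator entries ...
      IPFS_SUB_SCAN = 66;
    }
    ```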




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674047192


   > @dbw9580 I believe Drill does support connections from IPv6 sockets. There was a recent PR for this in fact: (#1857) but I'm not sure if that is directly relevant.
   > Were you able to get it working?
   
   I don't see Drill binding to any IPv6 address in `ss -6ltnp`. I blocked IPv6 addresses in 9494a30 and the tests are now passing (most of the time, due to https://github.com/apache/drill/pull/2084#issuecomment-674045895).


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674544534


   > 
   > Hi @dbw9580
   > Thanks for these updates. I didn't have any issues running your unit tests before this. However, I took a look at the Maven docs, and I'm wondering if you can specify the number of forks directly in the `pom.xml` file. [1](https://maven.apache.org/surefire/maven-surefire-plugin/examples/fork-options-and-parallel-execution.html)
   
   Thanks!! 


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674963101


   @cgivre sure, but I have to do these tomorrow (it's now midnight in my timezone). And maybe allow some time for the IPFS API repo to release an official version: https://github.com/ipfs-shipyard/java-ipfs-http-client/pull/172#issuecomment-674938957 ?


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432973228



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSPeer.java
##########
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Optional;
+
+public class IPFSPeer {
+  private IPFSHelper helper;
+
+  private Multihash id;
+  private List<MultiAddress> addrs;
+  private boolean isDrillReady;
+  private boolean isDrillReadyChecked = false;
+  private Optional<String> drillbitAddress = Optional.empty();
+  private boolean drillbitAddressChecked = false;
+
+
+  public IPFSPeer(IPFSHelper helper, Multihash id) {
+    this.helper = helper;
+    this.id = id;
+  }
+
+  IPFSPeer(IPFSHelper helper, Multihash id, List<MultiAddress> addrs) {
+    this.helper = helper;
+    this.id = id;
+    this.addrs = addrs;
+    this.isDrillReady = helper.isDrillReady(id);
+    this.isDrillReadyChecked = true;
+    this.drillbitAddress = IPFSHelper.pickPeerHost(addrs);
+    this.drillbitAddressChecked = true;
+  }
+
+  public boolean isDrillReady() {
+    if (!isDrillReadyChecked) {
+      isDrillReady = helper.isDrillReady(id);
+      isDrillReadyChecked = true;
+    }
+    return isDrillReady;
+  }
+
+  public boolean hasDrillbitAddress() {
+    findDrillbitAddress();
+    return drillbitAddress.isPresent();
+  }
+
+  public Optional<String> getDrillbitAddress() {
+    findDrillbitAddress();
+    return drillbitAddress;
+  }
+
+  public List<MultiAddress> getMultiAddresses() {
+    findDrillbitAddress();
+    return addrs;
+  }
+
+  public Multihash getId() {
+    return id;
+  }
+
+
+  private void findDrillbitAddress() {
+    if (!drillbitAddressChecked) {

Review comment:
       ```java
   if (drillbitAddressChecked) {
     return;
   }
   
   try {
   ...
   } catch (IOException e) {
   ...
   }
   drillbitAddressChecked = true;
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-647858839


   > @dbw9580 
   > This is definitely making progress.  Will testing require an IPFS installation?
   > 
   
   Yes, a running IPFS daemon is required.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-646131568


    > break out CSV and the writer support as separate PRs. 
   
   Great. Will do. 
      
   > I attempted to use Drill's built in CSV reader but I would have had to do a lot of work on the CSV reader to get it to work... So... just used this simple version.  
   
   Ok. The text reader module from the easy format plugin framework looks good, but I couldn't figure out a way to reuse that part of code in this plugin. Copy-pasting code is not accepted, I assume?
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r467667133



##########
File path: contrib/storage-ipfs/src/test/java/org/apache/drill/exec/store/ipfs/TestIPFSGroupScan.java
##########
@@ -0,0 +1,162 @@
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.categories.IPFSStorageTest;
+import org.apache.drill.categories.SlowTest;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.shaded.guava.com.google.common.io.Resources;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.*;

Review comment:
       The star import is a check-style violation.  
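
    e.g., replacing the star import with explicit imports for the constants the test actually uses (assuming all three timeouts appear in it):

    ```java
    import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
    import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
    import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PROV;
    ```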

##########
File path: contrib/storage-ipfs/src/test/java/org/apache/drill/exec/store/ipfs/IPFSTestSuit.java
##########
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.drill.categories.IPFSStorageTest;
+import org.apache.drill.categories.SlowTest;
+import org.apache.drill.shaded.guava.com.google.common.io.Resources;
+import org.junit.BeforeClass;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.Suite;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+
+@RunWith(Suite.class)
+@Suite.SuiteClasses({TestIPFSQueries.class, TestIPFSGroupScan.class})

Review comment:
       This is missing the scan spec test.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions enforcing all API calls to be made with POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       It looks as if https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157 has been closed.  Can you remove this restriction?

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));

Review comment:
       Here, if `workList` is null, `workList.size()` will throw an NPE.
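
    A null-safe version of that logging, also switching to slf4j placeholders instead of concatenation and `String.format` (a sketch, not the committed fix):

    ```java
    List<IPFSWork> workList = assignments.get(minorFragmentId);
    if (workList == null) {
      logger.debug("workList is null for minor fragment {}", minorFragmentId);
    } else {
      logger.debug("workList.size(): {}", workList.size());
    }
    ```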

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)

Review comment:
      I would set these as constants at the top of the class. Then once DRILL-7754 is committed, it will be easier to fix this.
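   For instance, a minimal sketch (the constant names here are placeholders, not the final DRILL-7754 values):
   ```
   // Hard-coded until DRILL-7754 reads ports & version info from IPFS
   private static final int DEFAULT_USER_PORT = 31010;
   private static final int DEFAULT_CONTROL_PORT = 31011;
   private static final int DEFAULT_DATA_PORT = 31012;
   private static final int DEFAULT_HTTP_PORT = 8047;
   ```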

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions enforcing all API calls to be made with POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(IPFS ipfs) {
+    this(ipfs.host, ipfs.port);
+  }
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0/", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if(ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  /**
+   * Resolve names to IPFS CIDs.
+   * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-resolve">resolve in IPFS doc</a>.
+   * @param scheme the scheme of the name to resolve, usually IPFS or IPNS
+   * @param path the path to the object
+   * @param recursive whether recursively resolve names until it is a IPFS CID
+   * @return a Map of JSON object, with the result as the value of key "Path"
+   */
+  public Map resolve(String scheme, String path, boolean recursive) {
+    AtomicReference<Map> ret = new AtomicReference<>();
+    getObjectStream(
+        "resolve?arg=/" + scheme+"/"+path +"&r="+recursive,
+        res -> {
+          ret.set((Map) res);
+          return true;
+        },
+        err -> {
+          throw new RuntimeException(err);
+        }
+    );
+    return ret.get();
+  }
+
+  public class DHT {
+    /**
+     * Find internet addresses of a given peer.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-dht-findpeer">dht/findpeer in IPFS doc</a>.
+     * @param id the id of the peer to query
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return List of Multiaddresses of the peer
+     */
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {
+      AtomicReference<List<String>> ret = new AtomicReference<>();
+      timeLimitedExec(
+          "name/resolve?arg=" + id,
+          timeout,
+          res -> {
+            Map peer = (Map) res;

Review comment:
      Can we specify the type parameters? E.g.
   ```
   Map<String, Object> peer = (Map<String, Object>) res;
   ```
   or whatever the case may be? My IDE flags the raw `Map` here.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;

Review comment:
       These three variables can be `final`. 
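   A sketch, assuming these are only assigned in the constructors:
   ```
   private final IPFSContext ipfsContext;
   private final IPFSScanSpec ipfsScanSpec;
   private final IPFSStoragePluginConfig config;
   ```
   (`columns` is reassigned in `clone()`, so it cannot be `final`.)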

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),

Review comment:
       `getPlugin()` is deprecated.  Use `resolve()` instead.
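   A sketch of the delegating constructor, assuming the `resolve(config, class)` overload; it also removes the explicit cast:
   ```
   this(
       pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
       ipfsScanSpec,
       columns
   );
   ```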

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with some utilities that are specific to Drill with an IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set maximum number of providers per leaf node. The more providers, the more time it takes to do DHT queries, while
+   * it is more likely we can find an optimal peer.
+   * @param maxPeersPerLeaf max number of providers to search per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers;
+    providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    List<MultiAddress>
+        ret = addrs
+        .stream()
+        .filter(addr -> !addr.equals(""))
+        .map(str -> new MultiAddress(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(addr -> new MultiAddress(addr))
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /*
+   * DRILL-7753: implement a more advanced algorithm that picks optimal addresses. Maybe check reachability, latency
+   * and bandwidth?
+   */
+  /**
+   * Choose a peer's network address from its advertised Multiaddresses.
+   * Prefer globally routable address over local addresses.
+   * @param peerAddrs Multiaddresses obtained from IPFS.DHT.findprovs
+   * @return network address
+   */
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException e) {
+        continue;

Review comment:
      `continue` is not necessary here, since it is the last statement in the loop body. Should we log the failed lookup instead?
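   For example:
   ```
   } catch (UnknownHostException e) {
     logger.warn("Cannot resolve host {} from multiaddress", host, e);
   }
   ```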

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private class IPFSWork implements CompleteWork {
+    private EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() {return partialRoot;}
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {

Review comment:
      Is this correct? Returning a constant 0 makes every `IPFSWork` compare as equal, which could confuse any sorting done during assignment.
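   If the ordering is meant to follow work size, perhaps something like this sketch (today every work reports the same `DEFAULT_NODE_SIZE`, so it is only a formal fix):
   ```
   @Override
   public int compareTo(CompleteWork o) {
     return Long.compare(getTotalBytes(), o.getTotalBytes());
   }
   ```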

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private class IPFSWork implements CompleteWork {
+    private EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() {return partialRoot;}
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {
+      return 0;
+    }
+
+    @Override
+    public String toString() {
+      return "IPFSWork [root = " + partialRoot.toString() + "]";
+    }
+  }
+
+  //DRILL-7756: detect and warn about loops/recursions in case of a malformed tree
+  static class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+    private final Multihash hash;
+    private final boolean isProvider;
+    private final Map<Multihash, String> ret = new LinkedHashMap<>();
+    private final IPFSPeer myself;
+    private final IPFSHelper helper;
+    private final LoadingCache<Multihash, IPFSPeer> peerCache;
+
+    public IPFSTreeFlattener(Multihash hash, boolean isProvider, IPFSContext context) {
+      this(
+        hash,
+        isProvider,
+        context.getMyself(),
+        context.getIPFSHelper(),
+        context.getIPFSPeerCache()
+      );
+    }
+
+    IPFSTreeFlattener(Multihash hash, boolean isProvider, IPFSPeer myself, IPFSHelper ipfsHelper, LoadingCache<Multihash, IPFSPeer> peerCache) {
+      this.hash = hash;
+      this.isProvider = isProvider;
+      this.myself = myself;
+      this.helper = ipfsHelper;
+      this.peerCache = peerCache;
+    }
+
+    public IPFSTreeFlattener(IPFSTreeFlattener reference, Multihash hash, boolean isProvider) {
+      this(hash, isProvider, reference.myself, reference.helper, reference.peerCache);
+    }
+
+    @Override
+    public Map<Multihash, String> compute() {
+      try {
+        if (isProvider) {
+          IPFSPeer peer = peerCache.getUnchecked(hash);
+          ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);

Review comment:
      Recommend an `isPresent()` check here to prevent errors.
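   Assuming `getDrillbitAddress()` returns an `Optional<String>`, the ternary could also collapse to:
   ```
   ret.put(hash, peer.getDrillbitAddress().orElse(null));
   ```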

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private class IPFSWork implements CompleteWork {
+    private EndpointByteMapImpl byteMap = new EndpointByteMapImpl();

Review comment:
       These three variables can be `final`.
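   For illustration, a minimal sketch of the `final` version — assuming these fields are assigned exactly once; the constructor argument follows the `new IPFSWork(leaf.toBase58())` call site in this diff:

   ```java
   private class IPFSWork implements CompleteWork {
     // assigned once at declaration, never reassigned
     private final EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
     // assigned once in the constructor
     private final Multihash partialRoot;

     IPFSWork(String root) {
       this.partialRoot = Multihash.fromBase58(root);
     }

     // ... getters and the remaining CompleteWork methods unchanged
   }
   ```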

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),

Review comment:
       Also, this method does not throw an `IOException`.  You can remove that if you want.
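   For reference, a sketch of the cleaned-up signature (keeping `ExecutionSetupException`, which `getPlugin` can still throw):

   ```java
   @JsonCreator
   public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
                        @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
                        @JsonProperty("columns") List<SchemaPath> columns,
                        @JacksonInject StoragePluginRegistry pluginRegistry) throws ExecutionSetupException {
     this(
         ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
         ipfsScanSpec,
         columns
     );
   }
   ```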







[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432971981



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+public class IPFSContext {
+  private IPFS ipfsClient;
+  private IPFSHelper ipfsHelper;
+  private IPFSPeer myself;
+  private IPFSStoragePluginConfig storagePluginConfig;
+  private IPFSStoragePlugin storagePlugin;
+  private LoadingCache<Multihash, IPFSPeer> ipfsPeerCache =

Review comment:
       Why not put this in the constructor?
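   E.g., a sketch of building the cache in the constructor — the constructor parameters and the `IPFSPeer` constructor shown here are assumptions for illustration, not the actual signatures:

   ```java
   public class IPFSContext {
     private final LoadingCache<Multihash, IPFSPeer> ipfsPeerCache;

     public IPFSContext(IPFS ipfsClient, IPFSStoragePlugin plugin) {
       // hypothetical parameters; the point is initializing the cache
       // here rather than at the field declaration
       this.ipfsPeerCache = CacheBuilder.newBuilder()
           .maximumSize(1000)                       // illustrative bound
           .expireAfterWrite(10, TimeUnit.MINUTES)  // illustrative TTL
           .build(CacheLoader.from(id -> new IPFSPeer(getIPFSHelper(), id)));
     }

     // ... other fields and getters unchanged
   }
   ```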







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r469960924



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for the java-ipfs-http-client library.
+ *
+ * Supports IPFS up to version v0.4.23; newer daemon versions require all API calls to be made with the POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(IPFS ipfs) {
+    this(ipfs.host, ipfs.port);
+  }
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0/", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if(ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  /**
+   * Resolve names to IPFS CIDs.
+   * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-resolve">resolve in IPFS doc</a>.
+   * @param scheme the scheme of the name to resolve, usually IPFS or IPNS
+   * @param path the path to the object
+   * @param recursive whether to recursively resolve names until an IPFS CID is reached
+   * @return a Map of the JSON response, with the result as the value of the key "Path"
+   */
+  public Map resolve(String scheme, String path, boolean recursive) {
+    AtomicReference<Map> ret = new AtomicReference<>();
+    getObjectStream(
+        "resolve?arg=/" + scheme+"/"+path +"&r="+recursive,
+        res -> {
+          ret.set((Map) res);
+          return true;
+        },
+        err -> {
+          throw new RuntimeException(err);
+        }
+    );
+    return ret.get();
+  }
+
+  public class DHT {
+    /**
+     * Find internet addresses of a given peer.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-dht-findpeer">dht/findpeer in IPFS doc</a>.
+     * @param id the id of the peer to query
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return List of Multiaddresses of the peer
+     */
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {
+      AtomicReference<List<String>> ret = new AtomicReference<>();
+      timeLimitedExec(
+          "name/resolve?arg=" + id,
+          timeout,
+          res -> {
+            Map peer = (Map) res;

Review comment:
       I think it's unnecessary to specify all the type parameters. These `Map`s are JSON responses from the IPFS daemon and can be deeply nested. Ideally a library would define proper types and structures for these responses, e.g. via DAOs, but the `java-ipfs-http-client` library does not make that effort. If we must specify type parameters, most of them are just `Map<String, Object>`, which carries little information.
   The IPFS HTTP API docs do specify the types and field names present in every API response, so we can rely on them when casting. Take `dht.findprovs` as an example: https://docs.ipfs.io/reference/http/api/#response-34
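   For example, a hedged sketch of pulling provider IDs out of a parsed `dht/findprovs` response, with only the casts the documented shape requires (the helper name is made up):

   ```java
   // Field names follow the documented response shape:
   // {"Extra": "", "ID": "", "Responses": [{"Addrs": [...], "ID": "..."}], "Type": 4}
   @SuppressWarnings("unchecked")
   static List<String> providerIds(Map<String, Object> res) {
     List<Map<String, Object>> responses =
         (List<Map<String, Object>>) res.getOrDefault("Responses", Collections.emptyList());
     List<String> ids = new ArrayList<>();
     for (Map<String, Object> peer : responses) {
       ids.add((String) peer.get("ID"));
     }
     return ids;
   }
   ```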







[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-673552669


   @dbw9580 
   I redownloaded and it built for me.  Please disregard previous comments.





[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r471589662



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       Drill should obtain all the info about the source of the data **only** from the query plan and minor fragments. Please take a look at existing storage plugins, or even file format plugins to see how it is implemented there.
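   A minimal sketch of that pattern, with made-up names: everything a minor fragment needs travels as a serializable property of the scan, so nothing has to be registered in cluster state at planning time.

   ```java
   @JsonTypeName("ipfs-work")
   public class IPFSWorkUnit {
     private final String leafHash;      // which IPFS block this fragment reads
     private final String providerHost;  // where a provider of that block lives

     @JsonCreator
     public IPFSWorkUnit(@JsonProperty("leafHash") String leafHash,
                         @JsonProperty("providerHost") String providerHost) {
       this.leafHash = leafHash;
       this.providerHost = providerHost;
     }

     @JsonProperty
     public String getLeafHash() { return leafHash; }

     @JsonProperty
     public String getProviderHost() { return providerHost; }
   }
   ```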







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r468469590



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for the java-ipfs-http-client library.
+ *
+ * Supports IPFS up to version v0.4.23; newer daemon versions require all API calls to be made with the POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       Good news, it's trivial to revert to Java 8: https://github.com/ipfs-shipyard/java-ipfs-http-client/pull/172.
   Let's hope it gets merged soon.
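   For context on the restriction the class javadoc mentions: go-ipfs >= 0.5 rejects GET on the `/api/v0/` endpoints, so a client has to POST. A rough sketch of the POST form with plain `HttpURLConnection`, inside a method that may throw `IOException` (host and port are the daemon defaults, error handling omitted):

   ```java
   URL url = new URL("http", "127.0.0.1", 5001, "/api/v0/version");
   HttpURLConnection conn = (HttpURLConnection) url.openConnection();
   conn.setRequestMethod("POST"); // a GET here fails on IPFS >= 0.5
   try (InputStream in = conn.getInputStream()) {
     ByteArrayOutputStream out = new ByteArrayOutputStream();
     byte[] buf = new byte[4096];
     int n;
     while ((n = in.read(buf)) > 0) {
       out.write(buf, 0, n);
     }
     System.out.println(out.toString("UTF-8")); // JSON, e.g. {"Version":"0.5.1",...}
   }
   ```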







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446224728



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) { // occasionally this still fails?
+      // incomingEndpoints is already sorted, in the same order as the fragments

Review comment:
       Removed in b6fcc0df.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("Exception in init", e);
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) { // does this still fail occasionally?
+      // incomingEndpoints are already sorted, matching the fragment order
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // the endpoint has work assigned
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else { // the endpoint has no work assigned; give it an empty work unit

Review comment:
       Removed in b6fcc0df




----------------------------------------------------------------



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470088211



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with utilities specific to Drill's use of IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set the maximum number of providers to query per leaf node. The more providers, the longer DHT queries take,
+   * but the more likely it is that an optimal peer can be found.
+   * @param maxPeersPerLeaf max number of providers to search for per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers;
+    providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    return providers.stream().map(Multihash::fromBase58).collect(Collectors.toList());
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if (peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    return addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new).collect(Collectors.toList());
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(MultiAddress::new)
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception> {
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within the given timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout number of seconds to wait before the execution is considered to have timed out
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /*
+   * DRILL-7753: implement a more advanced algorithm that picks optimal addresses. Maybe check reachability, latency
+   * and bandwidth?
+   */
+  /**
+   * Choose a peer's network address from its advertised Multiaddresses.
+   * Prefer globally routable address over local addresses.
+   * @param peerAddrs Multiaddresses obtained from IPFS.DHT.findprovs
+   * @return network address
+   */
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException ignored) {

Review comment:
       If this is ignored, can/should we log this event?  
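   One possible shape for that, as a sketch only (not the committed fix): it assumes the enclosing class keeps an SLF4J logger the way `IPFSHelper` does, and the final fallback return is a guess, since the tail of the method is cut off in this excerpt.
   
   ```java
   import io.ipfs.multiaddr.MultiAddress;
   import org.slf4j.Logger;
   import org.slf4j.LoggerFactory;
   
   import java.net.InetAddress;
   import java.net.UnknownHostException;
   import java.util.List;
   import java.util.Optional;
   
   public final class PeerHostPickerSketch {
     private static final Logger logger = LoggerFactory.getLogger(PeerHostPickerSketch.class);
   
     // Same selection logic as pickPeerHost above: return the first globally
     // routable host, remembering the last local one as a fallback. The only
     // change is that unresolvable hosts are logged rather than dropped silently.
     public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
       String localAddr = null;
       for (MultiAddress addr : peerAddrs) {
         String host = addr.getHost();
         try {
           InetAddress inetAddress = InetAddress.getByName(host);
           if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
             localAddr = host;
           } else {
             return Optional.of(host);
           }
         } catch (UnknownHostException e) {
           logger.debug("Could not resolve advertised host {}, skipping it", host, e);
         }
       }
       return Optional.ofNullable(localAddr);
     }
   }
   ```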




----------------------------------------------------------------



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r444236047



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       @dbw9580
   Are you just looking for a permanent link for attribution?  If you go with the aforementioned approach, the issue is that we squash all commits into one commit prior to committing.  I suspect that your name change would be lost in the shuffle. 
   
   Another option might be to put your names in both the JIRA and the pull request description.  You could also include a comment somewhere in the code pointing to the JIRA.  In publications I would assume you could then refer to the JIRA or pull request link.
   
   @vvysotskyi would that be acceptable?




----------------------------------------------------------------



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r444215763



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       @vvysotskyi We don't intend to keep our names in a contributors list; we just need proof that this plugin, at a certain state, was contributed by us. In fact, we started this as a research project, and I need such proof to include in my master's thesis to show this work was really done by me. 
   
   I understand that Git history is the best and most reliable way to verify authorship; the problem is that Git and GitHub can be too complicated for someone who is not familiar with open source software development, in contrast to how people give credit by simply citing literature in research papers.
   
   If possible, I would like to add a commit just before the merge, which adds an authorship statement to `README`, and then another commit to delete it. This way, I can use that commit as proof. How does this sound to you?




----------------------------------------------------------------



[GitHub] [drill] sanel commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-636512388


   Added small refactoring ideas. Did not check implementation details.


----------------------------------------------------------------



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443750430



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       We would like to retain a statement that reflects that the plugin was contributed by these authors. Where would be a more appropriate place to put such a statement? Maybe `readme.md`?




----------------------------------------------------------------



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452320145



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSchemaFactory.java
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.schema.Table;
+import org.apache.drill.exec.planner.logical.DynamicDrillTable;
+import org.apache.drill.exec.store.AbstractSchema;
+import org.apache.drill.exec.store.SchemaConfig;
+import org.apache.drill.exec.store.SchemaFactory;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class IPFSSchemaFactory implements SchemaFactory {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSSchemaFactory.class);
+
+  final String schemaName;
+  final IPFSContext context;
+
+  public IPFSSchemaFactory(IPFSContext context, String name) throws IOException {
+    this.context = context;
+    this.schemaName = name;
+  }
+
+  @Override
+  public void registerSchemas(SchemaConfig schemaConfig, SchemaPlus parent) throws IOException {
+    logger.debug("registerSchemas {}", schemaName);
+    IPFSTables schema = new IPFSTables(schemaName);
+    SchemaPlus hPlus = parent.add(schemaName, schema);
+    schema.setHolder(hPlus);
+  }
+
+  class IPFSTables extends AbstractSchema {
+    private Set<String> tableNames = Sets.newHashSet();
+    private final ConcurrentMap<String, Table> tables = new ConcurrentSkipListMap<>(String::compareToIgnoreCase);
+    public IPFSTables (String name) {
+      super(ImmutableList.<String>of(), name);
+      tableNames.add(name);
+    }
+
+    public void setHolder(SchemaPlus pulsOfThis) {
+    }
+
+    @Override
+    public String getTypeName() {
+      return IPFSStoragePluginConfig.NAME;
+    }
+
+    @Override
+    public Set<String> getTableNames() {
+      return Collections.emptySet();
+    }
+
+    @Override
+    public Table getTable(String tableName) {
+      //TODO: better handling of table names

Review comment:
       This is now DRILL-7766.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSchemaFactory.java
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.schema.Table;
+import org.apache.drill.exec.planner.logical.DynamicDrillTable;
+import org.apache.drill.exec.store.AbstractSchema;
+import org.apache.drill.exec.store.SchemaConfig;
+import org.apache.drill.exec.store.SchemaFactory;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class IPFSSchemaFactory implements SchemaFactory {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSSchemaFactory.class);
+
+  final String schemaName;
+  final IPFSContext context;
+
+  public IPFSSchemaFactory(IPFSContext context, String name) throws IOException {
+    this.context = context;
+    this.schemaName = name;
+  }
+
+  @Override
+  public void registerSchemas(SchemaConfig schemaConfig, SchemaPlus parent) throws IOException {
+    logger.debug("registerSchemas {}", schemaName);
+    IPFSTables schema = new IPFSTables(schemaName);
+    SchemaPlus hPlus = parent.add(schemaName, schema);
+    schema.setHolder(hPlus);
+  }
+
+  class IPFSTables extends AbstractSchema {
+    private Set<String> tableNames = Sets.newHashSet();
+    private final ConcurrentMap<String, Table> tables = new ConcurrentSkipListMap<>(String::compareToIgnoreCase);
+    public IPFSTables (String name) {
+      super(ImmutableList.<String>of(), name);
+      tableNames.add(name);
+    }
+
+    public void setHolder(SchemaPlus pulsOfThis) {

Review comment:
       Fixed in 0f9c2db.




----------------------------------------------------------------



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-650577819


   @cgivre I've added more tests. The tests are not passing, failing with `Error while applying rule DrillScanRule`. However, I was able to successfully execute the test queries through the Drill web interface. I don't know how to fix these tests; any ideas?
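   For context, a minimal sketch of how such a test could be shaped against Drill's `ClusterTest` harness; the plugin registration step is only hinted at, the CID is a placeholder, and the `ipfs` table path syntax (`/ipfs/...#json`) is assumed from the plugin's README. This does not by itself reproduce the `DrillScanRule` failure.
   
   ```java
   import org.apache.drill.test.ClusterFixture;
   import org.apache.drill.test.ClusterTest;
   import org.junit.BeforeClass;
   import org.junit.Test;
   
   public class TestIPFSQueriesSketch extends ClusterTest {
   
     @BeforeClass
     public static void setup() throws Exception {
       // Spin up an embedded Drillbit for the test.
       startCluster(ClusterFixture.builder(dirTestWatcher));
       // Hypothetical step: the IPFS storage plugin config would have to be
       // registered with the test cluster here before any query can be planned.
     }
   
     @Test
     public void testSimpleJsonQuery() throws Exception {
       // QmFoo... is a placeholder CID, not a real pinned dataset.
       String sql = "SELECT * FROM ipfs.`/ipfs/QmFoo...#json` LIMIT 1";
       queryBuilder().sql(sql).run();
     }
   }
   ```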


----------------------------------------------------------------



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446250297



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       > adding names to the Jira and PR description
   Yeah we can accept that. Edited the PR description.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       > adding names to the Jira and PR description
   
   Yeah we can accept that. Edited the PR description.




----------------------------------------------------------------



[GitHub] [drill] cgivre edited a comment on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre edited a comment on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674957594


   @dbw9580 
   This is looking pretty good. I'm going to do a final check this evening or tomorrow, but can you please:
   
   1.  Squash all commits and use `DRILL-7745: Add storage plugin for IPFS` as the commit message
   2.  Go through and do a final code hygiene check (make sure there are no extra spaces, commented-out blocks, etc.). Drill does have a code formatter[1]; just verify that the code complies with the coding standards for spacing and all that.  (I didn't see anything jump out at me, but it always helps to double check.)
   3.  Please create a JIRA to add this to the public documentation.  You're welcome to actually add the documentation as well, but for now, let's just make sure we have a JIRA on file to actually add the docs. 
   
   Thanks!
   
   [1]: https://drill.apache.org/docs/apache-drill-contribution-guidelines/
   


----------------------------------------------------------------



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443757342



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+public class IPFSContext {
+  private IPFS ipfsClient;
+  private IPFSHelper ipfsHelper;

Review comment:
       Fixed in 48f3b80




----------------------------------------------------------------



[GitHub] [drill] dbw9580 edited a comment on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 edited a comment on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-650577819


   @cgivre I've added more tests. The tests are not passing, failing with `Error while applying rule DrillScanRule`. However, I was able to successfully execute the test queries through the Drill web interface. I don't know how to fix these tests; any ideas?
   
   Edit: attach log file.
   [org.apache.drill.exec.store.ipfs.TestIPFSQueries.txt](https://github.com/apache/drill/files/4840854/org.apache.drill.exec.store.ipfs.TestIPFSQueries.txt)
   


----------------------------------------------------------------



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r472234320



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0 ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          // DRILL-7754: read ports & version info from IPFS instead of hard-coding them
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       @dbw9580,
   Perhaps try some variations like an aggregate query, or a query that combines multiple data sources AND has subqueries.  See if you can break it.  ;-)
   
   My hunch is that we can find a way to resolve @vvysotskyi's concerns.  This is a new (and interesting) use case; we should find a way to include it. 
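   A couple of concrete query shapes along those lines, as a sketch; the CIDs and column names are placeholders, and the `ipfs` table path syntax (`/ipfs/...#json`) is assumed from the plugin's README:
   
   ```java
   // An aggregate over an IPFS-backed table.
   String aggregate =
       "SELECT gender, COUNT(*) AS cnt "
           + "FROM ipfs.`/ipfs/QmFoo...#json` "
           + "GROUP BY gender";
   
   // Combines a second data source (Drill's classpath plugin) with a subquery.
   String mixedWithSubquery =
       "SELECT t.`name` FROM ipfs.`/ipfs/QmFoo...#json` t "
           + "JOIN cp.`employee.json` e ON t.`name` = e.full_name "
           + "WHERE t.id IN (SELECT id FROM ipfs.`/ipfs/QmBar...#json`)";
   ```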
   




----------------------------------------------------------------



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446221154



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0 ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree

Review comment:
       Created JIRA tickets in b6fcc0d.




----------------------------------------------------------------



[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r444041489



##########
File path: contrib/storage-ipfs/README.zh.md
##########
@@ -0,0 +1,184 @@
+# Drill Storage Plugin for IPFS

Review comment:
       But for this case, documentation outside the project may also become outdated. 




----------------------------------------------------------------



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470081506



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for the java-ipfs-http-client library
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(IPFS ipfs) {
+    this(ipfs.host, ipfs.port);
+  }
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0/", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if (ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  /**
+   * Resolve names to IPFS CIDs.
+   * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-resolve">resolve in IPFS doc</a>.
+   * @param scheme the scheme of the name to resolve, usually IPFS or IPNS
+   * @param path the path to the object
+   * @param recursive whether to resolve names recursively until an IPFS CID is reached
+   * @return a Map of the JSON response, with the result as the value of the key "Path"
+   */
+  public Map resolve(String scheme, String path, boolean recursive) {
+    AtomicReference<Map> ret = new AtomicReference<>();
+    getObjectStream(
+        "resolve?arg=/" + scheme+"/"+path +"&r="+recursive,
+        res -> {
+          ret.set((Map) res);
+          return true;
+        },
+        err -> {
+          throw new RuntimeException(err);
+        }
+    );
+    return ret.get();
+  }
+
+  /**
+   * As defined in https://github.com/libp2p/go-libp2p-core/blob/b77fd280f2bfcce22f10a000e8e1d9ec53c47049/routing/query.go#L16
+   */
+  public enum DHTQueryEventType {
+    // Sending a query to a peer.
+    SendingQuery,
+    // Got a response from a peer.
+    PeerResponse,
+    // Found a "closest" peer (not currently used).
+    FinalPeer,
+    // Got an error when querying.
+    QueryError,
+    // Found a provider.
+    Provider,
+    // Found a value.
+    Value,
+    // Adding a peer to the query.
+    AddingPeer,
+    // Dialing a peer.
+    DialingPeer;
+  }
+
+  public class DHT {
+    /**
+     * Find internet addresses of a given peer.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-dht-findpeer">dht/findpeer in IPFS doc</a>.
+     * @param id the id of the peer to query
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return List of Multiaddresses of the peer
+     */
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {
+      AtomicReference<List<String>> ret = new AtomicReference<>(new ArrayList<>());
+      timeLimitedExec(
+          "dht/findpeer?arg=" + id,
+          timeout,
+          res -> {
+            Map peer = (Map) res;
+            if (peer == null) {
+              return false;
+            }
+            if ((int) peer.get("Type") != DHTQueryEventType.FinalPeer.ordinal()) {
+              return false;
+            }
+            List<Map> responses = (List<Map>) peer.get("Responses");
+            if (responses == null || responses.size() == 0) {
+              return false;
+            }
+            // FinalPeer responses have exactly one response
+            Map<String, List<String>> response = responses.get(0);
+            if (response == null) {
+              return false;
+            }
+            List<String> addrs = response.get("Addrs");
+
+            ret.set(addrs);
+            return true;
+          },
+          err -> {
+            if (!(err instanceof TimeoutException)) {
+              throw new RuntimeException(err);
+            }
+          },
+          executor
+      );
+      if (ret.get().size() > 0) {
+        return ret.get();
+      } else {
+        return Collections.emptyList();
+      }
+    }
+
+    /**
+     * Find providers of a given CID.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-dht-findprovs">dht/findprovs in IPFS doc</a>.
+     * @param id the CID of the IPFS object
+     * @param maxPeers maximum number of providers to collect
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return List of Multihash of providers of the object
+     */
+    public List<String> findprovsListTimeout(Multihash id, int maxPeers, int timeout, ExecutorService executor) {
+      AtomicReference<List<String>> ret = new AtomicReference<>(new ArrayList<>());
+      timeLimitedExec(
+          "dht/findprovs?arg=" + id + "&n=" + maxPeers,
+          timeout,
+          res -> {
+            Map peer = (Map) res;
+            if (peer == null) {
+              return false;
+            }
+            if ((int) peer.get("Type") != DHTQueryEventType.Provider.ordinal()) {
+              return false;
+            }
+            List<Map> responses = (List<Map>) peer.get("Responses");
+            if (responses == null || responses.size() == 0) {
+              return false;
+            }
+            // One Provider message contains only one provider
+            Map<String, String> response = responses.get(0);
+            if (response == null) {
+              return false;
+            }
+            String providerID = response.get("ID");
+
+            ret.get().add(providerID);
+            return ret.get().size() >= maxPeers;
+          },
+          err -> {
+            if (!(err instanceof TimeoutException)) {
+              throw new RuntimeException(err);
+            }
+          },
+          executor
+      );
+      if (ret.get().size() > 0) {
+        return ret.get();
+      } else {
+        return Collections.emptyList();
+      }
+    }
+  }
+
+  public class Name {
+    /**
+     * Resolve an IPNS name.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-name-resolve">name/resolve in IPFS doc</a>.
+     * @param hash the IPNS name to resolve
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return a Multihash of resolved name
+     */
+    public Optional<String> resolve(Multihash hash, int timeout, ExecutorService executor) {
+      AtomicReference<String> ret = new AtomicReference<>();
+      timeLimitedExec(
+        "name/resolve?arg=" + hash,
+        timeout,
+        res -> {
+          Map peer = (Map) res;
+          if (peer != null) {
+            ret.set((String) peer.get(("Path")));
+            return true;
+          }
+          return false;
+        },
+        err -> {
+          if (!(err instanceof TimeoutException)) {
+            throw new RuntimeException(err);
+          }
+        },
+        executor
+      );
+      return Optional.ofNullable(ret.get());
+    }
+  }
+
+  private void timeLimitedExec(String path, int timeout, Predicate<Object> processor, Consumer<Exception> error,
+                               ExecutorService executor) {
+    CompletableFuture<Void> f = CompletableFuture.runAsync(
+      ()-> getObjectStream(path, processor, error),
+      executor
+    );
+    try {
+      f.get(timeout, TimeUnit.SECONDS);
+    } catch (TimeoutException | ExecutionException | InterruptedException e) {
+      f.cancel(true);
+      error.accept(e);
+    }
+  }
+
+  private void getObjectStream(String path, Predicate<Object> processor, Consumer<Exception> error) {
+    byte LINE_FEED = (byte)10;
+
+    try {
+      InputStream in = getStream(path);
+      ByteArrayOutputStream resp = new ByteArrayOutputStream();
+
+      byte[] buf = new byte[4096];
+      int r;
+      while ((r = in.read(buf)) >= 0) {
+        resp.write(buf, 0, r);
+        if (buf[r - 1] == LINE_FEED) {
+          try {
+            boolean done = processor.test(JSONParser.parse(resp.toString()));
+            if (done) {
+              break;
+            }
+            resp.reset();
+          } catch (IllegalStateException e) {
+            in.close();
+            resp.close();
+            error.accept(e);
+          }
+        }
+      }
+      in.close();
+      resp.close();
+    } catch (IOException e) {
+      error.accept(e);
+    }
+  }
+
+  private InputStream getStream(String path) throws IOException {
+    URL target = new URL(protocol, host, port, version + path);
+    HttpURLConnection conn = configureConnection(target, "POST", readTimeout);
+    return conn.getInputStream();
+  }
+
+  private static HttpURLConnection configureConnection(URL target, String method, int timeout) throws IOException {

Review comment:
       I'm getting an IDE warning here about `method` always being `POST`.  Can we make this a local variable?  Will it ever be `GET` or something else?
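       For illustration, a minimal sketch of that simplification, assuming these call sites only ever issue POST requests (the body of `configureConnection` is truncated above, so the connection setup shown here is a guess):

```java
private InputStream getStream(String path) throws IOException {
  URL target = new URL(protocol, host, port, version + path);
  HttpURLConnection conn = configureConnection(target, readTimeout);
  return conn.getInputStream();
}

private static HttpURLConnection configureConnection(URL target, int timeout) throws IOException {
  HttpURLConnection conn = (HttpURLConnection) target.openConnection();
  conn.setRequestMethod("POST"); // was the `method` parameter; always "POST" at the call sites
  conn.setReadTimeout(timeout);  // assumption: timeout is already in milliseconds
  return conn;
}
```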







[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470086686



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,463 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  private static final int DEFAULT_USER_PORT = 31010;
+  private static final int DEFAULT_CONTROL_PORT = 31011;
+  private static final int DEFAULT_DATA_PORT = 31012;
+  private static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("Exception in init", e);
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    List<Multihash> scanSpecList = Lists.newArrayList();
+    if (workList != null) {
+      logger.debug("workList.size(): {}", workList.size());
+
+      for (IPFSWork work : workList) {
+        scanSpecList.add(work.getPartialRootHash());
+      }
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private static class IPFSWork implements CompleteWork {
+    private final EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private final Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() {return partialRoot;}
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {
+      return 0;
+    }
+
+    @Override
+    public String toString() {
+      return "IPFSWork [root = " + partialRoot.toString() + "]";
+    }
+  }
+
+  //DRILL-7756: detect and warn about loops/recursions in case of a malformed tree
+  static class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+    private final Multihash hash;
+    private final boolean isProvider;
+    private final Map<Multihash, String> ret = new LinkedHashMap<>();
+    private final IPFSPeer myself;
+    private final IPFSHelper helper;
+    private final LoadingCache<Multihash, IPFSPeer> peerCache;
+
+    public IPFSTreeFlattener(Multihash hash, boolean isProvider, IPFSContext context) {
+      this(
+        hash,
+        isProvider,
+        context.getMyself(),
+        context.getIPFSHelper(),
+        context.getIPFSPeerCache()
+      );
+    }
+
+    IPFSTreeFlattener(Multihash hash, boolean isProvider, IPFSPeer myself, IPFSHelper ipfsHelper, LoadingCache<Multihash, IPFSPeer> peerCache) {
+      this.hash = hash;
+      this.isProvider = isProvider;
+      this.myself = myself;
+      this.helper = ipfsHelper;
+      this.peerCache = peerCache;
+    }
+
+    public IPFSTreeFlattener(IPFSTreeFlattener reference, Multihash hash, boolean isProvider) {
+      this(hash, isProvider, reference.myself, reference.helper, reference.peerCache);
+    }
+
+    @Override
+    public Map<Multihash, String> compute() {
+      try {
+        if (isProvider) {
+          IPFSPeer peer = peerCache.getUnchecked(hash);
+          ret.put(hash, peer.getDrillbitAddress().orElse(null));
+          return ret;
+        }
+
+        MerkleNode metaOrSimpleNode = helper.getObjectLinksTimeout(hash);
+        if (metaOrSimpleNode.links.size() > 0) {
+          logger.debug("{} is a meta node", hash);
+          //DRILL-7755: do something useful with leaf size, e.g. hint Drill about operation costs
+          List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+          ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+          for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+            builder.add(new IPFSTreeFlattener(this, intermediate, false));
+          }
+          ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+          subtasks.forEach(IPFSTreeFlattener::fork);
+
+          IPFSTreeFlattener first = new IPFSTreeFlattener(this, intermediates.get(0), false);
+          ret.putAll(first.compute());
+          subtasks.reverse().forEach(
+              subtask -> ret.putAll(subtask.join())
+          );
+        } else {
+          logger.debug("{} is a simple node", hash);
+          List<IPFSPeer> providers = helper.findprovsTimeout(hash).stream()
+              .map(peerCache::getUnchecked)
+              .collect(Collectors.toList());
+          providers = providers.stream()
+              .filter(IPFSPeer::isDrillReady)
+              .collect(Collectors.toList());
+          if (providers.size() < 1) {
+            logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+            providers.add(myself);
+          }
+
+          logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+          ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+          for (IPFSPeer provider : providers.subList(1, providers.size())) {
+            builder.add(new IPFSTreeFlattener(this, provider.getId(), true));
+          }
+          ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+          subtasks.forEach(IPFSTreeFlattener::fork);
+
+          List<String> possibleAddrs = new ArrayList<>();
+          Multihash firstProvider = providers.get(0).getId();
+          IPFSTreeFlattener firstTask = new IPFSTreeFlattener(this, firstProvider, true);
+          String firstAddr = firstTask.compute().get(firstProvider);
+          if (firstAddr != null) {
+            possibleAddrs.add(firstAddr);
+          }
+
+          subtasks.reverse().forEach(
+              subtask -> {
+                String addr = subtask.join().get(subtask.hash);
+                if (addr != null) {
+                  possibleAddrs.add(addr);
+                }
+              }
+          );
+
+          if (possibleAddrs.size() < 1) {
+            logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+            throw new RuntimeException("No address found for any provider for leaf " + hash);
+          } else {
+            //DRILL-7753: better peer selection algorithm
+            Random random = new Random();
+            String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+            ret.clear();
+            ret.put(hash, chosenAddr);
+            logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+          }
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+      return ret;
+    }
+  }
+

Review comment:
       Nit: Remove empty line.







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446669668



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    //TODO add more specific timeout configs for different operations in IPFS,
+    // eg. provider resolution, data read, etc.
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        //TODO Jackson failed to deserialize the ipfsTimeouts map causing NPE
+        if (ipfsTimeouts != null) {

Review comment:
       Hmm, it seems that this comment was made very early in development, and the issue it describes no longer exists. I deleted the comment in 282a89d.
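       For the record, the null check now just guards the defaulting, presumably something along these lines (a sketch only, reusing the `ipfsTimeoutDefaults` map declared above; the actual merge code sits below the truncation point of this diff):

```java
Map<IPFSTimeOut, Integer> merged = new HashMap<>(ipfsTimeoutDefaults);
if (ipfsTimeouts != null) {
  merged.putAll(ipfsTimeouts); // user-supplied values override the defaults
}
this.ipfsTimeouts = merged;
```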







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443753286



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);
+    this.client = ipfs;
+  }
+
+  public void setExecutorService(ExecutorService executorService) {
+    this.executorService = new DefaultWeakReference<>(executorService, DEFAULT_EXECUTOR);
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) throws IOException {
+    List<String> providers;
+    providers = client.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService.get());
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) throws IOException {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if (peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = client.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService.get());
+    List<MultiAddress>
+        ret = addrs
+        .stream()
+        .filter(addr -> !addr.equals(""))
+        .map(str -> new MultiAddress(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception> {
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Throws TimeoutException, so the
+   * caller has a chance to recover from a timeout.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws TimeoutException
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
+    Callable<R> task = () -> op.apply(in);
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, TimeUnit.SECONDS);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isLoopbackAddress()) {
+          continue;
+        }
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          //FIXME we don't know which local address can be reached; maybe check with InetAddress.isReachable?
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException e) {
+        continue;
+      }
+    }
+
+    return Optional.ofNullable(localAddr);
+  }
+
+  public Optional<String> getPeerDrillHostname(Multihash peerId) {
+    return getPeerData(peerId, "drill-hostname").map(Strings::fromByteArray);
+  }
+
+  public boolean isDrillReady(Multihash peerId) {
+    try {
+      return getPeerData(peerId, "drill-ready").isPresent();
+    } catch (RuntimeException e) {
+      return false;
+    }
+  }
+
+  public Optional<Multihash> getIPNSDataHash(Multihash peerId) {
+    Optional<List<MerkleNode>> links = getPeerLinks(peerId);
+    if (!links.isPresent()) {
+      return Optional.empty();
+    }
+
+    return links.get().stream()
+        .filter(l -> l.name.equals(Optional.of("drill-data")))
+        .findFirst()
+        .map(l -> l.hash);
+  }
+
+
+  private Optional<byte[]> getPeerData(Multihash peerId, String key) {
+    Optional<List<MerkleNode>> links = getPeerLinks(peerId);
+    if (!links.isPresent()) {
+      return Optional.empty();
+    }
+
+    return links.get().stream()

Review comment:
       Changed in ca71f95.







[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432972658



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);
+    this.client = ipfs;
+  }
+
+  public void setExecutorService(ExecutorService executorService) {
+    this.executorService = new DefaultWeakReference<>(executorService, DEFAULT_EXECUTOR);
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) throws IOException {
+    List<String> providers;
+    providers = client.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService.get());
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) throws IOException {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if (peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = client.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService.get());
+    List<MultiAddress>
+        ret = addrs
+        .stream()
+        .filter(addr -> !addr.equals(""))
+        .map(str -> new MultiAddress(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception> {
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Throws TimeoutException, so the
+   * caller has a chance to recover from a timeout.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws TimeoutException
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
+    Callable<R> task = () -> op.apply(in);
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, TimeUnit.SECONDS);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isLoopbackAddress()) {
+          continue;
+        }
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          //FIXME we don't know which local address can be reached; maybe check with InetAddress.isReachable?
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException e) {
+        continue;
+      }
+    }
+
+    return Optional.ofNullable(localAddr);
+  }
+
+  public Optional<String> getPeerDrillHostname(Multihash peerId) {
+    return getPeerData(peerId, "drill-hostname").map(Strings::fromByteArray);
+  }
+
+  public boolean isDrillReady(Multihash peerId) {
+    try {
+      return getPeerData(peerId, "drill-ready").isPresent();
+    } catch (RuntimeException e) {
+      return false;
+    }
+  }
+
+  public Optional<Multihash> getIPNSDataHash(Multihash peerId) {
+    Optional<List<MerkleNode>> links = getPeerLinks(peerId);
+    if (!links.isPresent()) {
+      return Optional.empty();
+    }
+
+    return links.get().stream()
+        .filter(l -> l.name.equals(Optional.of("drill-data")))
+        .findFirst()
+        .map(l -> l.hash);
+  }
+
+
+  private Optional<byte[]> getPeerData(Multihash peerId, String key) {
+    Optional<List<MerkleNode>> links = getPeerLinks(peerId);
+    if (!links.isPresent()) {
+      return Optional.empty();
+    }
+
+    return links.get().stream()

Review comment:
       Is streaming really necessary here, compared to an ordinary `for` loop?
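       E.g. the plain loop form would be roughly this (the tail of the pipeline is truncated in this diff, so the filter/map steps are inferred from `getIPNSDataHash` above):

```java
for (MerkleNode link : links.get()) {
  if (link.name.equals(Optional.of(key))) {
    return link.data; // assumption: MerkleNode exposes its payload as an Optional<byte[]>
  }
}
return Optional.empty();
```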







[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-673221990


   @dbw9580 
   Please verify that the project builds and passes all checkstyle checks.  `TestIPFQueries` currently fails checkstyle due to unused imports.
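   Running the module build locally should reproduce it, e.g. `mvn clean install -pl contrib/storage-ipfs` (assuming checkstyle runs as part of the default build here, as in the other contrib modules).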





[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452321224



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    private static final Logger logger = LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;

Review comment:
       Fixed in 48d2058.







[GitHub] [drill] cgivre commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-636503191


   @dbw9580 
   Thanks for contributing this.  Do you want review comments now?





[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r469944365



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("Exception in init", e);
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug("getSpecificScan: minorFragmentId = {}", minorFragmentId);
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: {}", workList == null);
+    logger.debug("workList.size(): {}", workList.size());
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns) {
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private class IPFSWork implements CompleteWork {
+    private EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() { return partialRoot; }
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {

Review comment:
       Actually I don't know what this method is for, as I can't find any place where it is called.
   This part of the code was borrowed from the Kudu storage plugin: https://github.com/apache/drill/blob/0726b83d9347cbb8bd1bc64a8d10c12c1125549a/contrib/storage-kudu/src/main/java/org/apache/drill/exec/store/kudu/KuduGroupScan.java#L141-L144
   It looks like many implementations just `return Long.compare(getTotalBytes(), o.getTotalBytes());`, and since `getTotalBytes` always returns `DEFAULT_NODE_SIZE` for `IPFSWork`, `compareTo` would always return `0` here.
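   For reference, the full override along those lines would just be (untested):

   ```java
   @Override
   public int compareTo(CompleteWork o) {
     // Both sides always report DEFAULT_NODE_SIZE, so this effectively returns 0.
     return Long.compare(getTotalBytes(), o.getTotalBytes());
   }
   ```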




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r449034470



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/*import org.apache.drill.common.expression.SchemaPath;*/
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static final int IPFS_SUB_SCAN_VALUE = 19155;
+  private final IPFSContext ipfsContext;
+  private final List<Multihash> ipfsSubScanSpecList;

Review comment:
       Can this just be a regular `ArrayList`?  If there's a reason why you chose to use this, that's fine, but I've not seen this done that way before.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/*import org.apache.drill.common.expression.SchemaPath;*/

Review comment:
       Please remove commented out imports. 

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    private static final Logger logger = LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private final String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        if (ipfsTimeouts != null) {
+            ipfsTimeoutDefaults.forEach(ipfsTimeouts::putIfAbsent);
+        } else {
+            ipfsTimeouts = ipfsTimeoutDefaults;
+        }
+        this.ipfsTimeouts = ipfsTimeouts;
+        this.numWorkerThreads = numWorkerThreads > 0 ? numWorkerThreads : 1;
+        this.formats = formats;
+    }
+
+    public String getHost() {
+        return host;
+    }
+
+    public int getPort() {
+        return port;
+    }
+
+    public int getMaxNodesPerLeaf() {
+        return maxNodesPerLeaf;
+    }
+
+    public int getIpfsTimeout(IPFSTimeOut which) {
+        return ipfsTimeouts.get(which);
+    }
+
+    public Map<IPFSTimeOut, Integer> getIpfsTimeouts() {
+        return ipfsTimeouts;
+    }
+
+    public int getNumWorkerThreads() {
+        return numWorkerThreads;
+    }
+
+    public Map<String, FormatPluginConfig> getFormats() {
+        return formats;
+    }
+
+    @Override
+    public int hashCode() {
+        String host_port = String.format("%s:%d[%d,%s]", host, port, maxNodesPerLeaf, ipfsTimeouts);
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + ((host_port == null) ? 0 : host_port.hashCode());
+        result = prime * result + ((formats == null) ? 0 : formats.hashCode());
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {

Review comment:
       Can this be consolidated a bit?
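   For example, something along these lines (an untested sketch using `java.util.Objects`; pick whichever fields should participate in equality):

   ```java
   @Override
   public boolean equals(Object obj) {
     if (this == obj) {
       return true;
     }
     if (obj == null || getClass() != obj.getClass()) {
       return false;
     }
     IPFSStoragePluginConfig other = (IPFSStoragePluginConfig) obj;
     return port == other.port
         && maxNodesPerLeaf == other.maxNodesPerLeaf
         && Objects.equals(host, other.host)
         && Objects.equals(ipfsTimeouts, other.ipfsTimeouts)
         && Objects.equals(formats, other.formats);
   }
   ```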

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSchemaFactory.java
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.schema.Table;
+import org.apache.drill.exec.planner.logical.DynamicDrillTable;
+import org.apache.drill.exec.store.AbstractSchema;
+import org.apache.drill.exec.store.SchemaConfig;
+import org.apache.drill.exec.store.SchemaFactory;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class IPFSSchemaFactory implements SchemaFactory {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSSchemaFactory.class);
+
+  final String schemaName;
+  final IPFSContext context;
+
+  public IPFSSchemaFactory(IPFSContext context, String name) throws IOException {

Review comment:
       `IOException` is not thrown in this method here and below.  Please remove if it is not necessary.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSchemaFactory.java
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.schema.Table;
+import org.apache.drill.exec.planner.logical.DynamicDrillTable;
+import org.apache.drill.exec.store.AbstractSchema;
+import org.apache.drill.exec.store.SchemaConfig;
+import org.apache.drill.exec.store.SchemaFactory;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class IPFSSchemaFactory implements SchemaFactory{
+  private static final Logger logger = LoggerFactory.getLogger(IPFSSchemaFactory.class);
+
+  final String schemaName;
+  final IPFSContext context;
+
+  public IPFSSchemaFactory(IPFSContext context, String name) throws IOException {
+    this.context = context;
+    this.schemaName = name;
+  }
+
+  @Override
+  public void registerSchemas(SchemaConfig schemaConfig, SchemaPlus parent) throws IOException {
+    logger.debug("registerSchemas {}", schemaName);
+    IPFSTables schema = new IPFSTables(schemaName);
+    SchemaPlus hPlus = parent.add(schemaName, schema);
+    schema.setHolder(hPlus);
+  }
+
+  class IPFSTables extends AbstractSchema {
+    private Set<String> tableNames = Sets.newHashSet();
+    private final ConcurrentMap<String, Table> tables = new ConcurrentSkipListMap<>(String::compareToIgnoreCase);
+    public IPFSTables(String name) {
+      super(ImmutableList.<String>of(), name);
+      tableNames.add(name);
+    }
+
+    public void setHolder(SchemaPlus pulsOfThis) {

Review comment:
       Spelling... should be `plusOfThis`.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/*import org.apache.drill.common.expression.SchemaPath;*/
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static final int IPFS_SUB_SCAN_VALUE = 19155;
+  private final IPFSContext ipfsContext;
+  private final List<Multihash> ipfsSubScanSpecList;
+  private final IPFSScanSpec.Format format;
+  private final List<SchemaPath> columns;
+
+
+  @JsonCreator
+  public IPFSSubScan(@JacksonInject StoragePluginRegistry registry,
+                     @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                     @JsonProperty("IPFSSubScanSpec") @JsonDeserialize(using=MultihashDeserializer.class) List<Multihash> ipfsSubScanSpecList,
+                     @JsonProperty("format") IPFSScanSpec.Format format,
+                     @JsonProperty("columns") List<SchemaPath> columns
+                     ) throws ExecutionSetupException {
+    super((String) null);
+    IPFSStoragePlugin plugin = (IPFSStoragePlugin) registry.getPlugin(ipfsStoragePluginConfig);
+    ipfsContext = plugin.getIPFSContext();
+    this.ipfsSubScanSpecList = ipfsSubScanSpecList;
+    this.format = format;
+    this.columns = columns;
+  }
+
+  public IPFSSubScan(IPFSContext ipfsContext, List<Multihash> ipfsSubScanSpecList, IPFSScanSpec.Format format, List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsSubScanSpecList = ipfsSubScanSpecList;
+    this.format = format;
+    this.columns = columns;
+  }
+
+  @JsonIgnore
+  public IPFSContext getIPFSContext() {
+    return ipfsContext;
+  }
+
+  @JsonProperty("IPFSStoragePluginConfig")
+  public IPFSStoragePluginConfig getIPFSStoragePluginConfig() {
+    return ipfsContext.getStoragePluginConfig();
+  }
+
+  @JsonProperty("columns")
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonProperty("format")
+  public IPFSScanSpec.Format getFormat() {
+    return format;
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsSubScanSpecList)
+        .field("format", format)
+        .field("columns", columns)
+        .toString();
+  }
+
+  @JsonSerialize(using = MultihashSerializer.class)
+  @JsonProperty("IPFSSubScanSpec")
+  public List<Multihash> getIPFSSubScanSpecList() {
+    return ipfsSubScanSpecList;
+  }
+
+  @Override
+  public <T, X, E extends Throwable> T accept(
+      PhysicalVisitor<T, X, E> physicalVisitor, X value) throws E {
+    return physicalVisitor.visitSubScan(this, value);
+  }
+
+  @Override
+  public Iterator<PhysicalOperator> iterator() {
+    return ImmutableSet.<PhysicalOperator>of().iterator();
+  }
+
+  @Override
+  public int getOperatorType() {
+    return IPFS_SUB_SCAN_VALUE;
+  }
+
+  @Override
+  public boolean isExecutable() {
+    return false;
+  }
+
+  @Override
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    return new IPFSSubScan(ipfsContext, ipfsSubScanSpecList, format, columns);
+  }
+
+  public static class IPFSSubScanSpec {
+    private final String targetHash;
+
+    @JsonCreator
+    public IPFSSubScanSpec(@JsonProperty("targetHash") String targetHash) {
+      this.targetHash = targetHash;
+    }
+
+    @JsonProperty
+    public String getTargetHash() {
+      return targetHash;
+    }
+  }
+
+  static class MultihashSerializer extends JsonSerializer<List<Multihash>> {
+
+    @Override
+    public void serialize(List<Multihash> value, JsonGenerator jgen,
+                          SerializerProvider provider) throws IOException, JsonProcessingException {

Review comment:
       `JsonProcessingException` is not needed here and below. 
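   Since `JsonProcessingException` is a subclass of `IOException`, declaring `throws IOException` already covers it, e.g. (the body below is just a placeholder for whatever the serializer currently does):

   ```java
   @Override
   public void serialize(List<Multihash> value, JsonGenerator jgen,
                         SerializerProvider provider) throws IOException {
     jgen.writeStartArray();
     for (Multihash hash : value) {
       jgen.writeString(hash.toBase58());
     }
     jgen.writeEndArray();
   }
   ```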

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    private static final Logger logger = LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;

Review comment:
       Do these need to have `JsonProperty` as well?
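   i.e., if they are meant to be (de)serialized explicitly, something like:

   ```java
   @JsonProperty("host")
   private final String host;

   @JsonProperty("port")
   private final int port;
   ```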

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase{
+    private static final Logger logger = LoggerFactory.getLogger(IPFSStoragePluginConfig.class);

Review comment:
       You can remove the logger in this class (and elsewhere) if it isn't being used.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSchemaFactory.java
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.schema.Table;
+import org.apache.drill.exec.planner.logical.DynamicDrillTable;
+import org.apache.drill.exec.store.AbstractSchema;
+import org.apache.drill.exec.store.SchemaConfig;
+import org.apache.drill.exec.store.SchemaFactory;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class IPFSSchemaFactory implements SchemaFactory {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSSchemaFactory.class);
+
+  final String schemaName;
+  final IPFSContext context;
+
+  public IPFSSchemaFactory(IPFSContext context, String name) throws IOException {
+    this.context = context;
+    this.schemaName = name;
+  }
+
+  @Override
+  public void registerSchemas(SchemaConfig schemaConfig, SchemaPlus parent) throws IOException {
+    logger.debug("registerSchemas {}", schemaName);
+    IPFSTables schema = new IPFSTables(schemaName);
+    SchemaPlus hPlus = parent.add(schemaName, schema);
+    schema.setHolder(hPlus);
+  }
+
+  class IPFSTables extends AbstractSchema {
+    private Set<String> tableNames = Sets.newHashSet();
+    private final ConcurrentMap<String, Table> tables = new ConcurrentSkipListMap<>(String::compareToIgnoreCase);
+    public IPFSTables(String name) {
+      super(ImmutableList.<String>of(), name);
+      tableNames.add(name);
+    }
+
+    public void setHolder(SchemaPlus pulsOfThis) {
+    }
+
+    @Override
+    public String getTypeName() {
+      return IPFSStoragePluginConfig.NAME;
+    }
+
+    @Override
+    public Set<String> getTableNames() {
+      return Collections.emptySet();
+    }
+
+    @Override
+    public Table getTable(String tableName) {
+      //TODO: better handling of table names

Review comment:
       With respect to `TODO`s please either remove them OR leave them and reference a JIRA. 
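   For example (with `DRILL-XXXX` standing in for a real issue number):

   ```java
   // TODO: DRILL-XXXX: better handling of table names
   ```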




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470091514



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanSpec.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+@JsonTypeName("IPFSScanSpec")
+public class IPFSScanSpec {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSScanSpec.class);
+
+  public enum Prefix {
+    @JsonProperty("ipfs")
+    IPFS("ipfs"),
+    @JsonProperty("ipns")
+    IPNS("ipns");
+
+    @JsonProperty("prefix")
+    private final String name;
+    Prefix(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {
+      return this.name;
+    }
+
+    @JsonCreator
+    public static Prefix of(String what) {
+      switch (what) {
+        case "ipfs" :
+          return IPFS;
+        case "ipns":
+          return IPNS;
+        default:
+          throw new InvalidParameterException("Unsupported prefix: " + what);
+      }
+    }
+  }
+
+  public enum Format {
+    @JsonProperty("json")
+    JSON("json"),
+    @JsonProperty("csv")
+    CSV("csv");
+
+    @JsonProperty("format")
+    private final String name;
+    Format(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {
+      return this.name;
+    }
+
+    @JsonCreator
+    public static Format of(String what) {
+      switch (what) {
+        case "json" :
+          return JSON;
+        case "csv":
+          return CSV;
+        default:
+          throw new InvalidParameterException("Unsupported format: " + what);
+      }
+    }
+  }
+
+  public static Set<String> formats = ImmutableSet.of("json", "csv");
+  private Prefix prefix;
+  private String path;
+  private Format formatExtension;
+  private final IPFSContext ipfsContext;
+
+  @JsonCreator
+  public IPFSScanSpec (@JacksonInject StoragePluginRegistry registry,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("prefix") Prefix prefix,
+                       @JsonProperty("format") Format format,
+                       @JsonProperty("path") String path) {
+    this.ipfsContext = registry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext();
+    this.prefix = prefix;
+    this.formatExtension = format;
+    this.path = path;
+  }
+
+  public IPFSScanSpec (IPFSContext ipfsContext, String path) {
+    this.ipfsContext = ipfsContext;
+    parsePath(path);
+  }
+
+  private void parsePath(String path) {
+    //FIXME: IPFS hashes are actually Base58 encoded, so "0" "O" "I" "l" are not valid

Review comment:
       Again, please either remove, or include a reference to a JIRA to document what needs to be done. 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-656479289


   > > Cleaning up the PR. I was thinking about the unit tests and it might be good to include unit tests using Mockito to mock up some of the various components. That way we can test at least some of this without the IPFS daemon. I can post an example if you'd like.
   > 
   > Would appreciate that.
   
   Take a look here for an example:
   
   https://github.com/apache/drill/blob/5900cdfaae20e216d4b87795bd2efc8199e648e6/contrib/storage-elastic/src/test/java/org/apache/drill/exec/store/elasticsearch/ElasticSearchGroupScanTest.java#L42-L96
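   A rough sketch of the idea (hypothetical and untested; `ipfs.id()` is the client call that would normally hit the daemon):
   
   ```java
   // Mock the IPFS client so the group scan can be exercised without a daemon
   IPFS ipfs = Mockito.mock(IPFS.class);
   Mockito.when(ipfs.id()).thenReturn(ImmutableMap.of("ID", "QmExamplePeerId"));
   // ...then wire the mocked client into the plugin/helper under test
   ```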
   
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r472205448



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       @dbw9580,
   How would this plugin work if you were joining data from IPFS with data from another storage plugin?  Would that break anything?
   
   I'm wondering whether there is some way to mark an endpoint as IPFS-only, or even as belonging to a particular query, so that it could not be misused; that would also address @vvysotskyi's concerns.
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432971981



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+public class IPFSContext {
+  private IPFS ipfsClient;
+  private IPFSHelper ipfsHelper;
+  private IPFSPeer myself;
+  private IPFSStoragePluginConfig storagePluginConfig;
+  private IPFSStoragePlugin storagePlugin;
+  private LoadingCache<Multihash, IPFSPeer> ipfsPeerCache =

Review comment:
       Why not place this in the constructor?
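   i.e. something along these lines (an untested sketch; the `load()` body stands in for whatever the field initializer does now):

   ```java
   public IPFSContext(IPFSStoragePluginConfig config, IPFSStoragePlugin plugin) {
     this.storagePluginConfig = config;
     this.storagePlugin = plugin;
     this.ipfsPeerCache = CacheBuilder.newBuilder()
         .maximumSize(1000)  // placeholder sizing
         .build(new CacheLoader<Multihash, IPFSPeer>() {
           @Override
           public IPFSPeer load(Multihash id) {
             return new IPFSPeer(getIPFSHelper(), id);  // hypothetical constructor
           }
         });
   }
   ```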




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446246594



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanBatchCreator.java
##########
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.drill.common.exceptions.ChildErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedScanFramework;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedScanFramework.ReaderFactory;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedScanFramework.ScanFrameworkBuilder;
+import org.apache.drill.exec.physical.impl.scan.framework.SchemaNegotiator;
+import org.apache.drill.exec.record.CloseableRecordBatch;
+import org.apache.drill.exec.server.options.OptionManager;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.exec.ops.ExecutorFragmentContext;
+import org.apache.drill.exec.physical.impl.BatchCreator;
+import org.apache.drill.exec.record.RecordBatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+
+public class IPFSScanBatchCreator implements BatchCreator<IPFSSubScan> {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSScanBatchCreator.class);
+
+  @Override
+  public CloseableRecordBatch getBatch(ExecutorFragmentContext context, IPFSSubScan subScan, List<RecordBatch> children)
+      throws ExecutionSetupException {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("subScanSpecList.size = {}", subScan.getIPFSSubScanSpecList().size());
+
+    try {
+      ScanFrameworkBuilder builder = createBuilder(context.getOptions(), subScan);
+      return builder.buildScanOperator(context, subScan);
+    } catch (UserException e) {
+      // Rethrow user exceptions directly

Review comment:
       Changed in 6542982.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443764319



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       Authorship can be identified from the git history. To specify multiple authors, you may [create a commit with multiple authors](https://help.github.com/en/github/committing-changes-to-your-project/creating-a-commit-with-multiple-authors).
   
   Over time this code may be changed by other committers, even if only briefly and in minor ways, so it would be hard to maintain a list of all the people who have contributed to a specific part of the code.
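   For reference, the multiple-authors commit mentioned above just means appending `Co-authored-by:` trailers to the commit message (the addresses below are placeholders):

   ```
   DRILL-7745: Add storage plugin for IPFS

   Co-authored-by: Bowen Ding <bowen@example.com>
   Co-authored-by: Yuedong Xu <yuedong@example.com>
   Co-authored-by: Liang Wang <liang@example.com>
   ```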




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r442901417



##########
File path: contrib/storage-ipfs/pom.xml
##########
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>drill-contrib-parent</artifactId>
+        <groupId>org.apache.drill.contrib</groupId>
+        <version>1.18.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>drill-ipfs-storage</artifactId>
+    <name>contrib/ipfs-storage-plugin</name>
+    <version>0.1.0</version>
+    <properties>
+        <ipfs.TestSuite>**/IPFSTestSuit.class</ipfs.TestSuite>
+    </properties>
+
+    <repositories>

Review comment:
       It's for the `ipfs-java-api` dependency. See https://github.com/ipfs-shipyard/java-ipfs-http-client#maven-gradle-sbt
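   
   For reference, a rough sketch of what that looks like in the `pom.xml`, assuming the library is consumed via JitPack as its README describes (the `groupId` and version tag below follow that README and are illustrative, not pinned by this PR):
   
   ```xml
   <repositories>
     <repository>
       <id>jitpack.io</id>
       <url>https://jitpack.io</url>
     </repository>
   </repositories>
   
   <dependency>
     <groupId>com.github.ipfs</groupId>
     <artifactId>java-ipfs-http-client</artifactId>
     <!-- illustrative version tag; check the upstream README for the latest -->
     <version>v1.3.3</version>
   </dependency>
   ```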




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470098319



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with some utilities that are specific to Drill with an IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set the maximum number of providers per leaf node. The more providers, the more time DHT queries take,
+   * but the more likely it is that we can find an optimal peer.
+   * @param maxPeersPerLeaf max number of providers to search per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    return providers.stream().map(Multihash::fromBase58).collect(Collectors.toList());
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if (peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    return addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new).collect(Collectors.toList());
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(MultiAddress::new)
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception> {
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout the execution is considered to have timed out after this amount of time, in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /*
+   * DRILL-7753: implement a more advanced algorithm that picks optimal addresses. Maybe check reachability, latency
+   * and bandwidth?
+   */
+  /**
+   * Choose a peer's network address from its advertised Multiaddresses.
+   * Prefer globally routable address over local addresses.
+   * @param peerAddrs Multiaddresses obtained from IPFS.DHT.findprovs
+   * @return network address
+   */
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException ignored) {

Review comment:
       No. The peer is likely to have other addresses that are IP addresses, which can never cause `UnknownHostException`, so it's safe to ignore it. In the rare case where a peer has no IP addresses but only an invalid hostname, the call to `pickPeerHost` will return empty. The caller is responsible for handling a peer that has no usable addresses, as is done here: https://github.com/bdchain/drill/blob/39bab375aefe1b49af9ce358525ebd7c03543231/contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java#L444-L446
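   
   A minimal sketch of that caller-side handling, reusing the `UserException` builder already used elsewhere in this plugin (variable names here are illustrative):
   
   ```java
   // Sketch: fail the query with a user-facing message when a provider
   // advertises no usable address, instead of dereferencing an empty Optional.
   String host = IPFSHelper.pickPeerHost(peerAddrs)
       .orElseThrow(() -> UserException
           .executionError(new IllegalStateException("No usable address"))
           .message("Failed to resolve an address for IPFS peer %s", peerId)
           .build(logger));
   ```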




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470084386



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,463 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  private static final int DEFAULT_USER_PORT = 31010;
+  private static final int DEFAULT_CONTROL_PORT = 31011;
+  private static final int DEFAULT_DATA_PORT = 31012;
+  private static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);

Review comment:
       I'd encourage you to use the `UserException` here and elsewhere that you think would be an error that a user might encounter.  One thing we want to avoid are error messages which simply dump the stack trace to the screen.  If you could use the `UserException` and provide a message that would help the user fix whatever went wrong, that would be best.
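   
   For example, the catch block in `init()` could become something like the following sketch, reusing the `UserException` builder pattern this PR already uses in `IPFSHelper` (the message text is illustrative):
   
   ```java
   } catch (Exception e) {
     throw UserException
         .executionError(e)
         .message("Failed to initialize IPFS group scan for %s", topHash)
         .build(logger);
   }
   ```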




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452321021



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    private static final Logger logger = LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private final String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        if (ipfsTimeouts != null) {
+            ipfsTimeoutDefaults.forEach(ipfsTimeouts::putIfAbsent);
+        } else {
+            ipfsTimeouts = ipfsTimeoutDefaults;
+        }
+        this.ipfsTimeouts = ipfsTimeouts;
+        this.numWorkerThreads = numWorkerThreads > 0 ? numWorkerThreads : 1;
+        this.formats = formats;
+    }
+
+    public String getHost() {
+        return host;
+    }
+
+    public int getPort() {
+        return port;
+    }
+
+    public int getMaxNodesPerLeaf() {
+        return maxNodesPerLeaf;
+    }
+
+    public int getIpfsTimeout(IPFSTimeOut which) {
+        return ipfsTimeouts.get(which);
+    }
+
+    public Map<IPFSTimeOut, Integer> getIpfsTimeouts() {
+        return ipfsTimeouts;
+    }
+
+    public int getNumWorkerThreads() {
+        return numWorkerThreads;
+    }
+
+    public Map<String, FormatPluginConfig> getFormats() {
+        return formats;
+    }
+
+    @Override
+    public int hashCode() {
+        String host_port = String.format("%s:%d[%d,%s]", host, port, maxNodesPerLeaf, ipfsTimeouts);
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + ((host_port == null) ? 0 : host_port.hashCode());
+        result = prime * result + ((formats == null) ? 0 : formats.hashCode());
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {

Review comment:
       Fixed in 48d2058. I just learned about `Objects.equal()`.
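   
   For reference, a sketch of the same methods written with `java.util.Objects` (Guava's `Objects.equal()` gives the same pairwise comparisons), over the fields this config class already has:
   
   ```java
   import java.util.Objects;
   
   @Override
   public int hashCode() {
     return Objects.hash(host, port, maxNodesPerLeaf, ipfsTimeouts, formats);
   }
   
   @Override
   public boolean equals(Object obj) {
     if (this == obj) {
       return true;
     }
     if (obj == null || getClass() != obj.getClass()) {
       return false;
     }
     IPFSStoragePluginConfig other = (IPFSStoragePluginConfig) obj;
     return port == other.port
         && maxNodesPerLeaf == other.maxNodesPerLeaf
         && Objects.equals(host, other.host)
         && Objects.equals(ipfsTimeouts, other.ipfsTimeouts)
         && Objects.equals(formats, other.formats);
   }
   ```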




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r475563244



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       I think the issue with the unit tests that you previously observed was a case similar to this.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-656170430


   > Cleaning up the PR. I was thinking about the unit tests and it might be good to include unit tests using Mockito to mock up some of the various components. That way we can test at least some of this without the IPFS daemon. I can post an example if you'd like.
   
   Would appreciate that.
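   
   For reference, such a mock-based test could look roughly like this sketch, assuming `IPFSHelper` is the seam to stub (the hash and JSON payload are illustrative):
   
   ```java
   import static org.mockito.Mockito.mock;
   import static org.mockito.Mockito.when;
   
   // Stub the network-facing helper so the test never talks to a live IPFS daemon.
   IPFSHelper helper = mock(IPFSHelper.class);
   Multihash hash = Multihash.fromBase58("QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n");
   // The enclosing test method would declare `throws IOException` for these stubs.
   when(helper.getObjectDataTimeout(hash)).thenReturn("{\"name\": \"drill\"}".getBytes());
   when(helper.findprovsTimeout(hash)).thenReturn(Collections.singletonList(hash));
   ```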


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r471582499



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       I need to tell Drill that IPFS peers which are also running Drill can be used to execute queries in a distributed fashion, so these Drillbit endpoints are created on the fly. Maybe we should limit these dynamically created endpoints to use by this plugin only?
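   
   One possible direction for DRILL-7777, sketched under the assumption that `ClusterCoordinator.unregister(RegistrationHandle)` can be invoked from a suitable query-completion hook (finding that hook is the open question):
   
   ```java
   // Sketch: retain the handles instead of discarding them,
   // so the on-the-fly endpoints can be removed later.
   private final List<ClusterCoordinator.RegistrationHandle> handles = new ArrayList<>();
   
   // in init(), when an endpoint is created on the fly:
   handles.add(coordinator.register(ep));
   
   // later, once the query no longer needs the dynamic endpoints:
   handles.forEach(coordinator::unregister);
   ```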




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470087561



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with some utilities that are specific to Drill with an IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set the maximum number of providers per leaf node. The more providers, the more time DHT queries take,
+   * but the more likely it is that we can find an optimal peer.
+   * @param maxPeersPerLeaf max number of providers to search per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    return providers.stream().map(Multihash::fromBase58).collect(Collectors.toList());
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if (peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    return addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new).collect(Collectors.toList());
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(MultiAddress::new)
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception> {
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout the execution is considered to have timed out after this amount of time, in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);

Review comment:
       Please include a message here. 
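   
   For instance, a one-line sketch with an illustrative message:
   
   ```java
   } catch (CancellationException | InterruptedException e) {
     throw UserException
         .executionError(e)
         .message("IPFS operation was cancelled or interrupted")
         .build(logger);
   }
   ```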




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674045895


   The `connection rejected: /127.0.0.1:31011` failure occurred because Drill sometimes does not bind to the default ports (`31010, 31011, 31012`). It can bind to later ports like `31013, 31014, 31015`, hence the rejected connection.
   
   I believe the reason Drill didn't bind to the default ports is that those ports were still held by the process from the last test run and had not yet been released by the system. If I wait a minute or two before starting another round of testing, the test is likely to pass.
   
   This is part of DRILL-7754, but I haven't come up with a plan to reliably store the port information in IPFS.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674940667


   @dbw9580 
   Can you please rebase on current master as well?


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-646107713


   Yes, I'll try my best. I was stuck on CSV and writer support and haven't made much progress so far. Can we settle for basic JSON and reader support for now, and maybe add the rest later?


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674062414


   @dbw9580 
   The unit tests are passing now on my machine.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432973481



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    //TODO add more specific timeout configs for different operations in IPFS,
+    // eg. provider resolution, data read, etc.
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        //TODO Jackson failed to deserialize the ipfsTimeouts map causing NPE
+        if (ipfsTimeouts != null) {
+            ipfsTimeoutDefaults.forEach(ipfsTimeouts::putIfAbsent);
+        } else {
+            ipfsTimeouts = ipfsTimeoutDefaults;
+        }
+        this.ipfsTimeouts = ipfsTimeouts;
+        this.numWorkerThreads = numWorkerThreads > 0 ? numWorkerThreads : 1;
+        this.formats = formats;
+    }
+
+    public String getHost() {
+        return host;
+    }
+
+    public int getPort() {
+        return port;
+    }
+
+    public int getMaxNodesPerLeaf() {
+        return maxNodesPerLeaf;
+    }
+
+    public int getIpfsTimeout(IPFSTimeOut which) {
+        return ipfsTimeouts.get(which);
+    }
+
+    public Map<IPFSTimeOut, Integer> getIpfsTimeouts() {
+        return ipfsTimeouts;
+    }
+
+    public int getNumWorkerThreads() {
+        return numWorkerThreads;
+    }
+
+    public Map<String, FormatPluginConfig> getFormats() {
+        return formats;
+    }
+
+    @Override
+    public int hashCode() {
+        String host_port = String.format("%s:%d[%d,%s]", host, port, maxNodesPerLeaf, ipfsTimeouts);
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + ((host_port == null) ? 0 : host_port.hashCode());
+        result = prime * result + ((formats == null) ? 0 : formats.hashCode());
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null) {
+            return false;
+        }
+        if (getClass() != obj.getClass()) {

Review comment:
       This can be consolidated into the single `if` above with `if (obj == null || getClass() != ...)`
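       For example, with the remaining field comparisons left as they are:

       ```java
       if (obj == null || getClass() != obj.getClass()) {
         return false;
       }
       ```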







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470676371



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanSpec.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+@JsonTypeName("IPFSScanSpec")
+public class IPFSScanSpec {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSScanSpec.class);
+
+  public enum Prefix {
+    @JsonProperty("ipfs")
+    IPFS("ipfs"),
+    @JsonProperty("ipns")
+    IPNS("ipns");
+
+    @JsonProperty("prefix")
+    private final String name;
+    Prefix(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {
+      return this.name;
+    }
+
+    @JsonCreator
+    public static Prefix of(String what) {
+      switch (what) {
+        case "ipfs" :
+          return IPFS;
+        case "ipns":
+          return IPNS;
+        default:
+          throw new InvalidParameterException("Unsupported prefix: " + what);
+      }
+    }
+  }
+
+  public enum Format {
+    @JsonProperty("json")
+    JSON("json"),
+    @JsonProperty("csv")
+    CSV("csv");
+
+    @JsonProperty("format")
+    private final String name;
+    Format(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {
+      return this.name;
+    }
+
+    @JsonCreator
+    public static Format of(String what) {
+      switch (what) {
+        case "json" :
+          return JSON;
+        case "csv":
+          return CSV;
+        default:
+          throw new InvalidParameterException("Unsupported format: " + what);
+      }
+    }
+  }
+
+  public static Set<String> formats = ImmutableSet.of("json", "csv");
+  private Prefix prefix;
+  private String path;
+  private Format formatExtension;
+  private final IPFSContext ipfsContext;
+
+  @JsonCreator
+  public IPFSScanSpec (@JacksonInject StoragePluginRegistry registry,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("prefix") Prefix prefix,
+                       @JsonProperty("format") Format format,
+                       @JsonProperty("path") String path) {
+    this.ipfsContext = registry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext();
+    this.prefix = prefix;
+    this.formatExtension = format;
+    this.path = path;
+  }
+
+  public IPFSScanSpec (IPFSContext ipfsContext, String path) {
+    this.ipfsContext = ipfsContext;
+    parsePath(path);
+  }
+
+  private void parsePath(String path) {
+    //FIXME: IPFS hashes are actually Base58 encoded, so "0" "O" "I" "l" are not valid

Review comment:
       Removed in d2ea637.
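       (For reference, the Base58 alphabet indeed excludes `0`, `O`, `I` and `l`, so a stricter match for v0 CIDs could be sketched with the `java.util.regex.Pattern` import the file already has:)

       ```java
       // Base58btc alphabet: digits 1-9 plus letters, minus 0, O, I, l;
       // v0 CIDs are 46 characters starting with "Qm"
       Pattern cidV0 = Pattern.compile("Qm[1-9A-HJ-NP-Za-km-z]{44}");
       ```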







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r471546082



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static final int IPFS_SUB_SCAN_VALUE = 19155;

Review comment:
       Done.







[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-737693796


   > @dbw9580
   > Hi there! I hope all is well. Are you still interested in completing this PR?
   
   Yes. I'm currently busy with other projects and haven't had time to look further into this. I remember we were having some discussions about the way this plugin interacts with the Drill coordinator, which needs a major design reconsideration. When I can spare more time, I will continue where I left off.





[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432973530



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase{
+    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePluginConfig.class);
+
+    public static final String NAME = "ipfs";
+
+    private final String host;
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    //TODO add more specific timeout configs for different operations in IPFS,
+    // e.g. provider resolution, data read, etc.
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        //TODO Jackson failed to deserialize the ipfsTimeouts map causing NPE
+        if (ipfsTimeouts != null) {
+            ipfsTimeoutDefaults.forEach(ipfsTimeouts::putIfAbsent);
+        } else {
+            ipfsTimeouts = ipfsTimeoutDefaults;
+        }
+        this.ipfsTimeouts = ipfsTimeouts;
+        this.numWorkerThreads = numWorkerThreads > 0 ? numWorkerThreads : 1;
+        this.formats = formats;
+    }
+
+    public String getHost() {
+        return host;
+    }
+
+    public int getPort() {
+        return port;
+    }
+
+    public int getMaxNodesPerLeaf() {
+        return maxNodesPerLeaf;
+    }
+
+    public int getIpfsTimeout(IPFSTimeOut which) {
+        return ipfsTimeouts.get(which);
+    }
+
+    public Map<IPFSTimeOut, Integer> getIpfsTimeouts() {
+        return ipfsTimeouts;
+    }
+
+    public int getNumWorkerThreads() {
+        return numWorkerThreads;
+    }
+
+    public Map<String, FormatPluginConfig> getFormats() {
+        return formats;
+    }
+
+    @Override
+    public int hashCode() {
+        String host_port = String.format("%s:%d[%d,%s]", host, port, maxNodesPerLeaf, ipfsTimeouts);
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + ((host_port == null) ? 0 : host_port.hashCode());
+        result = prime * result + ((formats == null) ? 0 : formats.hashCode());
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null) {
+            return false;
+        }
+        if (getClass() != obj.getClass()) {
+            return false;
+        }
+        IPFSStoragePluginConfig other = (IPFSStoragePluginConfig) obj;
+        if (formats == null) {
+            if (other.formats != null) {
+                return false;
+            }
+        } else if (!formats.equals(other.formats)) {
+            return false;
+        }
+        if (host == null) {

Review comment:
       ```java
    if (host == null ? other.host != null : !host.equals(other.host)) {
      return false;
    }
    // ... remaining field comparisons ...
   ```
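   
   Equivalently, `java.util.Objects.equals(host, other.host)` folds the null handling and the comparison into one call:
   
   ```java
   if (!Objects.equals(host, other.host)) {
     return false;
   }
   ```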







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470681998



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+import org.apache.drill.shaded.guava.com.google.common.base.Objects;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Maps;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase{
+    public static final String NAME = "ipfs";
+
+    @JsonProperty
+    private final String host;
+
+    @JsonProperty
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private final String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        if (ipfsTimeouts != null) {
+            this.ipfsTimeouts = Maps.newHashMap();
+            ipfsTimeouts.forEach(this.ipfsTimeouts::put);
+            ipfsTimeoutDefaults.forEach(this.ipfsTimeouts::putIfAbsent);
+        } else {
+            this.ipfsTimeouts = ipfsTimeoutDefaults;
+        }
+        this.numWorkerThreads = numWorkerThreads > 0 ? numWorkerThreads : 1;
+        this.formats = formats;
+    }
+
+    public String getHost() {
+        return host;
+    }
+
+    public int getPort() {
+        return port;
+    }
+
+    public int getMaxNodesPerLeaf() {
+        return maxNodesPerLeaf;
+    }
+
+    public int getIpfsTimeout(IPFSTimeOut which) {
+        return ipfsTimeouts.get(which);
+    }
+
+    public Map<IPFSTimeOut, Integer> getIpfsTimeouts() {
+        return ipfsTimeouts;
+    }
+
+    public int getNumWorkerThreads() {
+        return numWorkerThreads;
+    }
+
+    public Map<String, FormatPluginConfig> getFormats() {
+        return formats;
+    }
+
+    @Override
+    public int hashCode() {

Review comment:
       Fixed in 30094a8.
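       (For reference, the string-building version can be collapsed with `java.util.Objects`, assuming the same fields as before should participate; note the file currently imports Guava's `Objects`, so an import change or a fully-qualified name would be needed:)

       ```java
       @Override
       public int hashCode() {
         return Objects.hash(host, port, maxNodesPerLeaf, ipfsTimeouts, formats);
       }
       ```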







[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674540984


   I found that if I ran the tests with `mvn clean test -DforkMode=never`, the `port already in use` errors went away. I have no idea why.
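   
   (A plausible explanation: `forkMode=never` runs the tests in the current Maven JVM instead of forked ones, so per-fork fixtures start only once and cannot race to bind the same port. Note that `forkMode` is deprecated in recent Surefire versions; `-DforkCount=0` should be the equivalent flag.)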





[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432972913



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);
+    this.client = ipfs;
+  }
+
+  public void setExecutorService(ExecutorService executorService) {
+    this.executorService = new DefaultWeakReference<>(executorService, DEFAULT_EXECUTOR);
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) throws IOException {
+    List<String> providers;
+    providers = client.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService.get());
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) throws IOException {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = client.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService.get());
+    List<MultiAddress>
+        ret = addrs
+        .stream()
+        .filter(addr -> !addr.equals(""))
+        .map(str -> new MultiAddress(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Throws TimeoutException, so the
+   * caller has a chance to recover from a timeout.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws TimeoutException
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
+    Callable<R> task = () -> op.apply(in);
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, TimeUnit.SECONDS);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isLoopbackAddress()) {
+          continue;
+        }
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          //FIXME we don't know which local address can be reached; maybe check with InetAddress.isReachable?
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException e) {
+        continue;
+      }
+    }
+
+    return Optional.ofNullable(localAddr);
+  }
+
+  public Optional<String> getPeerDrillHostname(Multihash peerId) {
+    return getPeerData(peerId, "drill-hostname").map(Strings::fromByteArray);
+  }
+
+  public boolean isDrillReady(Multihash peerId) {
+    try {
+      return getPeerData(peerId, "drill-ready").isPresent();
+    } catch (RuntimeException e) {
+      return false;
+    }
+  }
+
+  public Optional<Multihash> getIPNSDataHash(Multihash peerId) {
+    Optional<List<MerkleNode>> links = getPeerLinks(peerId);
+    if (!links.isPresent()) {
+      return Optional.empty();
+    }
+
+    return links.get().stream()
+        .filter(l -> l.name.equals(Optional.of("drill-data")))
+        .findFirst()
+        .map(l -> l.hash);
+  }
+
+
+  private Optional<byte[]> getPeerData(Multihash peerId, String key) {
+    Optional<List<MerkleNode>> links = getPeerLinks(peerId);
+    if (!links.isPresent()) {
+      return Optional.empty();
+    }
+
+    return links.get().stream()
+        .filter(l -> l.name.equals(Optional.of(key)))
+        .findFirst()
+        .map(l -> {
+          try {
+            return client.object.data(l.hash);
+          } catch (IOException e) {
+            return null;
+          }
+        });
+  }
+
+  private Optional<List<MerkleNode>> getPeerLinks(Multihash peerId) {
+    try {
+      Optional<String> optionalPath = client.name.resolve(peerId, 30);
+      if (!optionalPath.isPresent()) {
+        return Optional.empty();
+      }
+      String path = optionalPath.get().substring(6); // path starts with /ipfs/Qm...
+
+      List<MerkleNode> links = client.object.get(Multihash.fromBase58(path)).links;
+      if (links.size() < 1) {

Review comment:
       ```java
    try {
      List<MerkleNode> links = client.object.get(Multihash.fromBase58(path)).links;
      if (links.size() > 0) {
        return Optional.of(links);
      }
    } catch (IOException e) {
      // fall through to the empty result
    }
    
    return Optional.empty();
   ```
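   
   As a usage sketch, the `timedFailure` wrapper in the quoted file is invoked like this elsewhere in the plugin (see `IPFSGroupScan`):
   
   ```java
   MerkleNode node = ipfsHelper.timedFailure(
       ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
   ```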







[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r467934049



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions enforcing all API calls to be made with POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       Ah, OK. Seems like we need to update Drill to use a more recent version of Java.







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470089240



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(IPFS ipfs) {
+    this(ipfs.host, ipfs.port);
+  }
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0/", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if(ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  /**
+   * Resolve names to IPFS CIDs.
+   * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-resolve">resolve in IPFS doc</a>.
+   * @param scheme the scheme of the name to resolve, usually IPFS or IPNS
+   * @param path the path to the object
+   * @param recursive whether recursively resolve names until it is a IPFS CID
+   * @return a Map of JSON object, with the result as the value of key "Path"
+   */
+  public Map resolve(String scheme, String path, boolean recursive) {
+    AtomicReference<Map> ret = new AtomicReference<>();
+    getObjectStream(
+        "resolve?arg=/" + scheme+"/"+path +"&r="+recursive,
+        res -> {
+          ret.set((Map) res);
+          return true;
+        },
+        err -> {
+          throw new RuntimeException(err);
+        }
+    );
+    return ret.get();
+  }
+
+  /**
+   * As defined in https://github.com/libp2p/go-libp2p-core/blob/b77fd280f2bfcce22f10a000e8e1d9ec53c47049/routing/query.go#L16
+   */
+  public enum DHTQueryEventType {
+    // Sending a query to a peer.
+    SendingQuery,
+    // Got a response from a peer.
+    PeerResponse,
+    // Found a "closest" peer (not currently used).
+    FinalPeer,
+    // Got an error when querying.
+    QueryError,
+    // Found a provider.
+    Provider,
+    // Found a value.
+    Value,

Review comment:
       No. I included them for the sake of completeness. Should they be removed?







[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452281356



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSchemaFactory.java
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.schema.Table;
+import org.apache.drill.exec.planner.logical.DynamicDrillTable;
+import org.apache.drill.exec.store.AbstractSchema;
+import org.apache.drill.exec.store.SchemaConfig;
+import org.apache.drill.exec.store.SchemaFactory;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class IPFSSchemaFactory implements SchemaFactory{
+  private static final Logger logger = LoggerFactory.getLogger(IPFSSchemaFactory.class);
+
+  final String schemaName;
+  final IPFSContext context;
+
+  public IPFSSchemaFactory(IPFSContext context, String name) throws IOException {
+    this.context = context;
+    this.schemaName = name;
+  }
+
+  @Override
+  public void registerSchemas(SchemaConfig schemaConfig, SchemaPlus parent) throws IOException {
+    logger.debug("registerSchemas {}", schemaName);
+    IPFSTables schema = new IPFSTables(schemaName);
+    SchemaPlus hPlus = parent.add(schemaName, schema);
+    schema.setHolder(hPlus);
+  }
+
+  class IPFSTables extends AbstractSchema {
+    private Set<String> tableNames = Sets.newHashSet();
+    private final ConcurrentMap<String, Table> tables = new ConcurrentSkipListMap<>(String::compareToIgnoreCase);
+    public IPFSTables (String name) {
+      super(ImmutableList.<String>of(), name);
+      tableNames.add(name);
+    }
+
+    public void setHolder(SchemaPlus pulsOfThis) {
+    }
+
+    @Override
+    public String getTypeName() {
+      return IPFSStoragePluginConfig.NAME;
+    }
+
+    @Override
+    public Set<String> getTableNames() {
+      return Collections.emptySet();
+    }
+
+    @Override
+    public Table getTable(String tableName) {
+      //TODO: better handling of table names

Review comment:
       In that case, perhaps create a JIRA and reference it in the code comments. It's fine with me to leave the code, but please add an explanation of why it's there and what the plans are.
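       Something like this, with `DRILL-XXXX` standing in for the actual ticket number:

       ```java
       // TODO: better handling of table names, tracked in DRILL-XXXX
       ```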







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446225725



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) { // occasionally this still fails?
+      // incomingEndpoints is already sorted, in an order corresponding to the fragments
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // if the corresponding node has work to do
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else {
+          // if the corresponding node has no work assigned, give it an empty work unit
+        }
+      }
+    } else {
+      // if something goes wrong, fall back to the system default assignment mode?
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+
+    for (int i = 0; i < incomingEndpoints.size(); i++) {
+      logger.debug("Fragment {} on endpoint {} is assigned with works: {}", i, incomingEndpoints.get(i).getAddress(), assignments.get(i));
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug("getSpecificScan: minorFragmentId = {}", minorFragmentId);
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: {}", workList == null);
+    logger.debug("workList.size(): {}", workList.size());
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    //FIXME why 100000 * size?
+    long recordCount = 100000 * endpointWorksMap.size();

Review comment:
       This should be part of DRILL-7755, I think.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452319373



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/*import org.apache.drill.common.expression.SchemaPath;*/

Review comment:
       Fixed in 0f9c2db.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r483963623



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0 ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       @vvysotskyi @cgivre I think refactoring the cluster coordinator and the planner is beyond me. I actually tried to create an endpoint registry that manages endpoints according to the plugin they are registered with, but that didn't work out. The attempt is here: https://github.com/bdchain/drill/commit/85e7d2aec4aba370d12bd052eeb9f5760578886a
   In 0c34b56 I added a manual switch in the config that lets the user choose whether to run queries in distributed mode. If it is set to false, no endpoints are registered on the fly and the plan is executed entirely by the foreman, just like the HTTP storage plugin. I hope that, together with a clarification in the docs, this mitigates the problem a bit; a sketch of the idea follows.
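
   A minimal sketch of what such a switch might look like (the field name `distributedMode` is illustrative here, not necessarily the actual name introduced in 0c34b56):

       // In IPFSStoragePluginConfig -- hypothetical property name, for illustration only
       @JsonProperty("distributedMode")
       private final boolean distributedMode;

       @JsonIgnore
       public boolean isDistributedMode() {
         return distributedMode;
       }

       // In IPFSGroupScan.init(): only register foreign drillbits on the fly
       // when the user has opted into distributed execution
       if (config.isDistributedMode()) {
         ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
       }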




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470098319



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with some utilities that are specific to Drill with an IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set maximum number of providers per leaf node. The more providers, the more time it takes to do DHT queries, while
+   * it is more likely we can find an optimal peer.
+   * @param maxPeersPerLeaf max number of providers to search per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers;
+    providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    return providers.stream().map(Multihash::fromBase58).collect(Collectors.toList());
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    return addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new).collect(Collectors.toList());
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(MultiAddress::new)
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /*
+   * DRILL-7753: implement a more advanced algorithm that picks optimal addresses. Maybe check reachability, latency
+   * and bandwidth?
+   */
+  /**
+   * Choose a peer's network address from its advertised Multiaddresses.
+   * Prefer globally routable address over local addresses.
+   * @param peerAddrs Multiaddresses obtained from IPFS.DHT.findprovs
+   * @return network address
+   */
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException ignored) {

Review comment:
       No. The peer is likely to have other addresses that are IP addresses, which can never cause `UnknownHostException`, so it's safe to ignore. In the rare case where a peer has no IP addresses but only an invalid hostname, the call to `pickPeerHost` returns an empty Optional. The caller is responsible for handling a peer that has no addresses, like what is done here: https://github.com/bdchain/drill/blob/39bab375aefe1b49af9ce358525ebd7c03543231/contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java#L444-446
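
   For reference, a caller-side sketch of that handling (illustrative only, not the actual code at the link above):

       // Inside the loop over candidate providers:
       // skip any provider that advertises no usable address at all
       Optional<String> host = IPFSHelper.pickPeerHost(peer.getMultiAddresses());
       if (!host.isPresent()) {
         logger.warn("Provider {} has no usable address, skipping", peer.getId());
         continue;
       }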




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r444246636



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       Yes, adding names to the Jira and PR description sounds good to me. By the way, Jira has an {{Assignee}} field. I've added your Jira account to the contributor role and assigned this Jira to you.
   
   Glad to hear that such contributions are being made as part of a research project.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r467938032



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions enforcing all API calls to be made with POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       Let me make sure I understand this:
   Drill can't query IPFS versions newer than 0.4.23 due to library restrictions. We can't simply upgrade the library because it requires Java 11 and Drill is built on Java 8. Is that correct?
   
   (Sorry, not an expert on IPFS, I just want to make sure I'm understanding all this.)
   How critical would you say this is for functionality? Is there some workaround possible so that Drill will work with the latest IPFS version?
   
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r471131681



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static final int IPFS_SUB_SCAN_VALUE = 19155;

Review comment:
       One more thing... make sure that you are using the correct version(s) of `protoc` on your machine; otherwise the CI will reject your protobufs.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-673621711


   If I leave an instance of Drill running and then run the unit test (`TestIPFSQueries`), it passes. I think the unit test does not actually build and run a full Drill server, which is why the connections are rejected.
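
   For what it's worth, Drill's test framework can spin up an embedded Drillbit; a sketch of what that might look like (assuming the standard ClusterTest/ClusterFixture utilities, not the actual test code in this PR):

       // Hypothetical setup for TestIPFSQueries -- a sketch, for illustration only
       public class TestIPFSQueries extends ClusterTest {
         @BeforeClass
         public static void setup() throws Exception {
           // starts a real embedded Drillbit, so connections should not be rejected
           startCluster(ClusterFixture.builder(dirTestWatcher));
         }
       }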


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446226311



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0 ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) { // occasionally this still fails?
+      // incomingEndpoints is already sorted, in an order corresponding to the fragments
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // if the corresponding node has work to do
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else {
+          // if the corresponding node has no work assigned, give it an empty work unit
+        }
+      }
+    } else {
+      // if something goes wrong, fall back to the system default assignment mode?
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+
+    for (int i = 0; i < incomingEndpoints.size(); i++) {
+      logger.debug("Fragment {} on endpoint {} is assigned with works: {}", i, incomingEndpoints.get(i).getAddress(), assignments.get(i));
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug("getSpecificScan: minorFragmentId = {}", minorFragmentId);
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: {}", workList == null);
+    logger.debug("workList.size(): {}", workList.size());
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    //FIXME why 100000 * size?
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    //FIXME what does this mean?

Review comment:
       Removed in b6fcc0d




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r472253999



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions in later versions that enforce all API calls to be made with the POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       Fixed in 41dce52.
   
   > Should this now work with all versions of IPFS?
   
   Probably. At least 0.4.23 and 0.6.0, which I tested with, should work.
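
   For context, a minimal standalone sketch (not plugin code) of what the POST workaround amounts to, assuming a local daemon on the default API port 5001: go-ipfs 0.5+ rejects GET requests to the API, so every call is issued as a POST with an empty body.

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class PostApiCallSketch {
  public static void main(String[] args) throws Exception {
    URL url = new URL("http://127.0.0.1:5001/api/v0/id");
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("POST"); // newer daemons reject GET on the API
    conn.setDoOutput(true);
    conn.getOutputStream().close(); // empty request body

    try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
      StringBuilder body = new StringBuilder();
      String line;
      while ((line = reader.readLine()) != null) {
        body.append(line);
      }
      System.out.println(body); // JSON with the local node's peer ID
    }
  }
}
```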




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674957594


   @dbw9580 
   This is looking pretty good. I'm going to do a final check this evening or tomorrow, but can you please:
   1.  Squash all commits and use `DRILL-7745: Add storage plugin for IPFS` as the commit message
   2.  Go through and do a final code hygiene check (make sure there are no extra spaces, commented-out blocks, etc.). Drill does have a code formatter[1]; just verify that the code complies with the coding standards for spacing and so on.  (I didn't see anything jump out at me, but it always helps to double-check.)
   
   Thanks!
   
   [1]: https://drill.apache.org/docs/apache-drill-contribution-guidelines/
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470676806



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanSpec.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+@JsonTypeName("IPFSScanSpec")
+public class IPFSScanSpec {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSScanSpec.class);
+
+  public enum Prefix {
+    @JsonProperty("ipfs")
+    IPFS("ipfs"),
+    @JsonProperty("ipns")
+    IPNS("ipns");
+
+    @JsonProperty("prefix")
+    private final String name;
+    Prefix(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {
+      return this.name;
+    }
+
+    @JsonCreator
+    public static Prefix of(String what) {
+      switch (what) {
+        case "ipfs" :
+          return IPFS;
+        case "ipns":
+          return IPNS;
+        default:
+          throw new InvalidParameterException("Unsupported prefix: " + what);
+      }
+    }
+  }
+
+  public enum Format {
+    @JsonProperty("json")
+    JSON("json"),
+    @JsonProperty("csv")
+    CSV("csv");
+
+    @JsonProperty("format")
+    private final String name;
+    Format(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {
+      return this.name;
+    }
+
+    @JsonCreator
+    public static Format of(String what) {
+      switch (what) {
+        case "json" :
+          return JSON;
+        case "csv":
+          return CSV;
+        default:
+          throw new InvalidParameterException("Unsupported format: " + what);
+      }
+    }
+  }
+
+  public static Set<String> formats = ImmutableSet.of("json", "csv");
+  private Prefix prefix;
+  private String path;
+  private Format formatExtension;
+  private final IPFSContext ipfsContext;
+
+  @JsonCreator
+  public IPFSScanSpec (@JacksonInject StoragePluginRegistry registry,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("prefix") Prefix prefix,
+                       @JsonProperty("format") Format format,
+                       @JsonProperty("path") String path) {
+    this.ipfsContext = registry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext();
+    this.prefix = prefix;
+    this.formatExtension = format;
+    this.path = path;
+  }
+
+  public IPFSScanSpec (IPFSContext ipfsContext, String path) {
+    this.ipfsContext = ipfsContext;
+    parsePath(path);
+  }
+
+  private void parsePath(String path) {
+    //FIXME: IPFS hashes are actually Base58 encoded, so "0" "O" "I" "l" are not valid
+    //also CIDs can be encoded with different encodings, not necessarily Base58
+    Pattern tableNamePattern = Pattern.compile("^/(ipfs|ipns)/([A-Za-z0-9]{46}(/[^#]+)*)(?:#(\\w+))?$");
+    Matcher matcher = tableNamePattern.matcher(path);
+    if (!matcher.matches()) {
+      throw UserException.validationError()
+          .message("Invalid IPFS path in query string. Use paths of pattern "
+              + "`/scheme/hashpath#format`, where scheme:= \"ipfs\"|\"ipns\", "
+              + "hashpath:= HASH [\"/\" path], HASH is an IPFS Base58 encoded hash, "
+              + "path:= TEXT [\"/\" path], format:= \"json\"|\"csv\"")
+          .build(logger);

Review comment:
       Fixed in d2ea637.
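
   To illustrate what the table-name pattern in `parsePath` accepts, a small standalone sketch using the same regex as the diff above; the hash is the well-known IPFS null object hash, used here only as an example of a 46-character Base58 string.

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ScanSpecPathSketch {
  // same pattern as in IPFSScanSpec.parsePath()
  private static final Pattern TABLE_NAME =
      Pattern.compile("^/(ipfs|ipns)/([A-Za-z0-9]{46}(/[^#]+)*)(?:#(\\w+))?$");

  public static void main(String[] args) {
    String path = "/ipfs/QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n/a/b#json";
    Matcher m = TABLE_NAME.matcher(path);
    if (m.matches()) {
      System.out.println("scheme   = " + m.group(1)); // ipfs
      System.out.println("hashpath = " + m.group(2)); // Qmdf.../a/b
      System.out.println("format   = " + m.group(4)); // json
    }
  }
}
```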




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r469626333



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions in later versions that enforce all API calls to be made with the POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       Cool, let's just keep an eye on that for now. 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443753136



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static final ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private final T defaultValue;
+    public DefaultWeakReference(T referent, T defaultValue) {
+      super(referent);
+      this.defaultValue = defaultValue;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      return ret != null ? ret : defaultValue;
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);
+    this.client = ipfs;
+  }
+
+  public void setExecutorService(ExecutorService executorService) {
+    this.executorService = new DefaultWeakReference<>(executorService, DEFAULT_EXECUTOR);
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) throws IOException {
+    List<String> providers = client.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService.get());
+    return providers.stream()
+        .map(Multihash::fromBase58)
+        .collect(Collectors.toList());
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) throws IOException {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs = client.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService.get());
+    return addrs.stream()
+        .filter(addr -> !addr.isEmpty())
+        .map(MultiAddress::new)
+        .collect(Collectors.toList());
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Throws TimeoutException, so the
+   * caller has a chance to recover from a timeout.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws TimeoutException
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
+    Callable<R> task = () -> op.apply(in);
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, TimeUnit.SECONDS);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isLoopbackAddress()) {

Review comment:
       Removed in ca71f95.
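
   Since this diff also introduces the `timed`/`timedFailure` helpers, a standalone sketch of the pattern behind them may help: submit the operation to an executor and bound the wait with `Future.get(timeout)`. The `slowLookup` callable below is a made-up stand-in for a DHT query.

```java
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class TimedCallSketch {
  public static void main(String[] args) throws Exception {
    ExecutorService executor = Executors.newSingleThreadExecutor();
    Callable<String> slowLookup = () -> {
      TimeUnit.SECONDS.sleep(5); // pretend this is a hanging DHT query
      return "QmProviderPeerId";
    };

    Future<String> res = executor.submit(slowLookup);
    try {
      System.out.println("provider: " + res.get(1, TimeUnit.SECONDS));
    } catch (TimeoutException e) {
      res.cancel(true); // give the caller a chance to recover, as timed() does
      System.out.println("DHT lookup timed out");
    } finally {
      executor.shutdownNow();
    }
  }
}
```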




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470082144



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(IPFS ipfs) {
+    this(ipfs.host, ipfs.port);
+  }
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0/", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if (ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  /**
+   * Resolve names to IPFS CIDs.
+   * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-resolve">resolve in IPFS doc</a>.
+   * @param scheme the scheme of the name to resolve, usually IPFS or IPNS
+   * @param path the path to the object
+   * @param recursive whether to recursively resolve names until an IPFS CID is reached
+   * @return a Map representing a JSON object, with the result as the value of the key "Path"
+   */
+  public Map resolve(String scheme, String path, boolean recursive) {
+    AtomicReference<Map> ret = new AtomicReference<>();
+    getObjectStream(
+        "resolve?arg=/" + scheme+"/"+path +"&r="+recursive,
+        res -> {
+          ret.set((Map) res);
+          return true;
+        },
+        err -> {
+          throw new RuntimeException(err);
+        }
+    );
+    return ret.get();
+  }
+
+  /**
+   * As defined in https://github.com/libp2p/go-libp2p-core/blob/b77fd280f2bfcce22f10a000e8e1d9ec53c47049/routing/query.go#L16
+   */
+  public enum DHTQueryEventType {
+    // Sending a query to a peer.
+    SendingQuery,
+    // Got a response from a peer.
+    PeerResponse,
+    // Found a "closest" peer (not currently used).
+    FinalPeer,
+    // Got an error when querying.
+    QueryError,
+    // Found a provider.
+    Provider,
+    // Found a value.
+    Value,

Review comment:
       Are `Value`, `AddingPeer` and `DialingPeer` ever used?  
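
   As an aside for readers, a hedged usage sketch of the `resolve` helper quoted above, assuming a local IPFS daemon on the default API port and the plugin's `IPFSCompat` class on the classpath; the IPNS name is a placeholder, not a real peer ID.

```java
import java.util.Map;

public class ResolveSketch {
  public static void main(String[] args) {
    IPFSCompat compat = new IPFSCompat("127.0.0.1", 5001);
    // recursively resolve an IPNS name down to an immutable /ipfs/... path
    Map result = compat.resolve("ipns", "QmYourPeerIdHere", true);
    System.out.println(result.get("Path")); // e.g. /ipfs/Qm...
  }
}
```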




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446213339



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Consumer;
+
+/*
+ * Compatibility fixes for java-ipfs-http-client library
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if (ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  public class DHT {
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {

Review comment:
       Thanks!  Very helpful!
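
   The `findpeerListTimeout` signature quoted above suggests a collect-until-deadline pattern. A standalone sketch of that idea (not the plugin's actual implementation): accumulate streamed results on a worker thread and, on timeout, return whatever has arrived so far.

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class ListWithTimeoutSketch {
  public static void main(String[] args) throws Exception {
    ExecutorService executor = Executors.newSingleThreadExecutor();
    List<String> found = Collections.synchronizedList(new ArrayList<String>());

    // stand-in for reading peer addresses off a streaming DHT response
    CompletableFuture<Void> query = CompletableFuture.runAsync(() -> {
      for (int i = 0; i < 100; i++) {
        found.add("/ip4/10.0.0." + i + "/tcp/4001");
        try {
          TimeUnit.MILLISECONDS.sleep(400);
        } catch (InterruptedException e) {
          return; // stop streaming when the caller gives up
        }
      }
    }, executor);

    try {
      query.get(1, TimeUnit.SECONDS); // completes early if the stream ends
    } catch (TimeoutException e) {
      // deadline reached: keep the partial results collected so far
    }
    executor.shutdownNow();
    System.out.println("addresses found before timeout: " + found);
  }
}
```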




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443752093



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheBuilder;
+import org.apache.drill.shaded.guava.com.google.common.cache.CacheLoader;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+public class IPFSContext {
+  private IPFS ipfsClient;
+  private IPFSHelper ipfsHelper;
+  private IPFSPeer myself;
+  private IPFSStoragePluginConfig storagePluginConfig;
+  private IPFSStoragePlugin storagePlugin;
+  private LoadingCache<Multihash, IPFSPeer> ipfsPeerCache =

Review comment:
       Fixed in 48f3b80
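
   Since the diff is truncated right at the `ipfsPeerCache` declaration, here is a standalone sketch of the Guava `LoadingCache` pattern it uses, with plain Guava in place of Drill's shaded copy and `String` stand-ins for `Multihash`/`IPFSPeer`; the size and expiry settings are illustrative, not the plugin's actual values.

```java
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;

import java.util.concurrent.TimeUnit;

public class PeerCacheSketch {
  private final LoadingCache<String, String> peerCache = CacheBuilder.newBuilder()
      .maximumSize(1000)
      .expireAfterWrite(10, TimeUnit.MINUTES) // stale peer info gets re-fetched
      .build(new CacheLoader<String, String>() {
        @Override
        public String load(String peerId) {
          // in the plugin this would query the DHT for the peer's addresses
          return "peer-info-for-" + peerId;
        }
      });

  public static void main(String[] args) throws Exception {
    PeerCacheSketch sketch = new PeerCacheSketch();
    System.out.println(sketch.peerCache.get("QmSomePeer")); // loads, then caches
    System.out.println(sketch.peerCache.get("QmSomePeer")); // served from cache
  }
}
```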




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre edited a comment on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre edited a comment on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-637690257


   @dbw9580 
   One more comment.  You'll want to add your plugin to the distribution files so that it will be built when Drill is built.
   
   You'll have to do that here:
   https://github.com/apache/drill/blob/7d5b6116ba524769f8ba43ff03291eff62de1205/distribution/pom.xml#L300-L304
   
   and here:
   https://github.com/apache/drill/blob/7d5b6116ba524769f8ba43ff03291eff62de1205/distribution/src/assemble/component.xml#L28-L56
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443756725



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanSpec.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+@JsonTypeName("IPFSScanSpec")
+public class IPFSScanSpec {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSScanSpec.class);

Review comment:
       2f26c33e




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-636504741


   Yes, please. 
   
   Some major problems I'm working on are:
     * support for different data formats. Currently only JSON files are supported; CSV support was removed due to changes introduced in v1.17 and v1.18. The original implementation was a copy-paste of the easy format plugin (`org/apache/drill/exec/store/easy/text/TextFormatPlugin.java`). I wonder if there is a better way to reuse the code there.
     * support for `CREATE TABLE` statements. This will require changes to the Drill framework responsible for SQL parsing, query plan generation, etc. I would appreciate opinions from you.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446245784



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePlugin.java
##########
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import io.ipfs.api.IPFS;
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.drill.common.JSONOptions;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.server.DrillbitContext;
+import org.apache.drill.exec.store.AbstractStoragePlugin;
+import org.apache.drill.exec.store.SchemaConfig;
+
+import java.io.IOException;
+import java.util.List;
+
+public class IPFSStoragePlugin extends AbstractStoragePlugin {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSStoragePlugin.class);
+
+  private final IPFSContext ipfsContext;
+  private final IPFSStoragePluginConfig pluginConfig;
+  private final IPFSSchemaFactory schemaFactory;
+  private final IPFS ipfsClient;
+
+  public IPFSStoragePlugin(IPFSStoragePluginConfig config, DrillbitContext context, String name) throws IOException {
+    super(context, name);
+    this.ipfsClient = new IPFS(config.getHost(), config.getPort());
+    this.ipfsContext = new IPFSContext(config, this, ipfsClient);
+    this.schemaFactory = new IPFSSchemaFactory(this.ipfsContext, name);
+    this.pluginConfig = config;
+  }
+
+  @Override
+  public boolean supportsRead() {
+    return true;
+  }
+
+  @Override
+  public boolean supportsWrite() {
+    return true;
+  }
+
+  @Override
+  public IPFSGroupScan getPhysicalScan(String userName, JSONOptions selection) throws IOException {

Review comment:
       Changed in 3b6f5d2.
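
   For readers following along, a hedged sketch of the shape `getPhysicalScan` typically takes in Drill storage plugins (compare the HBase and Mongo plugins): deserialize the scan spec from the query's JSON selection and hand it to the group scan. Whether the commit referenced above matches this exactly is an assumption.

```java
// Inside IPFSStoragePlugin; needs com.fasterxml.jackson.core.type.TypeReference,
// com.fasterxml.jackson.databind.ObjectMapper and org.apache.drill.common.JSONOptions,
// all of which the diff above already imports.
@Override
public IPFSGroupScan getPhysicalScan(String userName, JSONOptions selection) throws IOException {
  // deserialize the scan spec the planner serialized into the query plan
  IPFSScanSpec spec = selection.getListWith(new ObjectMapper(),
      new TypeReference<IPFSScanSpec>() {});
  // null columns means scan all columns (the group scan substitutes ALL_COLUMNS)
  return new IPFSGroupScan(ipfsContext, spec, null);
}
```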




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r467923605



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions in later versions that enforce all API calls to be made with the POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       It upgraded the target Java version to 11: https://github.com/ipfs-shipyard/java-ipfs-http-client/commit/6c0016c00b9a3cd213343fa25adb5099be52a401
   Drill is still on Java 8, so I'm not sure we can do this.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446261666



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree

Review comment:
       Thanks!  
   Are any of these additional JIRAs critical to using this plugin?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-637690257


   @dbw9580 
   One more comment.  You'll want to add your plugin to the distribution files so that it will be built when Drill is built.
   
   You'll have to do that here:
   https://github.com/apache/drill/blob/7d5b6116ba524769f8ba43ff03291eff62de1205/distribution/pom.xml#L300-304
   
   and here:
   https://github.com/apache/drill/blob/7d5b6116ba524769f8ba43ff03291eff62de1205/distribution/src/assemble/component.xml#L28-L56
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674079153


   @cgivre 
   I tried to set the ports to their default values in c090a43, but it did not seem to do the trick. Why is that?


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r444243879



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       > a permanent link for attribution
   
   Yes.
   
   > name change would be lost in the shuffle
   
   Can we squash all the commits that are real code changes into one commit, followed by another two commits that simply add/delete the names?
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446280165



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree

Review comment:
       Not really, they are more of improvements.
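
   One of those improvements, the loop-detection TODO quoted above, could look like the following standalone sketch: traverse the Merkle DAG iteratively with a visited set, so a malformed back-link to an ancestor cannot recurse forever. The `Map<String, List<String>>` is a made-up stand-in for object-hash-to-child-links.

```java
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class DagLoopDetectionSketch {

  static void traverse(String root, Map<String, List<String>> links) {
    Set<String> visited = new HashSet<>();
    Deque<String> stack = new ArrayDeque<>();
    stack.push(root);
    while (!stack.isEmpty()) {
      String hash = stack.pop();
      if (!visited.add(hash)) {
        System.out.println("warning: cycle detected at " + hash);
        continue; // skip the repeated node instead of looping forever
      }
      System.out.println("visiting " + hash);
      for (String child : links.getOrDefault(hash, Collections.<String>emptyList())) {
        stack.push(child);
      }
    }
  }

  public static void main(String[] args) {
    Map<String, List<String>> links = new HashMap<>();
    links.put("QmRoot", Arrays.asList("QmA", "QmB"));
    links.put("QmA", Collections.singletonList("QmRoot")); // malformed back-link
    traverse("QmRoot", links);
  }
}
```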




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r469944365



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private class IPFSWork implements CompleteWork {
+    private EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() {return partialRoot;}
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {

Review comment:
       Actually I don't know what this method is for, as I can't find any place where it is called.
   This part of the code was borrowed from the Kudu storage plugin: https://github.com/apache/drill/blob/0726b83d9347cbb8bd1bc64a8d10c12c1125549a/contrib/storage-kudu/src/main/java/org/apache/drill/exec/store/kudu/KuduGroupScan.java#L143
   Looks like many implementations are just `return Long.compare(getTotalBytes(), o.getTotalBytes());`, and in the case of `IPFSWork`, `getTotalBytes` always returns `DEFAULT_NODE_SIZE`, so `compareTo` should return `0`.
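
   For illustration, a minimal sketch of that conventional implementation (just the pattern the Kudu plugin follows; with `IPFSWork`'s constant `getTotalBytes()` it always evaluates to `0`):

   ```java
   @Override
   public int compareTo(CompleteWork o) {
     // Both sides return DEFAULT_NODE_SIZE, so this is effectively a
     // constant 0 for any pair of IPFSWork instances; kept only for
     // consistency with other CompleteWork implementations.
     return Long.compare(getTotalBytes(), o.getTotalBytes());
   }
   ```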




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446225215



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size()>1) { // occasionally this still fails?
+      // incomingEndpoints is already sorted, in the same order as the fragments
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // if this endpoint has work assigned
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else {
+          // if this endpoint has no work assigned, give it an empty work
+        }
+      }
+    }
+    else { // if something goes wrong, fall back to the system's default assignment mode?
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+
+    for (int i = 0; i < incomingEndpoints.size(); i++) {
+      logger.debug("Fragment {} on endpoint {} is assigned with works: {}", i, incomingEndpoints.get(i).getAddress(), assignments.get(i));

Review comment:
       Removed in b6fcc0df.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-673636877


   @dbw9580 
   The `ClusterTest` class is supposed to start a Drill cluster so that you can execute queries.  You should not need to have a Drill cluster running for the unit tests to complete.  
   
   I think the reason this isn't doing what you're expecting is that in the `initIPFS` function in `IPFSTestSuit` you are creating a plugin with a null configuration, and hence it isn't initializing correctly.   
   
   I stepped through `testSimple()` with the debugger and the `dataset` object is `null`, hence the test fails.  My suspicion is that there is one small step being missed here.  Could you please take a look and step through this to make sure that Drill is being initialized correctly?
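   
   For reference, here's a minimal sketch of how a `ClusterTest`-based suite usually registers a plugin config before queries run. This is only a sketch: `IPFSQueryTest` and `buildIpfsConfig()` are hypothetical names standing in for however `IPFSTestSuit` builds its `IPFSStoragePluginConfig`, and the table path in the query is illustrative.
   
   ```java
   import org.apache.drill.exec.store.StoragePluginRegistry;
   import org.apache.drill.test.ClusterFixture;
   import org.apache.drill.test.ClusterTest;
   import org.junit.BeforeClass;
   import org.junit.Test;
   
   public class IPFSQueryTest extends ClusterTest {
   
     @BeforeClass
     public static void setup() throws Exception {
       // Start an embedded Drillbit so test queries can execute
       // without an external Drill cluster.
       startCluster(ClusterFixture.builder(dirTestWatcher));
   
       // Register the plugin with a concrete (non-null) config and enable it.
       StoragePluginRegistry registry = cluster.drillbit().getContext().getStorage();
       IPFSStoragePluginConfig config = buildIpfsConfig(); // hypothetical helper
       config.setEnabled(true);
       registry.put("ipfs", config);
     }
   
     @Test
     public void testSimple() throws Exception {
       // With the plugin registered above, the dataset should resolve
       // instead of coming back null.
       queryBuilder().sql("SELECT * FROM ipfs.`QmYourDatasetHash#json`").run();
     }
   }
   ```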
   Thanks
   
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443526690



##########
File path: contrib/storage-ipfs/README.zh.md
##########
@@ -0,0 +1,184 @@
+# Drill Storage Plugin for IPFS

Review comment:
       Please keep only the English `README.md`, since it would be problematic for other developers to keep a translated copy up to date.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       Please remove the custom license.

##########
File path: contrib/storage-ipfs/pom.xml
##########
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>drill-contrib-parent</artifactId>
+        <groupId>org.apache.drill.contrib</groupId>
+        <version>1.18.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>drill-ipfs-storage</artifactId>
+    <name>contrib/ipfs-storage-plugin</name>
+    <version>0.1.0</version>
+    <properties>
+        <ipfs.TestSuite>**/IPFSTestSuit.class</ipfs.TestSuite>
+    </properties>
+
+    <repositories>

Review comment:
       The Drill parent pom already has a jitpack repository, so there is no need to duplicate it here.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470675112



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,463 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  private static final int DEFAULT_USER_PORT = 31010;
+  private static final int DEFAULT_CONTROL_PORT = 31011;
+  private static final int DEFAULT_DATA_PORT = 31012;
+  private static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    List<Multihash> scanSpecList = Lists.newArrayList();
+    if (workList != null) {
+      logger.debug("workList.size(): {}", workList.size());
+
+      for (IPFSWork work : workList) {
+        scanSpecList.add(work.getPartialRootHash());
+      }
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private static class IPFSWork implements CompleteWork {
+    private final EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private final Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() {return partialRoot;}
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {
+      return 0;
+    }
+
+    @Override
+    public String toString() {
+      return "IPFSWork [root = " + partialRoot.toString() + "]";

Review comment:
       Fixed in d2ea637.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674043258


   @dbw9580 I believe Drill does support connections from IPv6 sockets.  There was in fact a recent PR for this (https://github.com/apache/drill/pull/1857), but I'm not sure if it is directly relevant. 
   Were you able to get it working?
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-646094399


   @dbw9580 
   We're looking to get a release of Drill out by the end of June.  If you can get these revisions done soon, I can help expedite the review so that we can get this (or at least an MVP of this) into Drill 1.18. 


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443908319



##########
File path: contrib/storage-ipfs/README.md
##########
@@ -0,0 +1,182 @@
+# Drill Storage Plugin for IPFS
+
+[中文](README.zh.md)
+
+## Contents
+
+0. [Introduction](#Introduction)
+1. [Compile](#Compile)
+2. [Install](#Install)
+3. [Configuration](#Configuration)
+4. [Run](#Run)
+
+## Introduction
+
+Minerva is a storage plugin for Drill that connects IPFS's decentralized storage with Drill's flexible query engine. Any data file stored on IPFS can be easily accessed from Drill's query interface, just like a file stored on a local disk. Moreover, with Drill's capability for distributed execution, other instances that are also running Minerva can help accelerate the execution: the data stays where it was, and the queries go to the most suitable nodes, which store the data locally and can thus perform the operations most efficiently. 
+
+Slides that explain our ideas and the technical details of Minerva: <https://www.slideshare.net/BowenDing4/minerva-ipfs-storage-plugin-for-ipfs>
+
+A live demo: <http://www.datahub.pub/>, hosted on a private Minerva cluster.
+
+Note that it is still in the early stages of development, and its overall stability and performance are not yet satisfactory. PRs are very much welcome!

Review comment:
       When we're ready to commit, can you please update the docs and remove all the language about building Drill, etc.?

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree

Review comment:
       Please either remove `TODO` OR leave it and reference a JIRA. 
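   
   For example, a JIRA-referencing form of that comment (the issue id below is a placeholder, not a real ticket):
   
   ```java
   //TODO DRILL-XXXX: detect and warn about loops/recursions in a malformed tree
   ```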

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) { // occasionally this still fails?
+      // incomingEndpoints is already sorted, matching the fragment order
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // the node has work assigned
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else { // the node has no work scheduled; assign it an empty work
+        }
+      }
+    } else { // if something goes wrong, fall back to the default assignment mode?
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+
+    for (int i = 0; i < incomingEndpoints.size(); i++) {
+      logger.debug("Fragment {} on endpoint {} is assigned with works: {}", i, incomingEndpoints.get(i).getAddress(), assignments.get(i));

Review comment:
       This seems like it is just generating log messages.  Should this be doing something else?
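
    If the loop is only meant to produce log output, one option is to guard it behind the debug level so production runs skip the iteration entirely. A sketch using only the fields visible in this diff:

```java
// Sketch only: the loop has no side effects besides logging, so emit the
// per-fragment assignment dump only when debug logging is enabled.
if (logger.isDebugEnabled()) {
  for (int i = 0; i < incomingEndpoints.size(); i++) {
    logger.debug("Fragment {} on endpoint {} is assigned with works: {}",
        i, incomingEndpoints.get(i).getAddress(), assignments.get(i));
  }
}
```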

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Consumer;
+
+/**
+ * Compatibility fixes for the java-ipfs-http-client library.
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if (ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  public class DHT {
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {
+      BlockingQueue<CompletableFuture<Object>> results = new LinkedBlockingQueue<>();
+      executor.submit(() -> retrieveAndParseStream("dht/findpeer?arg=" + id, results));
+
+      try {
+        long stop = System.currentTimeMillis() + TimeUnit.SECONDS.toMillis(timeout);
+        while (System.currentTimeMillis() < stop) {
+          Map peer = (Map) results.poll(timeout, TimeUnit.SECONDS);
+          if (peer != null) {
+            if ((int) peer.get("Type") == 2) {
+              return (List<String>)
+                  ((Map)
+                      ((List) peer.get("Responses")
+                      ).get(0)
+                  ).get("Addrs");
+            }
+            //else: response contains no Addrs, so ignore it.

Review comment:
       Please remove commented out code.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size()>1) { //偶尔还会出错? ("occasionally this still fails?")
+      //incomingEndpoints是已经排好顺序的endpoints,和fragment 顺序对应 ("incomingEndpoints is already sorted, matching the fragment order")

Review comment:
       Please remove or translate Chinese comments.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) { // occasionally this still fails?
+      // incomingEndpoints is already sorted, matching the fragment order
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // the node has work assigned
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else // the node has no work scheduled; assign it an empty work
+        {

Review comment:
       Does this need to be here?
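
    If fragments whose endpoint holds no local work are simply meant to receive nothing, the empty branch can be dropped entirely. A sketch using only names from this diff plus `java.util.Collections`:

```java
// Sketch only: ArrayListMultimap.get() already returns an empty collection
// for absent keys, so fragments without local work need no explicit entry.
for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
  String address = incomingEndpoints.get(fragmentId).getAddress();
  for (IPFSWork work : endpointWorksMap.getOrDefault(address, Collections.emptyList())) {
    assignments.put(fragmentId, work);
  }
}
```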

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,202 @@
+  public class DHT {
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {

Review comment:
For the benefit of code reviewers who are not experts in IPFS, would you please add some JavaDocs here and elsewhere so we can follow what's going on?
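
    For instance, a JavaDoc sketch for this method (the wording is illustrative, but the behavior it describes follows the body quoted earlier in this thread):

```java
/**
 * Resolves the network addresses of a peer by querying the DHT, giving up
 * after a timeout. Streams results from the /api/v0/dht/findpeer endpoint
 * and returns the multiaddrs from the first response whose Type is 2
 * (the final peer record).
 *
 * @param id       the peer ID to look up
 * @param timeout  how long to wait for an answer, in seconds
 * @param executor the executor that runs the underlying HTTP request
 * @return the peer's addresses, or an empty list if none arrived in time
 */
```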
   

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       @vvysotskyi 
   Can @dbw9580 put the authors' names and perhaps contact info in the `README`? Personally, I don't have a problem with that.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads

Review comment:
       Could you clarify this FIXME? If `isDrillReady` can block threads here, how is that bounded or handled?

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size()>1) { // occasionally this still fails?
+      // incomingEndpoints are already ordered, matching fragment order
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // if the corresponding node has work
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else // if the corresponding node has no work assigned, assign an empty work
+        {
+
+        }
+      }
+    }
+    else // if something goes wrong, fall back to the system default assignment mode?
+    {
+     logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+
+    for (int i = 0; i < incomingEndpoints.size(); i++) {
+      logger.debug("Fragment {} on endpoint {} is assigned with works: {}", i, incomingEndpoints.get(i).getAddress(), assignments.get(i));
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    //FIXME why 100000 * size?
+    long recordCount = 100000 * endpointWorksMap.size();

Review comment:
       This is actually really important.  Calcite uses a cost-based planner.  If you are pushing down filters, aggregates, etc., you want to make sure that the scan stats go down as you push down the filters.  There's no penalty for setting this to an arbitrarily large number to start with.
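   
   For example, a minimal sketch of stats that shrink once a filter reaches the reader (the `hasPushedDownFilter` flag and the discount factor are hypothetical, not part of this PR):
   
   ```java
   @Override
   public ScanStats getScanStats() {
     // start from an arbitrarily large estimate so pushed-down plans win on cost
     long recordCount = 100_000L * endpointWorksMap.size();
     if (hasPushedDownFilter) { // hypothetical flag set by a filter pushdown rule
       recordCount /= 10;       // fewer estimated rows => cheaper plan
     }
     return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT,
         recordCount, 1, recordCount);
   }
   ```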

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanBatchCreator.java
##########
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.drill.common.exceptions.ChildErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedScanFramework;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedScanFramework.ReaderFactory;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedScanFramework.ScanFrameworkBuilder;
+import org.apache.drill.exec.physical.impl.scan.framework.SchemaNegotiator;
+import org.apache.drill.exec.record.CloseableRecordBatch;
+import org.apache.drill.exec.server.options.OptionManager;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.exec.ops.ExecutorFragmentContext;
+import org.apache.drill.exec.physical.impl.BatchCreator;
+import org.apache.drill.exec.record.RecordBatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+
+public class IPFSScanBatchCreator implements BatchCreator<IPFSSubScan> {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSScanBatchCreator.class);
+
+  @Override
+  public CloseableRecordBatch getBatch(ExecutorFragmentContext context, IPFSSubScan subScan, List<RecordBatch> children)
+      throws ExecutionSetupException {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug(String.format("subScanSpecList.size = %d", subScan.getIPFSSubScanSpecList().size()));
+
+    try {
+      ScanFrameworkBuilder builder = createBuilder(context.getOptions(), subScan);
+      return builder.buildScanOperator(context, subScan);
+    } catch (UserException e) {
+      // Rethrow user exceptions directly

Review comment:
       This can be consolidated to just throw a `UserException`.
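   
   For instance, a sketch of the consolidated version (the message text is illustrative):
   
   ```java
   try {
     ScanFrameworkBuilder builder = createBuilder(context.getOptions(), subScan);
     return builder.buildScanOperator(context, subScan);
   } catch (UserException e) {
     throw e; // already carries user-facing context, rethrow as-is
   } catch (Throwable t) {
     // wrap everything else so the user always sees a UserException
     throw UserException.executionError(t)
         .message("Failed to create a scan batch for an IPFS sub-scan")
         .build(logger);
   }
   ```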

##########
File path: contrib/storage-ipfs/src/test/java/org/apache/drill/exec/store/ipfs/IPFSTestBase.java
##########
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+

Review comment:
       Nit:  Extra lines

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size()>1) { // occasionally this still fails?
+      // incomingEndpoints are already ordered, matching fragment order
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // if the corresponding node has work
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else // if the corresponding node has no work assigned, assign an empty work
+        {
+
+        }
+      }
+    }
+    else // if something goes wrong, fall back to the system default assignment mode?
+    {
+     logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+
+    for (int i = 0; i < incomingEndpoints.size(); i++) {
+      logger.debug("Fragment {} on endpoint {} is assigned with works: {}", i, incomingEndpoints.get(i).getAddress(), assignments.get(i));
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    //FIXME why 100000 * size?
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    //FIXME what does this mean?

Review comment:
       Remove comment.  This tells the planner that projection can be pushed down to the reader(s). 
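   
   i.e. the method body can just be (sketch):
   
   ```java
   @Override
   @JsonIgnore
   public boolean canPushdownProjects(List<SchemaPath> columns) {
     return true; // readers in this plugin handle column projection themselves
   }
   ```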

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static long DEFAULT_NODE_SIZE = 1000l;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads
+              providers = providers.stream()
+                  .filter(IPFSPeer::isDrillReady)
+                  .collect(Collectors.toList());
+              if (providers.size() < 1) {
+                logger.warn("No drill-ready provider found for leaf {}, adding foreman as the provider", hash);
+                providers.add(ipfsContext.getMyself());
+              }
+
+              logger.debug("Got {} providers for {} from IPFS", providers.size(), hash);
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (IPFSPeer provider : providers.subList(1, providers.size())) {
+                builder.add(new IPFSTreeFlattener(provider.getId(), true));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              List<String> possibleAddrs = new LinkedList<>();
+              Multihash firstProvider = providers.get(0).getId();
+              IPFSTreeFlattener firstTask = new IPFSTreeFlattener(firstProvider, true);
+              String firstAddr = firstTask.compute().get(firstProvider);
+              if (firstAddr != null) {
+                possibleAddrs.add(firstAddr);
+              }
+
+              subtasks.reverse().forEach(
+                  subtask -> {
+                    String addr = subtask.join().get(subtask.hash);
+                    if (addr != null) {
+                      possibleAddrs.add(addr);
+                    }
+                  }
+              );
+
+              if (possibleAddrs.size() < 1) {
+                logger.error("All attempts to find an appropriate provider address for {} have failed", hash);
+                throw new RuntimeException("No address found for any provider for leaf " + hash);
+              } else {
+                Random random = new Random();
+                String chosenAddr = possibleAddrs.get(random.nextInt(possibleAddrs.size()));
+                ret.clear();
+                ret.put(hash, chosenAddr);
+                logger.debug("Got peer host {} for leaf {}", chosenAddr, hash);
+              }
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          return ret;
+        }
+      }
+
+      logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+
+      Stopwatch watch = Stopwatch.createStarted();
+      //FIXME parallelization width magic number, maybe a config entry?
+      ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+      IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false);
+      Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+
+      logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //TODO read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(31010)
+              .setControlPort(31011)
+              .setDataPort(31012)
+              .setHttpPort(8047)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if(endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    }catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size()>1) { // occasionally this still fails?
+      // incomingEndpoints are already ordered, matching fragment order
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) { // if the corresponding node has work
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        } else // if the corresponding node has no work assigned, assign an empty work
+        {
+
+        }
+      }
+    }
+    else // if something goes wrong, fall back to the system default assignment mode?
+    {
+     logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+
+    for (int i = 0; i < incomingEndpoints.size(); i++) {
+      logger.debug("Fragment {} on endpoint {} is assigned with works: {}", i, incomingEndpoints.get(i).getAddress(), assignments.get(i));
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    logger.debug("workList == null: " + (workList == null? "true": "false"));
+    logger.debug(String.format("workList.size(): %d", workList.size()));
+
+    List<Multihash> scanSpecList = Lists.newArrayList();
+
+    for (IPFSWork work : workList) {
+      scanSpecList.add(work.getPartialRootHash());
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    //FIXME why 100000 * size?
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    //FIXME what does this mean?
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {

Review comment:
       Please use `PlanStringBuilder` here and in other classes which are serialized.  
   
   See below:
   https://github.com/apache/drill/blob/1b95c0a8cfce23e11596353a821a5216fd1a983d/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpGroupScan.java#L228-L235
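   
   Applied to this class, a sketch could look like (field names taken from this PR):
   
   ```java
   @Override
   public String toString() {
     return new PlanStringBuilder(this)
         .field("scan spec", ipfsScanSpec)
         .field("columns", columns)
         .toString();
   }
   ```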

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static final int IPFS_SUB_SCAN_VALUE = 19155;
+  private final IPFSContext ipfsContext;
+  private final List<Multihash> ipfsSubScanSpecList;
+  private final IPFSScanSpec.Format format;
+  private final List<SchemaPath> columns;
+
+
+  @JsonCreator
+  public IPFSSubScan(@JacksonInject StoragePluginRegistry registry,
+                     @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                     @JsonProperty("IPFSSubScanSpec") @JsonDeserialize(using=MultihashDeserializer.class) List<Multihash> ipfsSubScanSpecList,
+                     @JsonProperty("format") IPFSScanSpec.Format format,
+                     @JsonProperty("columns") List<SchemaPath> columns
+                     ) throws ExecutionSetupException {
+    super((String) null);
+    IPFSStoragePlugin plugin = (IPFSStoragePlugin) registry.getPlugin(ipfsStoragePluginConfig);
+    ipfsContext = plugin.getIPFSContext();
+    this.ipfsSubScanSpecList = ipfsSubScanSpecList;
+    this.format = format;
+    this.columns = columns;
+  }
+
+  public IPFSSubScan(IPFSContext ipfsContext, List<Multihash> ipfsSubScanSpecList, IPFSScanSpec.Format format, List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsSubScanSpecList = ipfsSubScanSpecList;
+    this.format = format;
+    this.columns = columns;
+  }
+
+  @JsonIgnore
+  public IPFSContext getIPFSContext() {
+    return ipfsContext;
+  }
+
+  @JsonProperty("IPFSStoragePluginConfig")
+  public IPFSStoragePluginConfig getIPFSStoragePluginConfig() {
+    return ipfsContext.getStoragePluginConfig();
+  }
+
+  @JsonProperty("columns")
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonProperty("format")
+  public IPFSScanSpec.Format getFormat() {
+    return format;
+  }
+

Review comment:
       I think you might want to override the `toString()` method in the SubScan. Again, use the `PlanStringBuilder` for this.
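
   A similar sketch for the SubScan, again only illustrative and assuming the fields shown in the diff above:

       @Override
       public String toString() {
         return new PlanStringBuilder(this)
             .field("scan spec", ipfsSubScanSpecList)
             .field("format", format)
             .field("columns", columns)
             .toString();
       }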




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432973089



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSPeer.java
##########
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Optional;
+
+public class IPFSPeer {
+  private IPFSHelper helper;
+
+  private Multihash id;
+  private List<MultiAddress> addrs;
+  private boolean isDrillReady;
+  private boolean isDrillReadyChecked = false;
+  private Optional<String> drillbitAddress = Optional.empty();
+  private boolean drillbitAddressChecked = false;
+
+
+  public IPFSPeer(IPFSHelper helper, Multihash id) {
+    this.helper = helper;
+    this.id = id;
+  }
+
+  IPFSPeer(IPFSHelper helper, Multihash id, List<MultiAddress> addrs) {
+    this.helper = helper;
+    this.id = id;
+    this.addrs = addrs;
+    this.isDrillReady = helper.isDrillReady(id);
+    this.isDrillReadyChecked = true;
+    this.drillbitAddress = IPFSHelper.pickPeerHost(addrs);
+    this.drillbitAddressChecked = true;
+  }
+
+  public boolean isDrillReady() {
+    if (!isDrillReadyChecked) {
+      isDrillReady = helper.isDrillReady(id);
+      isDrillReadyChecked = true;
+    }
+    return isDrillReady;
+  }
+
+  public boolean hasDrillbitAddress() {

Review comment:
       Can call `getDrillbitAddress().isPresent()` instead.
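
   That is, roughly:

       public boolean hasDrillbitAddress() {
         return getDrillbitAddress().isPresent();
       }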




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446246414



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static final int IPFS_SUB_SCAN_VALUE = 19155;
+  private final IPFSContext ipfsContext;
+  private final List<Multihash> ipfsSubScanSpecList;
+  private final IPFSScanSpec.Format format;
+  private final List<SchemaPath> columns;
+
+
+  @JsonCreator
+  public IPFSSubScan(@JacksonInject StoragePluginRegistry registry,
+                     @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                     @JsonProperty("IPFSSubScanSpec") @JsonDeserialize(using=MultihashDeserializer.class) List<Multihash> ipfsSubScanSpecList,
+                     @JsonProperty("format") IPFSScanSpec.Format format,
+                     @JsonProperty("columns") List<SchemaPath> columns
+                     ) throws ExecutionSetupException {
+    super((String) null);
+    IPFSStoragePlugin plugin = (IPFSStoragePlugin) registry.getPlugin(ipfsStoragePluginConfig);
+    ipfsContext = plugin.getIPFSContext();
+    this.ipfsSubScanSpecList = ipfsSubScanSpecList;
+    this.format = format;
+    this.columns = columns;
+  }
+
+  public IPFSSubScan(IPFSContext ipfsContext, List<Multihash> ipfsSubScanSpecList, IPFSScanSpec.Format format, List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsSubScanSpecList = ipfsSubScanSpecList;
+    this.format = format;
+    this.columns = columns;
+  }
+
+  @JsonIgnore
+  public IPFSContext getIPFSContext() {
+    return ipfsContext;
+  }
+
+  @JsonProperty("IPFSStoragePluginConfig")
+  public IPFSStoragePluginConfig getIPFSStoragePluginConfig() {
+    return ipfsContext.getStoragePluginConfig();
+  }
+
+  @JsonProperty("columns")
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonProperty("format")
+  public IPFSScanSpec.Format getFormat() {
+    return format;
+  }
+

Review comment:
       Changed in 6542982.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443756935



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanSpec.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+
+import java.io.IOException;
+import java.security.InvalidParameterException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+@JsonTypeName("IPFSScanSpec")
+public class IPFSScanSpec {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSScanSpec.class);
+
+  public enum Prefix {
+    @JsonProperty("ipfs")
+    IPFS("ipfs"),
+    @JsonProperty("ipns")
+    IPNS("ipns");
+
+    @JsonProperty("prefix")
+    private String name;
+    Prefix(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {

Review comment:
       Fixed in f2d6358




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674942760


   @cgivre I think it's based on the current master already.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432972079



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);

Review comment:
       Duplicate code with `setExecutorService()`
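
   One way to remove the duplication would be to delegate to the setter (sketch only):

       public IPFSHelper(IPFS ipfs) {
         this.client = ipfs;
         // reuse the setter so the wrapping logic lives in one place
         setExecutorService(DEFAULT_EXECUTOR);
       }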




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-647859452


   > > @dbw9580
   > > This is definitely making progress.  Will testing require an IPFS installation?
   > 
   > Yes, a running IPFS daemon is required.
   
   Can you write some tests that do not require the IPFS daemon?  I realize that we'll need it for some unit tests, but is it possible to test various components w/o the daemon?


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452279329



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSchemaFactory.java
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.schema.Table;
+import org.apache.drill.exec.planner.logical.DynamicDrillTable;
+import org.apache.drill.exec.store.AbstractSchema;
+import org.apache.drill.exec.store.SchemaConfig;
+import org.apache.drill.exec.store.SchemaFactory;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class IPFSSchemaFactory implements SchemaFactory{
+  private static final Logger logger = LoggerFactory.getLogger(IPFSSchemaFactory.class);
+
+  final String schemaName;
+  final IPFSContext context;
+
+  public IPFSSchemaFactory(IPFSContext context, String name) throws IOException {
+    this.context = context;
+    this.schemaName = name;
+  }
+
+  @Override
+  public void registerSchemas(SchemaConfig schemaConfig, SchemaPlus parent) throws IOException {
+    logger.debug("registerSchemas {}", schemaName);
+    IPFSTables schema = new IPFSTables(schemaName);
+    SchemaPlus hPlus = parent.add(schemaName, schema);
+    schema.setHolder(hPlus);
+  }
+
+  class IPFSTables extends AbstractSchema {
+    private Set<String> tableNames = Sets.newHashSet();
+    private final ConcurrentMap<String, Table> tables = new ConcurrentSkipListMap<>(String::compareToIgnoreCase);
+    public IPFSTables (String name) {
+      super(ImmutableList.<String>of(), name);
+      tableNames.add(name);
+    }
+
+    public void setHolder(SchemaPlus pulsOfThis) {
+    }
+
+    @Override
+    public String getTypeName() {
+      return IPFSStoragePluginConfig.NAME;
+    }
+
+    @Override
+    public Set<String> getTableNames() {
+      return Collections.emptySet();
+    }
+
+    @Override
+    public Table getTable(String tableName) {
+      //TODO: better handling of table names

Review comment:
       This is actually related to writer support. The initial design was to use a placeholder name for a table that has yet to be created on IPFS, e.g. ``ipfs.`create` ``. Since table names are hashes of the content, they cannot be known before the tables are created. I could delete this part of the code; it doesn't do anything anyway.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446224224



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    ipfsHelper.setMaxPeersPerLeaf(config.getMaxNodesPerLeaf());
+    ipfsHelper.setTimeouts(config.getIpfsTimeouts());
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    LoadingCache<Multihash, IPFSPeer> peerMap = ipfsContext.getIPFSPeerCache();
+
+    try {
+      //TODO detect and warn about loops/recursions in a malformed tree
+      class IPFSTreeFlattener extends RecursiveTask<Map<Multihash, String>> {
+        private Multihash hash;
+        private boolean isProvider;
+        private Map<Multihash, String> ret = new LinkedHashMap<>();
+
+        public IPFSTreeFlattener(Multihash hash, boolean isProvider) {
+          this.hash = hash;
+          this.isProvider = isProvider;
+        }
+
+        @Override
+        public Map<Multihash, String> compute() {
+          try {
+            if (isProvider) {
+              IPFSPeer peer = peerMap.getUnchecked(hash);
+              ret.put(hash, peer.hasDrillbitAddress() ? peer.getDrillbitAddress().get() : null);
+              return ret;
+            }
+
+            MerkleNode metaOrSimpleNode = ipfsHelper.timedFailure(ipfsHelper.getClient().object::links, hash, config.getIpfsTimeout(FETCH_DATA));
+            if (metaOrSimpleNode.links.size() > 0) {
+              logger.debug("{} is a meta node", hash);
+              //TODO do something useful with leaf size, e.g. hint Drill about operation costs
+              List<Multihash> intermediates = metaOrSimpleNode.links.stream().map(x -> x.hash).collect(Collectors.toList());
+
+              ImmutableList.Builder<IPFSTreeFlattener> builder = ImmutableList.builder();
+              for (Multihash intermediate : intermediates.subList(1, intermediates.size())) {
+                builder.add(new IPFSTreeFlattener(intermediate, false));
+              }
+              ImmutableList<IPFSTreeFlattener> subtasks = builder.build();
+              subtasks.forEach(IPFSTreeFlattener::fork);
+
+              IPFSTreeFlattener first = new IPFSTreeFlattener(intermediates.get(0), false);
+              ret.putAll(first.compute());
+              subtasks.reverse().forEach(
+                  subtask -> ret.putAll(subtask.join())
+              );
+
+            } else {
+              logger.debug("{} is a simple node", hash);
+              List<IPFSPeer> providers = ipfsHelper.findprovsTimeout(hash).stream()
+                  .map(id ->
+                    peerMap.getUnchecked(id)
+                  )
+                  .collect(Collectors.toList());
+              //FIXME isDrillReady may block threads

Review comment:
       `isDrillReady` queries the IPFS DHT to check whether a peer is also running a Drillbit, so that we can send query plans over for the peer to execute. However, DHT queries are very time-consuming, so they can appear to block the threads. I think it is desirable to add timeout control over the whole `IPFSTreeFlattener` execution.
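
   A rough sketch of that timeout control (hypothetical; `pool` and `totalTimeout` are assumed names, not existing code):

       ForkJoinTask<Map<Multihash, String>> task = pool.submit(new IPFSTreeFlattener(topHash, false));
       try {
         Map<Multihash, String> leafToPeer = task.get(totalTimeout, TimeUnit.SECONDS);
       } catch (TimeoutException e) {
         // stop waiting on peers whose DHT queries never return and fail the query
         task.cancel(true);
         throw UserException.executionError(e).message("IPFS scan planning timed out").build(logger);
       } catch (InterruptedException | ExecutionException e) {
         throw UserException.executionError(e).build(logger);
       }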




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446672603



##########
File path: contrib/storage-ipfs/src/test/java/org/apache/drill/exec/store/ipfs/TestIPFSQueries.java
##########
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.categories.IPFSStorageTest;
+import org.apache.drill.categories.SlowTest;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import static org.junit.Assert.fail;
+
+@Category({SlowTest.class, IPFSStorageTest.class})
+public class TestIPFSQueries extends IPFSTestBase {
+
+  @Test
+  public void testNullQuery() throws Exception {
+    testBuilder()
+        .sqlQuery(getSelectStar(IPFSHelper.IPFS_NULL_OBJECT))
+        .unOrdered()
+        .expectsNumRecords(1)

Review comment:
       Since we are running the query against the null object, the result set is expected to be empty. However, the test log file says it has one row, while the web interface clearly shows "no results". I changed this line to make the test pass, but I don't know what's going on here.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470080197



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ *
+ * Supports IPFS up to version v0.4.23, due to new restrictions that require all API calls to be made with the POST method.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       I saw this PR (https://github.com/ipfs-shipyard/java-ipfs-http-client/pull/172) was merged!  Can we:
   1.  Once there is a release with this PR merged, update the `pom.xml` so that we are using the "official" library?
   
   Should this now work with all versions of IPFS?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-650694408


   > @cgivre I've added more tests. The tests are not passing, something about `Error while applying rule DrillScanRule`. However, I was able to successfully execute the test queries through Drill web interface. I don't know how to fix these tests?
   > 
   > Edit: attach log file.
   > [org.apache.drill.exec.store.ipfs.TestIPFSQueries.txt](https://github.com/apache/drill/files/4840854/org.apache.drill.exec.store.ipfs.TestIPFSQueries.txt)
   
   It appears that the query is not getting through the planning phase.  My suggestion is to take a look at this tutorial about writing storage plugins:
   https://github.com/paul-rogers/drill/wiki/Storage-Plugin, and specifically follow the debugging procedures that Paul outlines.  My hunch here is that something is going wrong with the schema resolution. 


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443752931



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);
+    this.client = ipfs;
+  }
+
+  public void setExecutorService(ExecutorService executorService) {
+    this.executorService = new DefaultWeakReference<>(executorService, DEFAULT_EXECUTOR);
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) throws IOException {
+    List<String> providers;
+    providers = client.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService.get());
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) throws IOException {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = client.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService.get());
+    List<MultiAddress>
+        ret = addrs
+        .stream()
+        .filter(addr -> !addr.equals(""))
+        .map(str -> new MultiAddress(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within the given timeout. Throws TimeoutException, so the
+   * caller has a chance to recover from a timeout.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws TimeoutException
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
+    Callable<R> task = () -> op.apply(in);
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, TimeUnit.SECONDS);

Review comment:
       Removed in ca71f95.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-687702938


   The test failure looks unrelated. 


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470675247



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with some utilities that are specific to Drill with an IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set maximum number of providers per leaf node. The more providers, the more time it takes to do DHT queries, while
+   * it is more likely we can find an optimal peer.
+   * @param maxPeersPerLeaf max number of providers to search per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers;
+    providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    return providers.stream().map(Multihash::fromBase58).collect(Collectors.toList());
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    return addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new).collect(Collectors.toList());
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(MultiAddress::new)
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within the given timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);

Review comment:
       Fixed in d2ea637.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443751291



##########
File path: contrib/storage-ipfs/pom.xml
##########
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>drill-contrib-parent</artifactId>
+        <groupId>org.apache.drill.contrib</groupId>
+        <version>1.18.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>drill-ipfs-storage</artifactId>
+    <name>contrib/ipfs-storage-plugin</name>
+    <version>0.1.0</version>
+    <properties>
+        <ipfs.TestSuite>**/IPFSTestSuit.class</ipfs.TestSuite>
+    </properties>
+
+    <repositories>

Review comment:
       Removed in 83ffce2.







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452319560



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/*import org.apache.drill.common.expression.SchemaPath;*/
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static int IPFS_SUB_SCAN_VALUE = 19155;
+  private final IPFSContext ipfsContext;
+  private final List<Multihash> ipfsSubScanSpecList;
+  private final IPFSScanSpec.Format format;
+  private final List<SchemaPath> columns;
+
+
+  @JsonCreator
+  public IPFSSubScan(@JacksonInject StoragePluginRegistry registry,
+                     @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                     @JsonProperty("IPFSSubScanSpec") @JsonDeserialize(using=MultihashDeserializer.class) List<Multihash> ipfsSubScanSpecList,
+                     @JsonProperty("format") IPFSScanSpec.Format format,
+                     @JsonProperty("columns") List<SchemaPath> columns
+                     ) throws ExecutionSetupException {
+    super((String) null);
+    IPFSStoragePlugin plugin = (IPFSStoragePlugin) registry.getPlugin(ipfsStoragePluginConfig);
+    ipfsContext = plugin.getIPFSContext();
+    this.ipfsSubScanSpecList = ipfsSubScanSpecList;
+    this.format = format;
+    this.columns = columns;
+  }
+
+  public IPFSSubScan(IPFSContext ipfsContext, List<Multihash> ipfsSubScanSpecList, IPFSScanSpec.Format format, List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsSubScanSpecList = ipfsSubScanSpecList;
+    this.format = format;
+    this.columns = columns;
+  }
+
+  @JsonIgnore
+  public IPFSContext getIPFSContext() {
+    return ipfsContext;
+  }
+
+  @JsonProperty("IPFSStoragePluginConfig")
+  public IPFSStoragePluginConfig getIPFSStoragePluginConfig() {
+    return ipfsContext.getStoragePluginConfig();
+  }
+
+  @JsonProperty("columns")
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonProperty("format")
+  public IPFSScanSpec.Format getFormat() {
+    return format;
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsSubScanSpecList)
+        .field("format", format)
+        .field("columns", columns)
+        .toString();
+  }
+
+  @JsonSerialize(using = MultihashSerializer.class)
+  @JsonProperty("IPFSSubScanSpec")
+  public List<Multihash> getIPFSSubScanSpecList() {
+    return ipfsSubScanSpecList;
+  }
+
+  @Override
+  public <T, X, E extends Throwable> T accept(
+      PhysicalVisitor<T, X, E> physicalVisitor, X value) throws E {
+    return physicalVisitor.visitSubScan(this, value);
+  }
+
+  @Override
+  public Iterator<PhysicalOperator> iterator() {
+    return ImmutableSet.<PhysicalOperator>of().iterator();
+  }
+
+  @Override
+  public int getOperatorType() {
+    return IPFS_SUB_SCAN_VALUE;
+  }
+
+  @Override
+  public boolean isExecutable() {
+    return false;
+  }
+
+  @Override
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    return new IPFSSubScan(ipfsContext, ipfsSubScanSpecList, format, columns);
+  }
+
+  public static class IPFSSubScanSpec {
+    private final String targetHash;
+
+    @JsonCreator
+    public IPFSSubScanSpec(@JsonProperty("targetHash") String targetHash) {
+      this.targetHash = targetHash;
+    }
+
+    @JsonProperty
+    public String getTargetHash() {
+      return targetHash;
+    }
+  }
+
+  static class MultihashSerializer extends JsonSerializer<List<Multihash>> {
+
+    @Override
+    public void serialize(List<Multihash> value, JsonGenerator jgen,
+                          SerializerProvider provider) throws IOException, JsonProcessingException {

Review comment:
       Fixed in 0f9c2db.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSchemaFactory.java
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.schema.Table;
+import org.apache.drill.exec.planner.logical.DynamicDrillTable;
+import org.apache.drill.exec.store.AbstractSchema;
+import org.apache.drill.exec.store.SchemaConfig;
+import org.apache.drill.exec.store.SchemaFactory;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+
+public class IPFSSchemaFactory implements SchemaFactory{
+  private static final Logger logger = LoggerFactory.getLogger(IPFSSchemaFactory.class);
+
+  final String schemaName;
+  final IPFSContext context;
+
+  public IPFSSchemaFactory(IPFSContext context, String name) throws IOException {

Review comment:
       Fixed in 0f9c2db.
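
   On the MultihashSerializer hunk above: the serialize body is truncated in the quote. A plausible sketch, not necessarily what commit 0f9c2db contains, writes each hash as its Base58 string in a JSON array:

      import com.fasterxml.jackson.core.JsonGenerator;
      import com.fasterxml.jackson.databind.JsonSerializer;
      import com.fasterxml.jackson.databind.SerializerProvider;
      import io.ipfs.multihash.Multihash;
      import java.io.IOException;
      import java.util.List;

      // Illustrative body: emit ["Qm...", "Qm..."] so a matching
      // deserializer can rebuild the list via Multihash.fromBase58.
      static class MultihashSerializer extends JsonSerializer<List<Multihash>> {
        @Override
        public void serialize(List<Multihash> value, JsonGenerator jgen,
                              SerializerProvider provider) throws IOException {
          jgen.writeStartArray();
          for (Multihash hash : value) {
            jgen.writeString(hash.toBase58());
          }
          jgen.writeEndArray();
        }
      }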







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470025016



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for the java-ipfs-http-client library.
+ *
+ * Supports IPFS daemons up to v0.4.23; v0.5+ restricts all API calls to the POST method, which this library version does not use.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(IPFS ipfs) {
+    this(ipfs.host, ipfs.port);
+  }
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0/", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if (ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  /**
+   * Resolve names to IPFS CIDs.
+   * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-resolve">resolve in IPFS doc</a>.
+   * @param scheme the scheme of the name to resolve, usually IPFS or IPNS
+   * @param path the path to the object
+   * @param recursive whether to recursively resolve names until an IPFS CID is reached
+   * @return a Map parsed from the JSON response, with the result under the key "Path"
+   */
+  public Map resolve(String scheme, String path, boolean recursive) {
+    AtomicReference<Map> ret = new AtomicReference<>();
+    getObjectStream(
+        "resolve?arg=/" + scheme+"/"+path +"&r="+recursive,
+        res -> {
+          ret.set((Map) res);
+          return true;
+        },
+        err -> {
+          throw new RuntimeException(err);
+        }
+    );
+    return ret.get();
+  }
+
+  public class DHT {
+    /**
+     * Find internet addresses of a given peer.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-dht-findpeer">dht/findpeer in IPFS doc</a>.
+     * @param id the id of the peer to query
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return List of Multiaddresses of the peer
+     */
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {
+      AtomicReference<List<String>> ret = new AtomicReference<>();
+      timeLimitedExec(
+          "name/resolve?arg=" + id,
+          timeout,
+          res -> {
+            Map peer = (Map) res;

Review comment:
       Made some changes in 39bab37.
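
   For reference, a minimal sketch of driving this compatibility shim directly; the daemon address and the example IPNS id are assumptions, not code from this PR:

      import io.ipfs.api.IPFS;
      import java.util.Map;

      // Hypothetical sketch: resolve an IPNS path through the /api/v0/resolve
      // endpoint wrapped by IPFSCompat (daemon v0.4.23 or older, per above).
      IPFSCompat compat = new IPFSCompat(new IPFS("127.0.0.1", 5001));
      Map result = compat.resolve("ipns", "QmExamplePeerId", true);
      String resolved = (String) result.get("Path"); // e.g. "/ipfs/Qm..."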







[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470085822



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,463 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  private static final int DEFAULT_USER_PORT = 31010;
+  private static final int DEFAULT_CONTROL_PORT = 31011;
+  private static final int DEFAULT_DATA_PORT = 31012;
+  private static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);
+    }
+  }
+
+  Map<Multihash, String> getLeafAddrMappings(Multihash topHash) {
+    logger.debug("start to recursively expand nested IPFS hashes, topHash={}", topHash);
+    Stopwatch watch = Stopwatch.createStarted();
+    ForkJoinPool forkJoinPool = new ForkJoinPool(config.getNumWorkerThreads());
+    IPFSTreeFlattener topTask = new IPFSTreeFlattener(topHash, false, ipfsContext);
+    Map<Multihash, String> leafAddrMap = forkJoinPool.invoke(topTask);
+    logger.debug("Took {} ms to expand hash leaves", watch.elapsed(TimeUnit.MILLISECONDS));
+
+    return leafAddrMap;
+  }
+
+  private IPFSGroupScan(IPFSGroupScan that) {
+    super(that);
+    this.ipfsContext = that.ipfsContext;
+    this.ipfsScanSpec = that.ipfsScanSpec;
+    this.config = that.config;
+    this.assignments = that.assignments;
+    this.ipfsWorkList = that.ipfsWorkList;
+    this.endpointWorksMap = that.endpointWorksMap;
+    this.columns = that.columns;
+  }
+
+  @JsonProperty
+  public List<SchemaPath> getColumns() {
+    return columns;
+  }
+
+  @JsonIgnore
+  public IPFSStoragePlugin getStoragePlugin() {
+    return ipfsContext.getStoragePlugin();
+  }
+
+  @JsonProperty
+  public IPFSScanSpec getIPFSScanSpec() {
+    return ipfsScanSpec;
+  }
+
+  @Override
+  public List<EndpointAffinity> getOperatorAffinity() {
+    if (affinities == null) {
+      affinities = AffinityCreator.getAffinityMap(ipfsWorkList);
+    }
+    return affinities;
+  }
+
+  @Override
+  public int getMaxParallelizationWidth() {
+    DrillbitEndpoint myself = ipfsContext.getStoragePlugin().getContext().getEndpoint();
+    int width;
+    if (endpointWorksMap.containsKey(myself.getAddress())) {
+      // the foreman is also going to be a minor fragment worker under a UnionExchange operator
+      width = ipfsWorkList.size();
+    } else {
+      // the foreman does not hold data, so we have to force parallelization
+      // to make sure there is a UnionExchange operator
+      width = ipfsWorkList.size() + 1;
+    }
+    logger.debug("getMaxParallelizationWidth: {}", width);
+    return width;
+  }
+
+  @Override
+  public void applyAssignments(List<DrillbitEndpoint> incomingEndpoints) {
+    logger.debug("ipfsWorkList.size() = {}", ipfsWorkList.size());
+    logger.debug("endpointWorksMap: {}", endpointWorksMap);
+    if (endpointWorksMap.size() > 1) {
+      logger.debug("Use manual assignment");
+      assignments = ArrayListMultimap.create();
+      for (int fragmentId = 0; fragmentId < incomingEndpoints.size(); fragmentId++) {
+        String address = incomingEndpoints.get(fragmentId).getAddress();
+        if (endpointWorksMap.containsKey(address)) {
+          for (IPFSWork work : endpointWorksMap.get(address)) {
+            assignments.put(fragmentId, work);
+          }
+        }
+      }
+    } else {
+      logger.debug("Use AssignmentCreator");
+      assignments = AssignmentCreator.getMappings(incomingEndpoints, ipfsWorkList);
+    }
+  }
+
+  @Override
+  public IPFSSubScan getSpecificScan(int minorFragmentId) {
+    logger.debug(String.format("getSpecificScan: minorFragmentId = %d", minorFragmentId));
+    List<IPFSWork> workList = assignments.get(minorFragmentId);
+    List<Multihash> scanSpecList = Lists.newArrayList();
+    if (workList != null) {
+      logger.debug("workList.size(): {}", workList.size());
+
+      for (IPFSWork work : workList) {
+        scanSpecList.add(work.getPartialRootHash());
+      }
+    }
+
+    return new IPFSSubScan(ipfsContext, scanSpecList, ipfsScanSpec.getFormatExtension(), columns);
+  }
+
+  @Override
+  public ScanStats getScanStats() {
+    long recordCount = 100000 * endpointWorksMap.size();
+    return new ScanStats(ScanStats.GroupScanProperty.NO_EXACT_ROW_COUNT, recordCount, 1, recordCount);
+  }
+
+  @Override
+  public IPFSGroupScan clone(List<SchemaPath> columns){
+    logger.debug("IPFSGroupScan clone {}", columns);
+    IPFSGroupScan cloned = new IPFSGroupScan(this);
+    cloned.columns = columns;
+    return cloned;
+  }
+
+  @Override
+  @JsonIgnore
+  public boolean canPushdownProjects(List<SchemaPath> columns) {
+    return true;
+  }
+
+  @Override
+  @JsonIgnore
+  public PhysicalOperator getNewWithChildren(List<PhysicalOperator> children) {
+    Preconditions.checkArgument(children.isEmpty());
+    logger.debug("getNewWithChildren called");
+    return new IPFSGroupScan(this);
+  }
+
+  @Override
+  public String getDigest() {
+    return toString();
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+        .field("scan spec", ipfsScanSpec)
+        .field("columns", columns)
+        .toString();
+  }
+
+  private static class IPFSWork implements CompleteWork {
+    private final EndpointByteMapImpl byteMap = new EndpointByteMapImpl();
+    private final Multihash partialRoot;
+    private DrillbitEndpoint onEndpoint = null;
+
+
+    public IPFSWork(String root) {
+      this.partialRoot = Multihash.fromBase58(root);
+    }
+
+    public IPFSWork(Multihash root) {
+      this.partialRoot = root;
+    }
+
+    public Multihash getPartialRootHash() {return partialRoot;}
+
+    public void setOnEndpoint(DrillbitEndpoint endpointAddress) {
+      this.onEndpoint = endpointAddress;
+    }
+
+    @Override
+    public long getTotalBytes() {
+      return DEFAULT_NODE_SIZE;
+    }
+
+    @Override
+    public EndpointByteMap getByteMap() {
+      return byteMap;
+    }
+
+    @Override
+    public int compareTo(CompleteWork o) {
+      return 0;
+    }
+
+    @Override
+    public String toString() {
+      return "IPFSWork [root = " + partialRoot.toString() + "]";

Review comment:
       Please use the `PlanStringBuilder` for any `toString()` functions that are in serialized objects.  This will make your life easier and code cleaner. ;-)
   
   https://github.com/apache/drill/blob/0726b83d9347cbb8bd1bc64a8d10c12c1125549a/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpGroupScan.java#L228-L235
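
   Applied to IPFSWork, that suggestion would look roughly like this; the field label is illustrative:

      @Override
      public String toString() {
        return new PlanStringBuilder(this)
            .field("partialRoot", partialRoot)
            .toString();
      }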







[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-673543986


   > `TestIPFQueries` fails the checkstyle due to unused imports.
   @cgivre Hmm, I don't see any unused imports in this file, and my builds are passing.





[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-736568269


   @dbw9580 
   Hi there!  I hope all is well.  Are you still interested in completing this PR?  





[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443754475



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static final ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);
+    this.client = ipfs;
+  }
+
+  public void setExecutorService(ExecutorService executorService) {
+    this.executorService = new DefaultWeakReference<>(executorService, DEFAULT_EXECUTOR);
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) throws IOException {
+    List<String> providers;
+    providers = client.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService.get());
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) throws IOException {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if (peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = client.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService.get());
+    List<MultiAddress> ret = addrs.stream()
+        .filter(addr -> !addr.isEmpty())
+        .map(MultiAddress::new)
+        .collect(Collectors.toList());
+    return ret;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception> {
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Throws TimeoutException, so the
+   * caller has a chance to recover from a timeout.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws TimeoutException when the operation times out
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
+    Callable<R> task = () -> op.apply(in);
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, TimeUnit.SECONDS);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isLoopbackAddress()) {
+          continue;
+        }
+        if (inetAddress.isSiteLocalAddress() || inetAddress.isLinkLocalAddress()) {
+          //FIXME we don't know which local address can be reached; maybe check with InetAddress.isReachable?
+          localAddr = host;
+        } else {
+          return Optional.of(host);
+        }
+      } catch (UnknownHostException e) {
+        continue;
+      }
+    }
+
+    return Optional.ofNullable(localAddr);
+  }
+
+  public Optional<String> getPeerDrillHostname(Multihash peerId) {
+    return getPeerData(peerId, "drill-hostname").map(Strings::fromByteArray);
+  }
+
+  public boolean isDrillReady(Multihash peerId) {
+    try {
+      return getPeerData(peerId, "drill-ready").isPresent();
+    } catch (RuntimeException e) {
+      return false;
+    }
+  }
+
+  public Optional<Multihash> getIPNSDataHash(Multihash peerId) {
+    Optional<List<MerkleNode>> links = getPeerLinks(peerId);
+    if (!links.isPresent()) {
+      return Optional.empty();
+    }
+
+    return links.get().stream()
+        .filter(l -> l.name.equals(Optional.of("drill-data")))
+        .findFirst()
+        .map(l -> l.hash);
+  }
+
+
+  private Optional<byte[]> getPeerData(Multihash peerId, String key) {
+    Optional<List<MerkleNode>> links = getPeerLinks(peerId);
+    if (!links.isPresent()) {
+      return Optional.empty();
+    }
+
+    return links.get().stream()
+        .filter(l -> l.name.equals(Optional.of(key)))
+        .findFirst()
+        .map(l -> {
+          try {
+            return client.object.data(l.hash);
+          } catch (IOException e) {
+            return null;
+          }
+        });
+  }
+
+  private Optional<List<MerkleNode>> getPeerLinks(Multihash peerId) {
+    try {
+      Optional<String> optionalPath = client.name.resolve(peerId, 30);
+      if (!optionalPath.isPresent()) {
+        return Optional.empty();
+      }
+      String path = optionalPath.get().substring(6); // path starts with /ipfs/Qm...
+
+      List<MerkleNode> links = client.object.get(Multihash.fromBase58(path)).links;
+      if (links.size() < 1) {

Review comment:
       Changed in ca71f95.
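
   A caller-side sketch of the recoverable timed() variant documented above; the hash variable and the 10-second budget are illustrative assumptions:

      import io.ipfs.api.IPFS;
      import java.io.IOException;
      import java.util.concurrent.TimeoutException;

      // Hypothetical sketch: fetch object data with a 10-second cap and
      // recover from a timeout instead of failing the whole query.
      byte[] data;
      try {
        data = helper.timed((IPFS ipfs) -> ipfs.object.data(hash), helper.getClient(), 10);
      } catch (TimeoutException e) {
        data = null; // recoverable: skip this object or try another provider
      } catch (IOException e) {
        data = null;
      }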







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470035068



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private IPFSContext ipfsContext;
+  private IPFSScanSpec ipfsScanSpec;
+  private IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) throws IOException, ExecutionSetupException {
+    this(
+        ((IPFSStoragePlugin) pluginRegistry.getPlugin(ipfsStoragePluginConfig)).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.isEmpty() ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)

Review comment:
       Fixed in d7ee62c.
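
   A small aside on the endpointWorksMap bookkeeping in IPFSGroupScan.init() (quoted earlier in the thread): the containsKey/put sequence there can be collapsed into one behavior-preserving call, sketched as:

      // Equivalent grouping with Map.computeIfAbsent.
      endpointWorksMap
          .computeIfAbsent(ep.getAddress(), k -> Lists.newArrayList())
          .add(work);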







[GitHub] [drill] cgivre commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-646111328


   @dbw9580 
   Why don't you break out CSV and the writer support as separate PRs?  Then we can get this in and work on those for the next release.
   
   I've never done a writer, but I can assist with the CSV reader.   Take a look here:
   https://github.com/apache/drill/blob/master/contrib/storage-http/src/main/java/org/apache/drill/exec/store/http/HttpCSVBatchReader.java
   
   The HTTP storage plugin can read either `JSON` or `CSV`.  I attempted to use Drill's built-in CSV reader, but I would have had to do a lot of work on it to get it working... so I just used this simple version.





[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r467971039



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for the java-ipfs-http-client library.
+ *
+ * Supports IPFS daemons up to v0.4.23; v0.5+ restricts all API calls to the POST method, which this library version does not use.
+ * Upstream issue tracker: https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157
+ */

Review comment:
       > Drill can't query IPFS version > 0.4.2 due to library restrictions. We can't simply upgrade the library because it requires Java 11 and Drill is built on Java 8. Is that correct?
   
   Yes, `java-ipfs-http-client` v1.2.3, the last version which requires Java 8, supports IPFS up to version 0.4.23, the last release before version 0.5 which introduced the incompatibility in https://github.com/ipfs-shipyard/java-ipfs-http-client/issues/157. The latest library version v.1.3.2 supports IPFS v0.5+ but requires Java 11.
   
   >How criticial would you say this is for functionality?
   
   I'm not sure how many IPFS users have upgraded to v0.5+, but users can [downgrade to a previous version of IPFS](https://github.com/ipfs/ipfs-update#revert) if they want to run Drill with IPFS support for the time being. Newer IPFS versions bring performance improvements, which could help Drill run queries faster, but the basic functionality is the same.
   
   > Is there some workaround possible so that Drill will work with the latest IPFS version?
   
   At first glance, the `java-ipfs-http-client` lib seems to use Java 11 features only in its tests. We could fork the library, revert the Java target version to 8, and skip those tests. I need to investigate this further to see whether it's really a solution.







[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443752670



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static final ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);

Review comment:
       Refactored in ca71f95.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470091789



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSScanSpec.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+@JsonTypeName("IPFSScanSpec")
+public class IPFSScanSpec {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSScanSpec.class);
+
+  public enum Prefix {
+    @JsonProperty("ipfs")
+    IPFS("ipfs"),
+    @JsonProperty("ipns")
+    IPNS("ipns");
+
+    @JsonProperty("prefix")
+    private final String name;
+    Prefix(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {
+      return this.name;
+    }
+
+    @JsonCreator
+    public static Prefix of(String what) {
+      switch (what) {
+        case "ipfs" :
+          return IPFS;
+        case "ipns":
+          return IPNS;
+        default:
+          throw new InvalidParameterException("Unsupported prefix: " + what);
+      }
+    }
+  }
+
+  public enum Format {
+    @JsonProperty("json")
+    JSON("json"),
+    @JsonProperty("csv")
+    CSV("csv");
+
+    @JsonProperty("format")
+    private final String name;
+    Format(String prefix) {
+      this.name = prefix;
+    }
+
+    @Override
+    public String toString() {
+      return this.name;
+    }
+
+    @JsonCreator
+    public static Format of(String what) {
+      switch (what) {
+        case "json" :
+          return JSON;
+        case "csv":
+          return CSV;
+        default:
+          throw new InvalidParameterException("Unsupported format: " + what);
+      }
+    }
+  }
+
+  public static final Set<String> formats = ImmutableSet.of("json", "csv");
+  private Prefix prefix;
+  private String path;
+  private Format formatExtension;
+  private final IPFSContext ipfsContext;
+
+  @JsonCreator
+  public IPFSScanSpec (@JacksonInject StoragePluginRegistry registry,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("prefix") Prefix prefix,
+                       @JsonProperty("format") Format format,
+                       @JsonProperty("path") String path) {
+    this.ipfsContext = registry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext();
+    this.prefix = prefix;
+    this.formatExtension = format;
+    this.path = path;
+  }
+
+  public IPFSScanSpec (IPFSContext ipfsContext, String path) {
+    this.ipfsContext = ipfsContext;
+    parsePath(path);
+  }
+
+  private void parsePath(String path) {
+    //FIXME: IPFS hashes are actually Base58 encoded, so "0" "O" "I" "l" are not valid
+    //also CIDs can be encoded with different encodings, not necessarily Base58
+    Pattern tableNamePattern = Pattern.compile("^/(ipfs|ipns)/([A-Za-z0-9]{46}(/[^#]+)*)(?:#(\\w+))?$");
+    Matcher matcher = tableNamePattern.matcher(path);
+    if (!matcher.matches()) {
+      throw UserException.validationError().message("Invalid IPFS path in query string. Use paths of pattern `/scheme/hashpath#format`, where scheme:= \"ipfs\"|\"ipns\", hashpath:= HASH [\"/\" path], HASH is IPFS Base58 encoded hash, path:= TEXT [\"/\" path], format:= \"json\"|\"csv\"").build(logger);

Review comment:
       Please break this up into multiple lines.  Here and elsewhere.
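       For example (a sketch based on the quoted code; the message text itself is unchanged):

       throw UserException.validationError()
           .message("Invalid IPFS path in query string. Use paths of pattern "
               + "`/scheme/hashpath#format`, where scheme:= \"ipfs\"|\"ipns\", "
               + "hashpath:= HASH [\"/\" path], HASH is IPFS Base58 encoded hash, "
               + "path:= TEXT [\"/\" path], format:= \"json\"|\"csv\"")
           .build(logger);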

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+import org.apache.drill.shaded.guava.com.google.common.base.Objects;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Maps;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    public static final String NAME = "ipfs";
+
+    @JsonProperty
+    private final String host;
+
+    @JsonProperty
+    private final int port;
+
+    @JsonProperty("max-nodes-per-leaf")
+    private final int maxNodesPerLeaf;
+
+    @JsonProperty("ipfs-timeouts")
+    private final Map<IPFSTimeOut, Integer> ipfsTimeouts;
+
+    @JsonIgnore
+    private static final Map<IPFSTimeOut, Integer> ipfsTimeoutDefaults = ImmutableMap.of(
+        IPFSTimeOut.FIND_PROV, 4,
+        IPFSTimeOut.FIND_PEER_INFO, 4,
+        IPFSTimeOut.FETCH_DATA, 6
+    );
+
+    public enum IPFSTimeOut {
+        @JsonProperty("find-provider")
+        FIND_PROV("find-provider"),
+        @JsonProperty("find-peer-info")
+        FIND_PEER_INFO("find-peer-info"),
+        @JsonProperty("fetch-data")
+        FETCH_DATA("fetch-data");
+
+        @JsonProperty("type")
+        private final String which;
+        IPFSTimeOut(String which) {
+            this.which = which;
+        }
+
+        @JsonCreator
+        public static IPFSTimeOut of(String which) {
+            switch (which) {
+                case "find-provider":
+                    return FIND_PROV;
+                case "find-peer-info":
+                    return FIND_PEER_INFO;
+                case "fetch-data":
+                    return FETCH_DATA;
+                default:
+                    throw new InvalidParameterException("Unknown key for IPFS timeout config entry: " + which);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return this.which;
+        }
+    }
+
+    @JsonProperty("groupscan-worker-threads")
+    private final int numWorkerThreads;
+
+    @JsonProperty
+    private final Map<String, FormatPluginConfig> formats;
+
+    @JsonCreator
+    public IPFSStoragePluginConfig(
+        @JsonProperty("host") String host,
+        @JsonProperty("port") int port,
+        @JsonProperty("max-nodes-per-leaf") int maxNodesPerLeaf,
+        @JsonProperty("ipfs-timeouts") Map<IPFSTimeOut, Integer> ipfsTimeouts,
+        @JsonProperty("groupscan-worker-threads") int numWorkerThreads,
+        @JsonProperty("formats") Map<String, FormatPluginConfig> formats) {
+        this.host = host;
+        this.port = port;
+        this.maxNodesPerLeaf = maxNodesPerLeaf > 0 ? maxNodesPerLeaf : 1;
+        if (ipfsTimeouts != null) {
+            this.ipfsTimeouts = Maps.newHashMap();
+            ipfsTimeouts.forEach(this.ipfsTimeouts::put);
+            ipfsTimeoutDefaults.forEach(this.ipfsTimeouts::putIfAbsent);
+        } else {
+            this.ipfsTimeouts = ipfsTimeoutDefaults;
+        }
+        this.numWorkerThreads = numWorkerThreads > 0 ? numWorkerThreads : 1;
+        this.formats = formats;
+    }
+
+    public String getHost() {
+        return host;
+    }
+
+    public int getPort() {
+        return port;
+    }
+
+    public int getMaxNodesPerLeaf() {
+        return maxNodesPerLeaf;
+    }
+
+    public int getIpfsTimeout(IPFSTimeOut which) {
+        return ipfsTimeouts.get(which);
+    }
+
+    public Map<IPFSTimeOut, Integer> getIpfsTimeouts() {
+        return ipfsTimeouts;
+    }
+
+    public int getNumWorkerThreads() {
+        return numWorkerThreads;
+    }
+
+    public Map<String, FormatPluginConfig> getFormats() {
+        return formats;
+    }
+
+    @Override
+    public int hashCode() {

Review comment:
       Is there a reason for writing this method this way?  I'd suggest writing the `hashCode()` method as shown below, unless there is a compelling reason not to. 
   
   https://github.com/apache/drill/blob/0726b83d9347cbb8bd1bc64a8d10c12c1125549a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelFormatConfig.java#L106-L110
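   
   Something along these lines, assuming `java.util.Objects` (note the quoted class currently imports the shaded Guava `Objects` instead):
   
       @Override
       public int hashCode() {
         return Objects.hash(host, port, maxNodesPerLeaf, ipfsTimeouts, numWorkerThreads, formats);
       }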




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-673976246


   @cgivre Does Drill support connections over IPv6 sockets? Is it enabled by default, or do I have to enable it through configuration? The "protocol family unavailable" error could be due to a lack of IPv6 support.
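   
   In case it is just the JVM preferring IPv4, one thing I may try (an assumption on my part, not a confirmed fix) is forcing the IPv6 stack through the standard JVM networking properties, e.g. in the Drillbit JVM options (the exact variable in drill-env.sh may differ):
   
       export DRILL_JAVA_OPTS="$DRILL_JAVA_OPTS -Djava.net.preferIPv4Stack=false -Djava.net.preferIPv6Addresses=true"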


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432972374



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static final ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);
+    this.client = ipfs;
+  }
+
+  public void setExecutorService(ExecutorService executorService) {
+    this.executorService = new DefaultWeakReference<>(executorService, DEFAULT_EXECUTOR);
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) throws IOException {
+    List<String> providers;
+    providers = client.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService.get());
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) throws IOException {
+    // trying to resolve the addresses of the node itself will always hang,
+    // so we treat it specially
+    if (peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = client.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService.get());
+    List<MultiAddress> ret = addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new)
+        .collect(Collectors.toList());
+    return ret;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception> {
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation {@code op} within {@code timeout} seconds. Throws TimeoutException, so the
+   * caller has a chance to recover from a timeout.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws TimeoutException
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
+    Callable<R> task = () -> op.apply(in);
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, TimeUnit.SECONDS);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  /**
+   * Execute a time-critical operation {@code op} within {@code timeout} seconds. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();
+    } catch (TimeoutException e) {
+      throw UserException.executionError(e).message("IPFS operation timed out").build(logger);
+    } catch (CancellationException | InterruptedException e) {
+      throw UserException.executionError(e).build(logger);
+    }
+  }
+
+  public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
+    String localAddr = null;
+    for (MultiAddress addr : peerAddrs) {
+      String host = addr.getHost();
+      try {
+        InetAddress inetAddress = InetAddress.getByName(host);
+        if (inetAddress.isLoopbackAddress()) {

Review comment:
       Unnecessary block
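   
       For instance, the whole method could be a stream pipeline, which avoids the nested blocks entirely. A rough sketch (the fallback handling is omitted since the rest of the method is not quoted here):
   
       public static Optional<String> pickPeerHost(List<MultiAddress> peerAddrs) {
         return peerAddrs.stream()
             .map(MultiAddress::getHost)
             .filter(host -> {
               try {
                 // keep only non-loopback addresses
                 return !InetAddress.getByName(host).isLoopbackAddress();
               } catch (UnknownHostException e) {
                 return false; // skip unresolvable hosts
               }
             })
             .findFirst();
       }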




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470673620



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,463 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  private static final int DEFAULT_USER_PORT = 31010;
+  private static final int DEFAULT_CONTROL_PORT = 31011;
+  private static final int DEFAULT_DATA_PORT = 31012;
+  private static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0 ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);
+        }
+
+        IPFSWork work = new IPFSWork(leaf.toBase58());
+        logger.debug("added endpoint {} to work {}", ep.getAddress(), work);
+        work.getByteMap().add(ep, DEFAULT_NODE_SIZE);
+        work.setOnEndpoint(ep);
+
+        if (endpointWorksMap.containsKey(ep.getAddress())) {
+          endpointWorksMap.get(ep.getAddress()).add(work);
+        } else {
+          List<IPFSWork> ipfsWorks = Lists.newArrayList();
+          ipfsWorks.add(work);
+          endpointWorksMap.put(ep.getAddress(), ipfsWorks);
+        }
+        ipfsWorkList.add(work);
+      }
+    } catch (Exception e) {
+      logger.debug("exception in init");
+      throw new RuntimeException(e);

Review comment:
       Fixed in d2ea637.
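   
       Roughly, the idea is to replace the blanket `RuntimeException` with a `UserException` (a sketch; the message text here is illustrative, not the exact commit):
   
       } catch (Exception e) {
         throw UserException.executionError(e)
             .message("Failed to initialize IPFS group scan")
             .build(logger);
       }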




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443907953



##########
File path: common/src/test/java/org/apache/drill/categories/IPFSStorageTest.java
##########
@@ -0,0 +1,8 @@
+package org.apache.drill.categories;
+
+/**
+ * This is a category used to mark unit tests that test the Hive storage plugin.

Review comment:
       Please fix or remove comment.  
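   
       i.e. presumably just:
   
       /**
        * This is a category used to mark unit tests that test the IPFS storage plugin.
        */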




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470673442



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,463 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  private static final int DEFAULT_USER_PORT = 31010;
+  private static final int DEFAULT_CONTROL_PORT = 31011;
+  private static final int DEFAULT_DATA_PORT = 31012;
+  private static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0 ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?

Review comment:
       This is now DRILL-7777.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-670308917


   @dbw9580 
   I'll take a look over the weekend.  Thanks for the contribution!


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452320526



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSStoragePluginConfig.java
##########
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.common.logical.StoragePluginConfigBase;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.security.InvalidParameterException;
+import java.util.Map;
+
+@JsonTypeName(IPFSStoragePluginConfig.NAME)
+public class IPFSStoragePluginConfig extends StoragePluginConfigBase {
+    private static final Logger logger = LoggerFactory.getLogger(IPFSStoragePluginConfig.class);

Review comment:
       Fixed in 48d2058.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r471575915



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0 ? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       @dbw9580, could you please explain why it is required to register a Drillbit endpoint here? It is prohibited to do this anywhere except while the Drillbit is starting. Once the endpoint is registered, it may be misused when executing other queries. Also, the same node may run several group scans, so this case will fail because the required ports will already be in use.
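   
       A sketch of the safer alternative: only assign work to Drillbits that are already registered, instead of registering phantom endpoints for IPFS peers (`pickAnyRegisteredEndpoint` is a hypothetical helper, not an existing API):
   
       DrillbitEndpoint ep = coordinator.getAvailableEndpoints()
           .stream()
           .filter(e -> e.getAddress().equals(peerHostname))
           .findAny()
           // fall back to any registered Drillbit for data that no Drillbit-colocated peer provides
           .orElseGet(() -> pickAnyRegisteredEndpoint(coordinator));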




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] sanel commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
sanel commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r432972326



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.bouncycastle.util.Strings;
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+
+
+public class IPFSHelper {
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private WeakReference<ExecutorService> executorService;
+  private static final ExecutorService DEFAULT_EXECUTOR = Executors.newSingleThreadExecutor();
+  private IPFS client;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  class DefaultWeakReference<T> extends WeakReference<T> {
+    private T default_;
+    public DefaultWeakReference(T referent, T default_) {
+      super(referent);
+      this.default_ = default_;
+    }
+
+    @Override
+    public T get() {
+      T ret = super.get();
+      if (ret == null) {
+        return default_;
+      } else {
+        return ret;
+      }
+    }
+  }
+
+  public IPFSHelper(IPFS ipfs) {
+    executorService = new DefaultWeakReference<>(DEFAULT_EXECUTOR, DEFAULT_EXECUTOR);
+    this.client = ipfs;
+  }
+
+  public void setExecutorService(ExecutorService executorService) {
+    this.executorService = new DefaultWeakReference<>(executorService, DEFAULT_EXECUTOR);
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) throws IOException {
+    List<String> providers;
+    providers = client.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService.get());
+
+    List<Multihash> ret = providers.stream().map(str -> Multihash.fromBase58(str)).collect(Collectors.toList());
+    return ret;
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) throws IOException {
+    // trying to resolve the addresses of the node itself will always hang,
+    // so we treat it specially
+    if (peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = client.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService.get());
+    List<MultiAddress> ret = addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new)
+        .collect(Collectors.toList());
+    return ret;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception> {
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation {@code op} within {@code timeout} seconds. Throws TimeoutException, so the
+   * caller has a chance to recover from a timeout.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T>
+   * @param <R>
+   * @param <E>
+   * @return R the result of the operation
+   * @throws TimeoutException
+   * @throws E
+   */
+  public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
+    Callable<R> task = () -> op.apply(in);
+    Future<R> res = executorService.get().submit(task);
+    try {
+      return res.get(timeout, TimeUnit.SECONDS);

Review comment:
       similar code to `timedFailure()`
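   
       i.e. both methods could delegate to a single private helper and differ only in how `TimeoutException` is surfaced. A sketch (`getWithTimeout` is a hypothetical shared helper that unwraps `ExecutionException`, like the existing private `timedFailure` overload):
   
       public <T, R, E extends Exception> R timed(ThrowingFunction<T, R, E> op, T in, int timeout) throws TimeoutException, E {
         Future<R> res = executorService.get().submit(() -> op.apply(in));
         return getWithTimeout(res, timeout, TimeUnit.SECONDS);
       }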




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-647863424


   > some tests that do not require the IPFS daemon
   
   I guess the storage plugin config, scanSpec, etc. are static and independent of a running IPFS daemon. Also, for the json reader, I think I can supply test data from memory directly, without actually retrieving it from IPFS.
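   
   Something like this, for instance (a hypothetical unit test; the getter names are illustrative):
   
       @Test
       public void testScanSpecParsesPathAndFormat() {
         IPFSScanSpec spec = new IPFSScanSpec(ipfsContext,
             "/ipfs/QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n#json");
         assertEquals(IPFSScanSpec.Prefix.IPFS, spec.getPrefix());
         assertEquals(IPFSScanSpec.Format.JSON, spec.getFormatExtension());
       }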
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-646834496


   @dbw9580 
   I had another thought about the design.  I wonder if, instead of "rolling your own" readers, you could do what the file plugin does and make use of all the existing format plugins.  That way, any new file format for which there is a format plugin could also be read via IPFS.  I don't know IPFS well enough to say whether that would even make sense.
   
   That also might simplify the code a lot. 
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-674541877


   Hi @dbw9580 
   Thanks for these updates.  I didn't have any issues running your unit tests before this.  However, I took a look at the Maven docs, and I'm wondering if you can specify the number of forks directly in the `pom.xml` file. [1]
   
   [1]: https://maven.apache.org/surefire/maven-surefire-plugin/examples/fork-options-and-parallel-execution.html
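   
   e.g. something like this in the plugin's `pom.xml` (per the Surefire docs above):
   
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-surefire-plugin</artifactId>
         <configuration>
           <forkCount>1</forkCount>
           <reuseForks>true</reuseForks>
         </configuration>
       </plugin>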
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: [WIP] DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443756373



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSJSONRecordReader.java
##########
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.core.JsonParseException;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.ExecConstants;
+import org.apache.drill.exec.ops.FragmentContext;
+import org.apache.drill.exec.ops.OperatorContext;
+import org.apache.drill.exec.physical.impl.OutputMutator;
+import org.apache.drill.exec.store.AbstractRecordReader;
+import org.apache.drill.exec.store.easy.json.JsonProcessor;
+import org.apache.drill.exec.store.easy.json.reader.CountingJsonReader;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.exec.vector.BaseValueVector;
+import org.apache.drill.exec.vector.complex.fn.JsonReader;
+import org.apache.drill.exec.vector.complex.impl.VectorContainerWriter;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+public class IPFSJSONRecordReader extends AbstractRecordReader {
+  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(IPFSJSONRecordReader.class);
+
+  public static final long DEFAULT_ROWS_PER_BATCH = BaseValueVector.INITIAL_VALUE_ALLOCATION;
+
+  private FragmentContext fragmentContext;
+  private IPFSContext ipfsContext;
+  private String subScanSpec;
+  private List<SchemaPath> columnList;
+  private JsonProcessor jsonReader;
+  private InputStream stream;
+  private int recordCount;
+  private long runningRecordCount = 0;
+  private final boolean enableAllTextMode;
+  private final boolean enableNanInf;
+  private final boolean enableEscapeAnyChar;
+  private final boolean readNumbersAsDouble;
+  private final boolean unionEnabled;
+  private long parseErrorCount;
+  private final boolean skipMalformedJSONRecords;
+  private final boolean printSkippedMalformedJSONRecordLineNumber;
+  private JsonProcessor.ReadState write = null;
+  private VectorContainerWriter writer;
+
+  public IPFSJSONRecordReader(FragmentContext fragmentContext, IPFSContext ipfsContext, String scanSpec, List<SchemaPath> columns) {
+    this.fragmentContext = fragmentContext;
+    this.ipfsContext = ipfsContext;
+    this.subScanSpec = scanSpec;
+    this.columnList = columns;
+    setColumns(columns);
+    // only enable all text mode if we aren't using embedded content mode.
+    this.enableAllTextMode = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ALL_TEXT_MODE_VALIDATOR);
+    this.enableEscapeAnyChar = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ESCAPE_ANY_CHAR_VALIDATOR);
+    this.enableNanInf = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_NAN_INF_NUMBERS_VALIDATOR);
+    this.readNumbersAsDouble = fragmentContext.getOptions().getOption(ExecConstants.JSON_READ_NUMBERS_AS_DOUBLE_VALIDATOR);
+    this.unionEnabled = fragmentContext.getOptions().getBoolean(ExecConstants.ENABLE_UNION_TYPE_KEY);
+    this.skipMalformedJSONRecords = fragmentContext.getOptions().getOption(ExecConstants.JSON_SKIP_MALFORMED_RECORDS_VALIDATOR);
+    this.printSkippedMalformedJSONRecordLineNumber = fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_PRINT_INVALID_RECORDS_LINE_NOS_FLAG_VALIDATOR);
+
+  }
+
+  @Override
+  public String toString() {
+    return super.toString()
+        + ", recordCount = " + recordCount
+        + ", parseErrorCount = " + parseErrorCount
+        + ", runningRecordCount = " + runningRecordCount + ", ...]";
+  }
+
+  @Override
+  public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {

Review comment:
       Refactored in 97b4a7d

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSPeer.java
##########
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Optional;
+
+public class IPFSPeer {
+  private IPFSHelper helper;
+
+  private Multihash id;
+  private List<MultiAddress> addrs;
+  private boolean isDrillReady;
+  private boolean isDrillReadyChecked = false;
+  private Optional<String> drillbitAddress = Optional.empty();
+  private boolean drillbitAddressChecked = false;
+
+
+  public IPFSPeer(IPFSHelper helper, Multihash id) {
+    this.helper = helper;
+    this.id = id;
+  }
+
+  IPFSPeer(IPFSHelper helper, Multihash id, List<MultiAddress> addrs) {
+    this.helper = helper;
+    this.id = id;
+    this.addrs = addrs;
+    this.isDrillReady = helper.isDrillReady(id);
+    this.isDrillReadyChecked = true;
+    this.drillbitAddress = IPFSHelper.pickPeerHost(addrs);
+    this.drillbitAddressChecked = true;
+  }
+
+  public boolean isDrillReady() {
+    if (!isDrillReadyChecked) {
+      isDrillReady = helper.isDrillReady(id);
+      isDrillReadyChecked = true;
+    }
+    return isDrillReady;
+  }
+
+  public boolean hasDrillbitAddress() {

Review comment:
       Fixed in 160a909.

##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSPeer.java
##########
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Optional;
+
+public class IPFSPeer {
+  private IPFSHelper helper;
+
+  private Multihash id;
+  private List<MultiAddress> addrs;
+  private boolean isDrillReady;
+  private boolean isDrillReadyChecked = false;
+  private Optional<String> drillbitAddress = Optional.empty();
+  private boolean drillbitAddressChecked = false;
+
+
+  public IPFSPeer(IPFSHelper helper, Multihash id) {
+    this.helper = helper;
+    this.id = id;
+  }
+
+  IPFSPeer(IPFSHelper helper, Multihash id, List<MultiAddress> addrs) {
+    this.helper = helper;
+    this.id = id;
+    this.addrs = addrs;
+    this.isDrillReady = helper.isDrillReady(id);
+    this.isDrillReadyChecked = true;
+    this.drillbitAddress = IPFSHelper.pickPeerHost(addrs);
+    this.drillbitAddressChecked = true;
+  }
+
+  public boolean isDrillReady() {
+    if (!isDrillReadyChecked) {
+      isDrillReady = helper.isDrillReady(id);
+      isDrillReadyChecked = true;
+    }
+    return isDrillReady;
+  }
+
+  public boolean hasDrillbitAddress() {
+    findDrillbitAddress();
+    return drillbitAddress.isPresent();
+  }
+
+  public Optional<String> getDrillbitAddress() {
+    findDrillbitAddress();
+    return drillbitAddress;
+  }
+
+  public List<MultiAddress> getMultiAddresses() {
+    findDrillbitAddress();
+    return addrs;
+  }
+
+  public Multihash getId() {
+    return id;
+  }
+
+
+  private void findDrillbitAddress() {
+    if (!drillbitAddressChecked) {

Review comment:
       Changed in 160a909. The `IOException` is no longer relevant since `IPFSCompat` is refactored.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470083089



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,463 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  private static final int DEFAULT_USER_PORT = 31010;
+  private static final int DEFAULT_CONTROL_PORT = 31011;
+  private static final int DEFAULT_DATA_PORT = 31012;
+  private static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private Map<String, List<IPFSWork>> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = new HashMap<>();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //TODO how to safely remove endpoints that are no longer needed once the query is completed?

Review comment:
       @dbw9580 
   Please remove `TODO`s or put a link to a JIRA.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470673138



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSCompat.java
##########
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.JSONParser;
+import io.ipfs.multihash.Multihash;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+/**
+ * Compatibility fixes for java-ipfs-http-client library
+ */
+public class IPFSCompat {
+  public final String host;
+  public final int port;
+  private final String version;
+  public final String protocol;
+  public final int readTimeout;
+  public static final int DEFAULT_READ_TIMEOUT = 0;
+
+  public final DHT dht = new DHT();
+  public final Name name = new Name();
+
+  public IPFSCompat(IPFS ipfs) {
+    this(ipfs.host, ipfs.port);
+  }
+
+  public IPFSCompat(String host, int port) {
+    this(host, port, "/api/v0/", false, DEFAULT_READ_TIMEOUT);
+  }
+
+  public IPFSCompat(String host, int port, String version, boolean ssl, int readTimeout) {
+    this.host = host;
+    this.port = port;
+
+    if(ssl) {
+      this.protocol = "https";
+    } else {
+      this.protocol = "http";
+    }
+
+    this.version = version;
+    this.readTimeout = readTimeout;
+  }
+
+  /**
+   * Resolve names to IPFS CIDs.
+   * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-resolve">resolve in IPFS doc</a>.
+   * @param scheme the scheme of the name to resolve, usually IPFS or IPNS
+   * @param path the path to the object
+   * @param recursive whether to recursively resolve names until the result is an IPFS CID
+   * @return a Map of the JSON response, with the result as the value of the key "Path"
+   */
+  public Map resolve(String scheme, String path, boolean recursive) {
+    AtomicReference<Map> ret = new AtomicReference<>();
+    getObjectStream(
+        "resolve?arg=/" + scheme+"/"+path +"&r="+recursive,
+        res -> {
+          ret.set((Map) res);
+          return true;
+        },
+        err -> {
+          throw new RuntimeException(err);
+        }
+    );
+    return ret.get();
+  }
+
+  /**
+   * As defined in https://github.com/libp2p/go-libp2p-core/blob/b77fd280f2bfcce22f10a000e8e1d9ec53c47049/routing/query.go#L16
+   */
+  public enum DHTQueryEventType {
+    // Sending a query to a peer.
+    SendingQuery,
+    // Got a response from a peer.
+    PeerResponse,
+    // Found a "closest" peer (not currently used).
+    FinalPeer,
+    // Got an error when querying.
+    QueryError,
+    // Found a provider.
+    Provider,
+    // Found a value.
+    Value,
+    // Adding a peer to the query.
+    AddingPeer,
+    // Dialing a peer.
+    DialingPeer;
+  }
+
+  public class DHT {
+    /**
+     * Find internet addresses of a given peer.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-dht-findpeer">dht/findpeer in IPFS doc</a>.
+     * @param id the id of the peer to query
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return list of multiaddresses of the peer, as strings
+     */
+    public List<String> findpeerListTimeout(Multihash id, int timeout, ExecutorService executor) {
+      AtomicReference<List<String>> ret = new AtomicReference<>(new ArrayList<>());
+      timeLimitedExec(
+          "dht/findpeer?arg=" + id,
+          timeout,
+          res -> {
+            Map peer = (Map) res;
+            if (peer == null) {
+              return false;
+            }
+            if ( (int) peer.get("Type") != DHTQueryEventType.FinalPeer.ordinal() ) {
+              return false;
+            }
+            List<Map> responses = (List<Map>) peer.get("Responses");
+            if (responses == null || responses.size() == 0) {
+              return false;
+            }
+            // FinalPeer responses have exactly one response
+            Map<String, List<String>> response = responses.get(0);
+            if (response == null) {
+              return false;
+            }
+            List<String> addrs = response.get("Addrs");
+
+            ret.set(addrs);
+            return true;
+          },
+          err -> {
+            if (!(err instanceof TimeoutException)) {
+              throw new RuntimeException(err);
+            }
+          },
+          executor
+      );
+      if (ret.get().size() > 0) {
+        return ret.get();
+      } else {
+        return Collections.emptyList();
+      }
+    }
+
+    /**
+     * Find providers of a given CID.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-dht-findprovs">dht/findprovs in IPFS doc</a>.
+     * @param id the CID of the IPFS object
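+     * @param maxPeers the maximum number of providers to search for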
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return list of provider peer IDs, as base58 strings
+     */
+    public List<String> findprovsListTimeout(Multihash id, int maxPeers, int timeout, ExecutorService executor) {
+      AtomicReference<List<String>> ret = new AtomicReference<>(new ArrayList<>());
+      timeLimitedExec(
+          "dht/findprovs?arg=" + id + "&n=" + maxPeers,
+          timeout,
+          res -> {
+            Map peer = (Map) res;
+            if ( peer == null ) {
+              return false;
+            }
+            if ( (int) peer.get("Type") != DHTQueryEventType.Provider.ordinal() ) {
+              return false;
+            }
+            List<Map> responses = (List<Map>) peer.get("Responses");
+            if (responses == null || responses.size() == 0) {
+              return false;
+            }
+            // One Provider message contains only one provider
+            Map<String, String> response = responses.get(0);
+            if (response == null) {
+              return false;
+            }
+            String providerID = response.get("ID");
+
+            ret.get().add(providerID);
+            return ret.get().size() >= maxPeers;
+          },
+          err -> {
+            if (!(err instanceof TimeoutException)) {
+              throw new RuntimeException(err);
+            }
+          },
+          executor
+      );
+      if (ret.get().size() > 0) {
+        return ret.get();
+      } else {
+        return Collections.emptyList();
+      }
+    }
+  }
+
+  public class Name {
+    /**
+     * Resolve an IPNS name.
+     * See <a href="https://docs.ipfs.io/reference/http/api/#api-v0-name-resolve">name/resolve in IPFS doc</a>.
+     * @param hash the IPNS name to resolve
+     * @param timeout timeout value in seconds
+     * @param executor executor
+     * @return the resolved path, if any
+     */
+    public Optional<String> resolve(Multihash hash, int timeout, ExecutorService executor) {
+      AtomicReference<String> ret = new AtomicReference<>();
+      timeLimitedExec(
+        "name/resolve?arg=" + hash,
+        timeout,
+        res -> {
+          Map peer = (Map) res;
+          if (peer != null) {
+            ret.set((String) peer.get(("Path")));
+            return true;
+          }
+          return false;
+        },
+        err -> {
+          if (!(err instanceof TimeoutException)) {
+            throw new RuntimeException(err);
+          }
+        },
+        executor
+      );
+      return Optional.ofNullable(ret.get());
+    }
+  }
+
+  private void timeLimitedExec(String path, int timeout, Predicate<Object> processor, Consumer<Exception> error,
+                               ExecutorService executor) {
+    CompletableFuture<Void> f = CompletableFuture.runAsync(
+      ()-> getObjectStream(path, processor, error),
+      executor
+    );
+    try {
+      f.get(timeout, TimeUnit.SECONDS);
+    } catch (TimeoutException | ExecutionException | InterruptedException e) {
+      f.cancel(true);
+      error.accept(e);
+    }
+  }
+
+  private void getObjectStream(String path, Predicate<Object> processor, Consumer<Exception> error) {
+    byte LINE_FEED = (byte)10;
+
+    try {
+      InputStream in = getStream(path);
+      ByteArrayOutputStream resp = new ByteArrayOutputStream();
+
+      byte[] buf = new byte[4096];
+      int r;
+      while ((r = in.read(buf)) >= 0) {
+        resp.write(buf, 0, r);
+        if (buf[r - 1] == LINE_FEED) {
+          try {
+            boolean done = processor.test(JSONParser.parse(resp.toString()));
+            if (done) {
+              break;
+            }
+            resp.reset();
+          } catch (IllegalStateException e) {
+            in.close();
+            resp.close();
+            error.accept(e);
+          }
+        }
+      }
+      in.close();
+      resp.close();
+    } catch (IOException e) {
+      error.accept(e);
+    }
+  }
+
+  private InputStream getStream(String path) throws IOException {
+    URL target = new URL(protocol, host, port, version + path);
+    HttpURLConnection conn = configureConnection(target, "POST", readTimeout);
+    return conn.getInputStream();
+  }
+
+  private static HttpURLConnection configureConnection(URL target, String method, int timeout) throws IOException {

Review comment:
       Fixed in d2ea637.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r473949613



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSGroupScan.java
##########
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.cid.Cid;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.util.DrillVersionInfo;
+import org.apache.drill.exec.coord.ClusterCoordinator;
+import org.apache.drill.exec.physical.EndpointAffinity;
+import org.apache.drill.exec.physical.base.AbstractGroupScan;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.ScanStats;
+import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.exec.store.schedule.AffinityCreator;
+import org.apache.drill.exec.store.schedule.AssignmentCreator;
+import org.apache.drill.exec.store.schedule.CompleteWork;
+import org.apache.drill.exec.store.schedule.EndpointByteMap;
+import org.apache.drill.exec.store.schedule.EndpointByteMapImpl;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.base.Stopwatch;
+import org.apache.drill.shaded.guava.com.google.common.cache.LoadingCache;
+import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap;
+import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+
+@JsonTypeName("ipfs-scan")
+public class IPFSGroupScan extends AbstractGroupScan {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSGroupScan.class);
+  private final IPFSContext ipfsContext;
+  private final IPFSScanSpec ipfsScanSpec;
+  private final IPFSStoragePluginConfig config;
+  private List<SchemaPath> columns;
+
+  private static final long DEFAULT_NODE_SIZE = 1000L;
+  public static final int DEFAULT_USER_PORT = 31010;
+  public static final int DEFAULT_CONTROL_PORT = 31011;
+  public static final int DEFAULT_DATA_PORT = 31012;
+  public static final int DEFAULT_HTTP_PORT = 8047;
+
+  private ListMultimap<Integer, IPFSWork> assignments;
+  private List<IPFSWork> ipfsWorkList = Lists.newArrayList();
+  private ListMultimap<String, IPFSWork> endpointWorksMap;
+  private List<EndpointAffinity> affinities;
+
+  @JsonCreator
+  public IPFSGroupScan(@JsonProperty("IPFSScanSpec") IPFSScanSpec ipfsScanSpec,
+                       @JsonProperty("IPFSStoragePluginConfig") IPFSStoragePluginConfig ipfsStoragePluginConfig,
+                       @JsonProperty("columns") List<SchemaPath> columns,
+                       @JacksonInject StoragePluginRegistry pluginRegistry) {
+    this(
+        pluginRegistry.resolve(ipfsStoragePluginConfig, IPFSStoragePlugin.class).getIPFSContext(),
+        ipfsScanSpec,
+        columns
+    );
+  }
+
+  public IPFSGroupScan(IPFSContext ipfsContext,
+                       IPFSScanSpec ipfsScanSpec,
+                       List<SchemaPath> columns) {
+    super((String) null);
+    this.ipfsContext = ipfsContext;
+    this.ipfsScanSpec = ipfsScanSpec;
+    this.config = ipfsContext.getStoragePluginConfig();
+    logger.debug("GroupScan constructor called with columns {}", columns);
+    this.columns = columns == null || columns.size() == 0? ALL_COLUMNS : columns;
+    init();
+  }
+
+  private void init() {
+    IPFSHelper ipfsHelper = ipfsContext.getIPFSHelper();
+    endpointWorksMap = ArrayListMultimap.create();
+
+    Multihash topHash = ipfsScanSpec.getTargetHash(ipfsHelper);
+    try {
+      Map<Multihash, String> leafAddrMap = getLeafAddrMappings(topHash);
+      logger.debug("Iterating on {} leaves...", leafAddrMap.size());
+      ClusterCoordinator coordinator = ipfsContext.getStoragePlugin().getContext().getClusterCoordinator();
+      for (Multihash leaf : leafAddrMap.keySet()) {
+        String peerHostname = leafAddrMap.get(leaf);
+
+        Optional<DrillbitEndpoint> oep = coordinator.getAvailableEndpoints()
+            .stream()
+            .filter(a -> a.getAddress().equals(peerHostname))
+            .findAny();
+        DrillbitEndpoint ep;
+        if (oep.isPresent()) {
+          ep = oep.get();
+          logger.debug("Using existing endpoint {}", ep.getAddress());
+        } else {
+          logger.debug("created new endpoint on the fly {}", peerHostname);
+          //DRILL-7754: read ports & version info from IPFS instead of hard-coded
+          ep = DrillbitEndpoint.newBuilder()
+              .setAddress(peerHostname)
+              .setUserPort(DEFAULT_USER_PORT)
+              .setControlPort(DEFAULT_CONTROL_PORT)
+              .setDataPort(DEFAULT_DATA_PORT)
+              .setHttpPort(DEFAULT_HTTP_PORT)
+              .setVersion(DrillVersionInfo.getVersion())
+              .setState(DrillbitEndpoint.State.ONLINE)
+              .build();
+          //DRILL-7777: how to safely remove endpoints that are no longer needed once the query is completed?
+          ClusterCoordinator.RegistrationHandle handle = coordinator.register(ep);

Review comment:
       I don't know how to produce a testcase that will cause it to fail. I tested successfully in a two-node cluster with queries that involve data from the HTTP storage plugin, the classpath plugin, and this plugin, combining join, filter, and sort operators and nested subqueries. If @vvysotskyi could provide a testcase that shows these dynamically added endpoints can be a problem, I can look into it and see what solutions we can find.
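       
       The queries I ran look roughly like this (a sketch; the IPFS path and the column names are placeholders, not the exact test queries):
       
       ```sql
       -- Join IPFS data against the classpath plugin, with a filter,
       -- a sort, and a nested subquery in the FROM clause.
       SELECT t.name, e.full_name
       FROM ipfs.`/ipfs/QmExampleHashPlaceholder#json` AS t
       JOIN (SELECT full_name, first_name
             FROM cp.`employee.json`
             WHERE salary > 10000) AS e
         ON t.name = e.first_name
       ORDER BY e.full_name;
       ```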




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446247644



##########
File path: common/src/test/java/org/apache/drill/categories/IPFSStorageTest.java
##########
@@ -0,0 +1,8 @@
+package org.apache.drill.categories;
+
+/**
+ * This is a category used to mark unit tests that test the Hive storage plugin.

Review comment:
       Fixed in 56151c4.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r446665500



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSPeer.java
##########
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Optional;
+
+public class IPFSPeer {
+  private IPFSHelper helper;
+
+  private Multihash id;
+  private List<MultiAddress> addrs;
+  private boolean isDrillReady;
+  private boolean isDrillReadyChecked = false;
+  private Optional<String> drillbitAddress = Optional.empty();
+  private boolean drillbitAddressChecked = false;
+
+
+  public IPFSPeer(IPFSHelper helper, Multihash id) {
+    this.helper = helper;
+    this.id = id;
+  }
+
+  IPFSPeer(IPFSHelper helper, Multihash id, List<MultiAddress> addrs) {
+    this.helper = helper;
+    this.id = id;
+    this.addrs = addrs;
+    this.isDrillReady = helper.isDrillReady(id);
+    this.isDrillReadyChecked = true;
+    this.drillbitAddress = IPFSHelper.pickPeerHost(addrs);
+    this.drillbitAddressChecked = true;
+  }
+
+  public boolean isDrillReady() {
+    if (!isDrillReadyChecked) {
+      isDrillReady = helper.isDrillReady(id);
+      isDrillReadyChecked = true;
+    }
+    return isDrillReady;
+  }
+
+  public boolean hasDrillbitAddress() {

Review comment:
       Changed in 160a909.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] vvysotskyi commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
vvysotskyi commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r444052723



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSContext.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2020 Bowen Ding, Yuedong Xu, Liang Wang

Review comment:
       Neither the current code nor the docs mention their authors, and I don't think that should be changed. As I mentioned above, it would be hard to keep a list of **all** contributors for a specific part of the code. Git history lets you see each specific contribution, but just putting the author's name in the code doesn't show anything related to the changes. What would we do with authorship if the plugin were rewritten from scratch in the future, or refactored so heavily that it differed significantly from its initial version? Using just git history solves this issue, and I don't think we should introduce more complexity.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] cgivre commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
cgivre commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r443805436



##########
File path: contrib/storage-ipfs/README.zh.md
##########
@@ -0,0 +1,184 @@
+# Drill Storage Plugin for IPFS

Review comment:
       @vvysotskyi 
   Can they put a link to the Chinese version somewhere?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r452297400



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.physical.base.AbstractBase;
+import org.apache.drill.exec.physical.base.PhysicalOperator;
+import org.apache.drill.exec.physical.base.PhysicalVisitor;
+import org.apache.drill.exec.physical.base.SubScan;
+import org.apache.drill.exec.store.StoragePluginRegistry;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableSet;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/*import org.apache.drill.common.expression.SchemaPath;*/
+
+@JsonTypeName("ipfs-sub-scan")
+public class IPFSSubScan extends AbstractBase implements SubScan {
+  private static int IPFS_SUB_SCAN_VALUE = 19155;
+  private final IPFSContext ipfsContext;
+  private final List<Multihash> ipfsSubScanSpecList;

Review comment:
       Yes, which variant of `List` it is doesn't really matter, and the rest of the code does not rely on a specific implementation of `List`, either. I made a `LinkedList` instance here and that was a mistake:
   https://github.com/apache/drill/blob/df4a7b2993e6752481d6b35d636f5fef4a20aebf/contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSSubScan.java#L182
   
   Should I change it to `ArrayList<Multihash>`? Using the interface as the declared type seems like the default approach.
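   
   To illustrate what I mean (sketch only):
   
   ```java
   import java.util.ArrayList;
   import java.util.List;
   
   import io.ipfs.multihash.Multihash;
   
   class Example {
     // Declare against the interface; choose the implementation where the list is built.
     List<Multihash> ipfsSubScanSpecList = new ArrayList<>();  // rather than new LinkedList<>()
   }
   ```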




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on pull request #2084:
URL: https://github.com/apache/drill/pull/2084#issuecomment-673615795


   > I'm still having issues actually getting the unit tests that require the IPFS daemon to actually execute.
   
   @cgivre Actually I am having trouble making that test run, too. I keep getting errors like "connection rejected: /127.0.0.1:31011" or "Protocol family unavailable: /0:0:0:0:0:0:0:1:31011". I can test successfully manually through the web ui with drill-embedded, though.
   
   Can you try testing through the web ui, too? The simple dataset should be easy to add to IPFS and test:
   
   ```bash
   ipfs object patch set-data $(ipfs object new) <path-to-simple-dataset.json>
   ```
   
   This will return the hash of the simple dataset, which is `QmcbeavnEofA6NjG7vkpe1yLJo6En6ML4JnDooDn1BbKmR`.
   
   Then run a query through the web ui: ``select * from ipfs.`/ipfs/QmcbeavnEofA6NjG7vkpe1yLJo6En6ML4JnDooDn1BbKmR#json` `` .
   If the query takes too long to complete, try reducing the timeout values as well as the `max-peers-per-leaf` value in the plugin config.
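   
   For reference, the knobs I mean look roughly like this in the plugin config (apart from `max-peers-per-leaf`, the key names here are sketched from memory and may not match the shipped config exactly):
   
   ```json
   {
     "type": "ipfs",
     "host": "127.0.0.1",
     "port": 5001,
     "max-peers-per-leaf": 4,
     "ipfs-timeouts": {
       "find-provider": 4,
       "find-peer-info": 4,
       "fetch-data": 6
     },
     "enabled": true
   }
   ```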


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [drill] dbw9580 commented on a change in pull request #2084: DRILL-7745: Add storage plugin for IPFS

Posted by GitBox <gi...@apache.org>.
dbw9580 commented on a change in pull request #2084:
URL: https://github.com/apache/drill/pull/2084#discussion_r470676009



##########
File path: contrib/storage-ipfs/src/main/java/org/apache/drill/exec/store/ipfs/IPFSHelper.java
##########
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.drill.exec.store.ipfs;
+
+import io.ipfs.api.IPFS;
+import io.ipfs.api.MerkleNode;
+import io.ipfs.multiaddr.MultiAddress;
+import io.ipfs.multihash.Multihash;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.bouncycastle.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FETCH_DATA;
+import static org.apache.drill.exec.store.ipfs.IPFSStoragePluginConfig.IPFSTimeOut.FIND_PEER_INFO;
+
+/**
+ * Helper class with some utilities that are specific to Drill with an IPFS storage
+ */
+public class IPFSHelper {
+  private static final Logger logger = LoggerFactory.getLogger(IPFSHelper.class);
+
+  public static final String IPFS_NULL_OBJECT_HASH = "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n";
+  public static final Multihash IPFS_NULL_OBJECT = Multihash.fromBase58(IPFS_NULL_OBJECT_HASH);
+
+  private ExecutorService executorService;
+  private final IPFS client;
+  private final IPFSCompat clientCompat;
+  private IPFSPeer myself;
+  private int maxPeersPerLeaf;
+  private Map<IPFSTimeOut, Integer> timeouts;
+
+  public IPFSHelper(IPFS ipfs) {
+    this.client = ipfs;
+    this.clientCompat = new IPFSCompat(ipfs);
+  }
+
+  public IPFSHelper(IPFS ipfs, ExecutorService executorService) {
+    this(ipfs);
+    this.executorService = executorService;
+  }
+
+  public void setTimeouts(Map<IPFSTimeOut, Integer> timeouts) {
+    this.timeouts = timeouts;
+  }
+
+  public void setMyself(IPFSPeer myself) {
+    this.myself = myself;
+  }
+
+  /**
+   * Set maximum number of providers per leaf node. The more providers, the more time it takes to do DHT queries, while
+   * it is more likely we can find an optimal peer.
+   * @param maxPeersPerLeaf max number of providers to search per leaf node
+   */
+  public void setMaxPeersPerLeaf(int maxPeersPerLeaf) {
+    this.maxPeersPerLeaf = maxPeersPerLeaf;
+  }
+
+  public IPFS getClient() {
+    return client;
+  }
+
+  public IPFSCompat getClientCompat() {
+    return clientCompat;
+  }
+
+  public List<Multihash> findprovsTimeout(Multihash id) {
+    List<String> providers;
+    providers = clientCompat.dht.findprovsListTimeout(id, maxPeersPerLeaf, timeouts.get(IPFSTimeOut.FIND_PROV), executorService);
+
+    return providers.stream().map(Multihash::fromBase58).collect(Collectors.toList());
+  }
+
+  public List<MultiAddress> findpeerTimeout(Multihash peerId) {
+    // trying to resolve addresses of a node itself will always hang
+    // so we treat it specially
+    if(peerId.equals(myself.getId())) {
+      return myself.getMultiAddresses();
+    }
+
+    List<String> addrs;
+    addrs = clientCompat.dht.findpeerListTimeout(peerId, timeouts.get(IPFSTimeOut.FIND_PEER_INFO), executorService);
+    return addrs.stream()
+        .filter(addr -> !addr.equals(""))
+        .map(MultiAddress::new).collect(Collectors.toList());
+  }
+
+  public byte[] getObjectDataTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::data, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public MerkleNode getObjectLinksTimeout(Multihash object) throws IOException {
+    return timedFailure(client.object::links, object, timeouts.get(IPFSTimeOut.FETCH_DATA));
+  }
+
+  public IPFSPeer getMyself() throws IOException {
+    if (this.myself != null) {
+      return this.myself;
+    }
+
+    Map res = timedFailure(client::id, timeouts.get(FIND_PEER_INFO));
+    Multihash myID = Multihash.fromBase58((String) res.get("ID"));
+    // Rule out any non-local addresses as they might be NAT-ed external
+    // addresses that are not always reachable from the inside.
+    // But is it safe to assume IPFS always listens on loopback and local addresses?
+    List<MultiAddress> myAddrs = ((List<String>) res.get("Addresses"))
+        .stream()
+        .map(MultiAddress::new)
+        .filter(addr -> {
+          try {
+            InetAddress inetAddress = InetAddress.getByName(addr.getHost());
+            return inetAddress.isSiteLocalAddress()
+                || inetAddress.isLinkLocalAddress()
+                || inetAddress.isLoopbackAddress();
+          } catch (UnknownHostException e) {
+            return false;
+          }
+        })
+        .collect(Collectors.toList());
+    this.myself = new IPFSPeer(this, myID, myAddrs);
+
+    return this.myself;
+  }
+
+  public Multihash resolve(String prefix, String path, boolean recursive) {
+    Map<String, String> result = timedFailure(
+        (args) -> clientCompat.resolve((String) args.get(0), (String) args.get(1), (boolean) args.get(2)),
+        ImmutableList.<Object>of(prefix, path, recursive),
+        timeouts.get(IPFSTimeOut.FIND_PEER_INFO)
+    );
+    if (!result.containsKey("Path")) {
+      return null;
+    }
+
+    // the path returned is of form /ipfs/Qma...
+    String hashString = result.get("Path").split("/")[2];
+    return Multihash.fromBase58(hashString);
+  }
+
+  @FunctionalInterface
+  public interface ThrowingFunction<T, R, E extends Exception>{
+    R apply(final T in) throws E;
+  }
+
+  @FunctionalInterface
+  public interface ThrowingSupplier<R, E extends Exception> {
+    R get() throws E;
+  }
+
+  /**
+   * Execute a time-critical operation op within time timeout. Causes the query to fail completely
+   * if the operation times out.
+   * @param op a Function that represents the operation to perform
+   * @param in the parameter for op
+   * @param timeout consider the execution has timed out after this amount of time in seconds
+   * @param <T> Input type
+   * @param <R> Return type
+   * @param <E> Type of checked exception op throws
+   * @return R the result of the operation
+   * @throws E when the function throws an E
+   */
+  public <T, R, E extends Exception> R timedFailure(ThrowingFunction<T, R, E> op, T in, int timeout) throws E {
+    Callable<R> task = () -> op.apply(in);
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  public <R, E extends Exception> R timedFailure(ThrowingSupplier<R, E> op, int timeout) throws E {
+    Callable<R> task = op::get;
+    return timedFailure(task, timeout, TimeUnit.SECONDS);
+  }
+
+  private <R, E extends Exception> R timedFailure(Callable<R> task, int timeout, TimeUnit timeUnit) throws E {
+    Future<R> res = executorService.submit(task);
+    try {
+      return res.get(timeout, timeUnit);
+    } catch (ExecutionException e) {
+      throw (E) e.getCause();

Review comment:
       I'd like to keep it that way. The caller could recover from the exception.
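       
       To illustrate, a caller-side sketch (`helper`, `hash` and `logger` assumed to be in scope):
       
       ```java
       try {
         byte[] data = helper.getObjectDataTimeout(hash);
         // ... process data ...
       } catch (IOException e) {
         // timedFailure rethrows the checked exception unchanged, so the
         // caller can log, retry, or fall back instead of failing the query.
         logger.warn("Failed to fetch object data, falling back", e);
       }
       ```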




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org