You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tinkerpop.apache.org by ok...@apache.org on 2015/03/18 15:51:39 UTC

incubator-tinkerpop git commit: there is no longer a distinction between spark and giraph properties files in conf/. You can have the same properties file with both spark and giraph graph computer configurations specified. Much cleaner and easier to think about and manage.

Repository: incubator-tinkerpop
Updated Branches:
  refs/heads/master c53d61e1b -> 0c3b036f8


there is no longer a distinction between spark and giraph properties files in conf/. You can have the same properties file with both spark and giraph graph computer configurations specified. Much cleaner and easier to think about and manage.


Project: http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/commit/0c3b036f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/tree/0c3b036f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/diff/0c3b036f

Branch: refs/heads/master
Commit: 0c3b036f839590ea34db2ad2b0b2dcbd2e574df4
Parents: c53d61e
Author: Marko A. Rodriguez <ok...@gmail.com>
Authored: Wed Mar 18 08:51:25 2015 -0600
Committer: Marko A. Rodriguez <ok...@gmail.com>
Committed: Wed Mar 18 08:51:36 2015 -0600

----------------------------------------------------------------------
 hadoop-gremlin/conf/giraph-graphson.properties  | 57 --------------------
 hadoop-gremlin/conf/giraph-gryo.properties      | 41 --------------
 hadoop-gremlin/conf/hadoop-graphson.properties  | 35 ++++++++++++
 hadoop-gremlin/conf/hadoop-gryo.properties      | 48 +++++++++++++++++
 hadoop-gremlin/conf/spark-gryo.properties       | 39 --------------
 .../computer/spark/SparkGraphComputer.java      |  4 +-
 .../computer/spark/util/SparkHelper.java        | 14 +++--
 7 files changed, 96 insertions(+), 142 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/blob/0c3b036f/hadoop-gremlin/conf/giraph-graphson.properties
----------------------------------------------------------------------
diff --git a/hadoop-gremlin/conf/giraph-graphson.properties b/hadoop-gremlin/conf/giraph-graphson.properties
deleted file mode 100644
index b6f28c6..0000000
--- a/hadoop-gremlin/conf/giraph-graphson.properties
+++ /dev/null
@@ -1,57 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# the graph class
-gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
-# i/o formats for graphs and memory (i.e. computer result)
-gremlin.hadoop.graphInputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONInputFormat
-gremlin.hadoop.graphOutputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONOutputFormat
-gremlin.hadoop.memoryOutputFormat=org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
-# i/o locations
-gremlin.hadoop.inputLocation=tinkerpop-modern-vertices.ldjson
-gremlin.hadoop.outputLocation=output
-# deriving a complete view of the memory requires an extra mapreduce job and thus, if not needed, should be avoided
-gremlin.hadoop.deriveMemory=false
-# if the job jars are not on the classpath of every hadoop node, then they must be provided to the distributed cache at runtime
-gremlin.hadoop.jarsInDistributedCache=true
-# the vertex program to execute
-gremlin.vertexProgram=org.apache.tinkerpop.gremlin.process.computer.ranking.pagerank.PageRankVertexProgram
-
-# It is possible to provide Giraph configuration parameters for use with GiraphGraphComputer
-############################################################################################
-giraph.minWorkers=2
-giraph.maxWorkers=2
-# giraph.useInputSplitLocality=false
-# giraph.logLevel=debug
-
-# It is possible to provide Hadoop configuration parameters.
-# Note that these parameters are provided to each MapReduce job within the entire Hadoop-Gremlin job pipeline.
-# Some of these parameters may be over written by Hadoop-Gremlin as deemed necessary.
-##############################################################################################################
-# mapred.linerecordreader.maxlength=5242880
-# mapred.map.child.java.opts=-Xmx1024m
-# mapred.reduce.child.java.opts=-Xmx1024m
-# mapred.map.tasks=6
-# mapred.reduce.tasks=3
-# mapred.job.reuse.jvm.num.tasks=-1
-# mapred.task.timeout=5400000
-# mapred.reduce.parallel.copies=50
-# io.sort.factor=100
-# io.sort.mb=200
-
-
-

http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/blob/0c3b036f/hadoop-gremlin/conf/giraph-gryo.properties
----------------------------------------------------------------------
diff --git a/hadoop-gremlin/conf/giraph-gryo.properties b/hadoop-gremlin/conf/giraph-gryo.properties
deleted file mode 100644
index 9d999a6..0000000
--- a/hadoop-gremlin/conf/giraph-gryo.properties
+++ /dev/null
@@ -1,41 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
-gremlin.hadoop.graphInputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.gryo.GryoInputFormat
-gremlin.hadoop.graphOutputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.gryo.GryoOutputFormat
-gremlin.hadoop.memoryOutputFormat=org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
-gremlin.hadoop.deriveMemory=false
-gremlin.hadoop.jarsInDistributedCache=true
-
-gremlin.hadoop.inputLocation=tinkerpop-modern-vertices.kryo
-gremlin.hadoop.outputLocation=output
-#gremlin.vertexProgram=org.apache.tinkerpop.gremlin.process.computer.traversal.TraversalVertexProgram
-#gremlin.traversalVertexProgram.traversalSupplier.type=CLASS
-#gremlin.traversalVertexProgram.traversalSupplier.object=org.apache.tinkerpop.gremlin.hadoop.process.computer.example.TraversalSupplier1
-
-giraph.minWorkers=2
-giraph.maxWorkers=2
-giraph.maxPartitionsInMemory=1
-giraph.userPartitionCount=2
-giraph.useOutOfCoreGraph=true
-giraph.isStaticGraph=true
-mapred.map.child.java.opts=-Xmx1024m
-mapred.reduce.child.java.opts=-Xmx1024m
-giraph.numInputThreads=4
-giraph.numComputeThreads=4
-giraph.vertexOutputFormatThreadSafe=true
-giraph.numOutputThreads=4
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/blob/0c3b036f/hadoop-gremlin/conf/hadoop-graphson.properties
----------------------------------------------------------------------
diff --git a/hadoop-gremlin/conf/hadoop-graphson.properties b/hadoop-gremlin/conf/hadoop-graphson.properties
new file mode 100644
index 0000000..782e979
--- /dev/null
+++ b/hadoop-gremlin/conf/hadoop-graphson.properties
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# the graph class
+gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
+# i/o formats for graphs and memory (i.e. computer result)
+gremlin.hadoop.graphInputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONInputFormat
+gremlin.hadoop.graphOutputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONOutputFormat
+gremlin.hadoop.memoryOutputFormat=org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
+# i/o locations
+gremlin.hadoop.inputLocation=tinkerpop-modern-vertices.ldjson
+gremlin.hadoop.outputLocation=output
+# deriving a complete view of the memory requires an extra mapreduce job and thus, if not needed, should be avoided
+gremlin.hadoop.deriveMemory=false
+# if the job jars are not on the classpath of every hadoop node, then they must be provided to the distributed cache at runtime
+gremlin.hadoop.jarsInDistributedCache=true
+# the vertex program to execute
+gremlin.vertexProgram=org.apache.tinkerpop.gremlin.process.computer.ranking.pagerank.PageRankVertexProgram
+
+
+

http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/blob/0c3b036f/hadoop-gremlin/conf/hadoop-gryo.properties
----------------------------------------------------------------------
diff --git a/hadoop-gremlin/conf/hadoop-gryo.properties b/hadoop-gremlin/conf/hadoop-gryo.properties
new file mode 100644
index 0000000..7677bd2
--- /dev/null
+++ b/hadoop-gremlin/conf/hadoop-gryo.properties
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
+gremlin.hadoop.graphInputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.gryo.GryoInputFormat
+gremlin.hadoop.graphOutputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.gryo.GryoOutputFormat
+gremlin.hadoop.memoryOutputFormat=org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
+gremlin.hadoop.deriveMemory=false
+gremlin.hadoop.jarsInDistributedCache=true
+
+gremlin.hadoop.inputLocation=tinkerpop-modern-vertices.kryo
+gremlin.hadoop.outputLocation=output
+
+#####################################
+# GiraphGraphComputer Configuration #
+#####################################
+giraph.minWorkers=2
+giraph.maxWorkers=2
+giraph.useOutOfCoreGraph=true
+mapred.map.child.java.opts=-Xmx1024m
+mapred.reduce.child.java.opts=-Xmx1024m
+giraph.numInputThreads=4
+giraph.numComputeThreads=4
+# giraph.vertexOutputFormatThreadSafe=true
+# giraph.numOutputThreads=4
+# giraph.maxPartitionsInMemory=1
+# giraph.userPartitionCount=2
+
+####################################
+# SparkGraphComputer Configuration #
+####################################
+spark.master=local[4]
+spark.executor.memory=1024m
+spark.eventLog.enabled=true
+spark.serializer=org.apache.spark.serializer.JavaSerializer
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/blob/0c3b036f/hadoop-gremlin/conf/spark-gryo.properties
----------------------------------------------------------------------
diff --git a/hadoop-gremlin/conf/spark-gryo.properties b/hadoop-gremlin/conf/spark-gryo.properties
deleted file mode 100644
index bfaa6ee..0000000
--- a/hadoop-gremlin/conf/spark-gryo.properties
+++ /dev/null
@@ -1,39 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
-gremlin.hadoop.defaultGraphComputer=org.apache.tinkerpop.gremlin.hadoop.process.computer.spark.SparkGraphComputer
-gremlin.hadoop.graphInputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.gryo.GryoInputFormat
-gremlin.hadoop.graphOutputFormat=org.apache.tinkerpop.gremlin.hadoop.structure.io.gryo.GryoOutputFormat
-gremlin.hadoop.memoryOutputFormat=org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
-gremlin.hadoop.deriveMemory=false
-gremlin.hadoop.jarsInDistributedCache=false
-
-gremlin.hadoop.inputLocation=hdfs://localhost:9000/user/marko/tinkerpop-modern-vertices.kryo
-gremlin.hadoop.outputLocation=output
-
-# the vertex program to execute
-gremlin.vertexProgram=org.apache.tinkerpop.gremlin.process.computer.ranking.pagerank.PageRankVertexProgram
-
-# It is possible to provide Spark configuration parameters for use with SparkGraphComputer
-##########################################################################################
-spark.master=local[4]
-spark.executor.memory=1024m
-spark.eventLog.enabled=true
-spark.serializer=org.apache.spark.serializer.JavaSerializer
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/blob/0c3b036f/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/SparkGraphComputer.java
----------------------------------------------------------------------
diff --git a/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/SparkGraphComputer.java b/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/SparkGraphComputer.java
index cd243e5..0dae435 100644
--- a/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/SparkGraphComputer.java
+++ b/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/SparkGraphComputer.java
@@ -119,14 +119,14 @@ public final class SparkGraphComputer implements GraphComputer {
         return CompletableFuture.<ComputerResult>supplyAsync(() -> {
                     final long startTime = System.currentTimeMillis();
                     SparkMemory memory = null;
-                    SparkHelper.deleteOutputDirectory(hadoopConfiguration);
+                    SparkHelper.deleteOutputLocation(hadoopConfiguration);
 
                     // wire up a spark context
                     final SparkConf sparkConfiguration = new SparkConf();
                     sparkConfiguration.setAppName(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX + (null == this.vertexProgram ? "No VertexProgram" : this.vertexProgram) + "[" + this.mapReducers + "]");
                     hadoopConfiguration.forEach(entry -> sparkConfiguration.set(entry.getKey(), entry.getValue()));
                     if (FileInputFormat.class.isAssignableFrom(hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_INPUT_FORMAT, InputFormat.class)))
-                        hadoopConfiguration.set(Constants.MAPRED_INPUT_DIR, hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION)); // necessary for Spark and newAPIHadoopRDD
+                        hadoopConfiguration.set(Constants.MAPRED_INPUT_DIR, SparkHelper.getInputLocation(hadoopConfiguration)); // necessary for Spark and newAPIHadoopRDD
                     // execute the vertex program and map reducers and if there is a failure, auto-close the spark context
                     try (final JavaSparkContext sparkContext = new JavaSparkContext(sparkConfiguration)) {
                         // add the project jars to the cluster

http://git-wip-us.apache.org/repos/asf/incubator-tinkerpop/blob/0c3b036f/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/util/SparkHelper.java
----------------------------------------------------------------------
diff --git a/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/util/SparkHelper.java b/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/util/SparkHelper.java
index 478e805..cff5b87 100644
--- a/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/util/SparkHelper.java
+++ b/hadoop-gremlin/src/main/java/org/apache/tinkerpop/gremlin/hadoop/process/computer/spark/util/SparkHelper.java
@@ -130,17 +130,25 @@ public final class SparkHelper {
         return reduceRDD;
     }
 
-    public static void deleteOutputDirectory(final org.apache.hadoop.conf.Configuration hadoopConfiguration) {
-        final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
+    public static void deleteOutputLocation(final org.apache.hadoop.conf.Configuration hadoopConfiguration) {
+        final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
         if (null != outputLocation) {
             try {
-                FileSystem.get(hadoopConfiguration).delete(new Path(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)), true);
+                FileSystem.get(hadoopConfiguration).delete(new Path(outputLocation), true);
             } catch (final IOException e) {
                 throw new IllegalStateException(e.getMessage(), e);
             }
         }
     }
 
+    public static String getInputLocation(final org.apache.hadoop.conf.Configuration hadoopConfiguration) {
+        try {
+            return FileSystem.get(hadoopConfiguration).getFileStatus(new Path(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION))).getPath().toString();
+        } catch (final IOException e) {
+            throw new IllegalStateException(e.getMessage(), e);
+        }
+    }
+
     public static <M> void saveGraphRDD(final JavaPairRDD<Object, SparkPayload<M>> graphRDD, final org.apache.hadoop.conf.Configuration hadoopConfiguration) {
         final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
         if (null != outputLocation) {