Posted to commits@jena.apache.org by rv...@apache.org on 2014/11/26 11:05:04 UTC

svn commit: r1641787 - /jena/site/trunk/content/documentation/hadoop/index.mdtext

Author: rvesse
Date: Wed Nov 26 10:05:04 2014
New Revision: 1641787

URL: http://svn.apache.org/r1641787
Log:
Finish first pass of RDF Tools for Hadoop index page

Modified:
    jena/site/trunk/content/documentation/hadoop/index.mdtext

Modified: jena/site/trunk/content/documentation/hadoop/index.mdtext
URL: http://svn.apache.org/viewvc/jena/site/trunk/content/documentation/hadoop/index.mdtext?rev=1641787&r1=1641786&r2=1641787&view=diff
==============================================================================
--- jena/site/trunk/content/documentation/hadoop/index.mdtext (original)
+++ jena/site/trunk/content/documentation/hadoop/index.mdtext Wed Nov 26 10:05:04 2014
@@ -18,7 +18,7 @@ underlying plumbing.
     - [Map/Reduce](mapred.html)
 - Examples
     - [RDF Stats Demo](demo.html)
-- [Maven Artifacts for Jena JDBC](artifacts.html)
+- [Maven Artifacts](artifacts.html)
 
 ## Overview
 
@@ -60,11 +60,13 @@ on what you are trying to do.  Typically
       <version>x.y.z</version>
     </dependency>
 
-Our libraries depend on the relevant Hadoop libraries but since these libraries are provided by the cluster those dependencies are marked as `provided` and thus are not transitive.  This means that you will typically also need to add the following additional dependencies:
+Our libraries depend on the relevant Hadoop libraries, but since these are typically provided by the Hadoop cluster those dependencies are marked as `provided` and thus are not transitive.  This means that you will typically also need to add the following additional dependencies:
 
     <!-- Hadoop Dependencies -->
-    <!-- Note these will be provided on the Hadoop cluster hence the provided 
-            scope -->
+    <!-- 
+        Note these will be provided on the Hadoop cluster hence the provided 
+        scope 
+    -->
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-common</artifactId>
@@ -91,17 +93,15 @@ We will start with our `Mapper` implemen
 then outputs each node with an initial count of 1:
 
     package org.apache.jena.hadoop.rdf.mapreduce.count;
-
+    
     import org.apache.jena.hadoop.rdf.types.NodeWritable;
     import org.apache.jena.hadoop.rdf.types.TripleWritable;
     import com.hp.hpl.jena.graph.Triple;
-
+    
     /**
      * A mapper for counting node usages within triples designed primarily for use
      * in conjunction with {@link NodeCountReducer}
-     * 
-     * 
-     * 
+     *
      * @param <TKey> Key type
      */
     public class TripleNodeCountMapper<TKey> extends AbstractNodeTupleNodeCountMapper<TKey, Triple, TripleWritable> {
@@ -149,57 +149,60 @@ us with support for our desired RDF inpu
 
     package org.apache.jena.hadoop.rdf.stats;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat;
-import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat;
-import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer;
-import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper;
-import org.apache.jena.hadoop.rdf.types.NodeWritable;
-
-public class RdfMapReduceExample {
-
-    public static void main(String[] args) {
-        try {
-            // Get Hadoop configuration
-            Configuration config = new Configuration(true);
-
-            // Create job
-            Job job = Job.getInstance(config);
-            job.setJarByClass(RdfMapReduceExample.class);
-            job.setJobName("RDF Triples Node Usage Count");
-
-            // Map/Reduce classes
-            job.setMapperClass(TripleNodeCountMapper.class);
-            job.setMapOutputKeyClass(NodeWritable.class);
-            job.setMapOutputValueClass(LongWritable.class);
-            job.setReducerClass(NodeCountReducer.class);
-
-            // Input and Output
-            job.setInputFormatClass(TriplesInputFormat.class);
-            job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
-            FileInputFormat.setInputPaths(job, new Path("/example/input/"));
-            FileOutputFormat.setOutputPath(job, new Path("/example/output/"));
-
-            // Launch the job and await completion
-            job.submit();
-            if (job.monitorAndPrintJob()) {
-                // OK
-                System.out.println("Completed");
-            } else {
-                // Failed
-                System.err.println("Failed");
+    import org.apache.hadoop.conf.Configuration;
+    import org.apache.hadoop.fs.Path;
+    import org.apache.hadoop.io.LongWritable;
+    import org.apache.hadoop.mapreduce.Job;
+    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+    import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat;
+    import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat;
+    import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer;
+    import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper;
+    import org.apache.jena.hadoop.rdf.types.NodeWritable;
+    
+    public class RdfMapReduceExample {
+
+        public static void main(String[] args) {
+            try {
+                // Get Hadoop configuration
+                Configuration config = new Configuration(true);
+
+                // Create job
+                Job job = Job.getInstance(config);
+                job.setJarByClass(RdfMapReduceExample.class);
+                job.setJobName("RDF Triples Node Usage Count");
+ 
+                // Map/Reduce classes
+                job.setMapperClass(TripleNodeCountMapper.class);
+                job.setMapOutputKeyClass(NodeWritable.class);
+                job.setMapOutputValueClass(LongWritable.class);
+                job.setReducerClass(NodeCountReducer.class);
+
+                // Input and Output
+                job.setInputFormatClass(TriplesInputFormat.class);
+                job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
+                FileInputFormat.setInputPaths(job, new Path("/example/input/"));
+                FileOutputFormat.setOutputPath(job, new Path("/example/output/"));
+
+                // Launch the job and await completion
+                job.submit();
+                if (job.monitorAndPrintJob()) {
+                    // OK
+                    System.out.println("Completed");
+                } else {
+                    // Failed
+                    System.err.println("Failed");
+                }
+            } catch (Throwable e) {
+                e.printStackTrace();
             }
-        } catch (Throwable e) {
-            e.printStackTrace();
         }
     }
-}
 
+So this really is no different from configuring any other Hadoop job: we simply have to point to the relevant input and output formats and provide our mapper and reducer.  Note that here we use the `TriplesInputFormat`, which can handle RDF in any Jena supported format; if you know your RDF is in a specific format it is usually more efficient to use a more specific input format.  Please see the [IO](io.html) page for more detail on the available input formats and the differences between them.
+
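+For example, if you know your data will always be in NTriples you could swap in a format specific input format.  A minimal sketch, assuming an NTriples-specific input format class in the `org.apache.jena.hadoop.rdf.io.input` package hierarchy (see the [IO](io.html) page for the input formats that actually exist):
+
+    // Assumed import: org.apache.jena.hadoop.rdf.io.input.ntriples.NTriplesInputFormat
+    // Input is known to be NTriples so a format specific input format can be
+    // used in place of the general purpose TriplesInputFormat
+    job.setInputFormatClass(NTriplesInputFormat.class);
+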
+We recommend that you next take a look at our [RDF Stats Demo](demo.html), which shows how to do some more complex computations by chaining multiple jobs together.
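+
+For job chaining the general Hadoop pattern is to write the output of one job to an intermediate path and use that path as the input of the next job.  The following is a rough sketch only (the paths are made up and this is not the demo's actual code), reusing the classes from the example above:
+
+    // Inside the same try/catch as the earlier example
+    // First job writes its results to an intermediate path (hypothetical path)
+    Path intermediate = new Path("/example/intermediate/");
+    Job first = Job.getInstance(config);
+    // ... configure mapper, reducer, input and output formats as shown above ...
+    FileOutputFormat.setOutputPath(first, intermediate);
+    if (!first.waitForCompletion(true)) {
+        throw new RuntimeException("First job failed");
+    }
+
+    // Second job reads the intermediate data and produces the final output
+    Job second = Job.getInstance(config);
+    // ... configure the second job ...
+    FileInputFormat.setInputPaths(second, intermediate);
+    FileOutputFormat.setOutputPath(second, new Path("/example/output/"));
+    System.out.println(second.waitForCompletion(true) ? "Completed" : "Failed");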
 
 ## APIs