You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by pw...@apache.org on 2013/09/25 02:14:46 UTC

svn commit: r2978 [5/12] - in /dev/incubator/spark/spark-0.8.0-incubating-rc6-docs: ./ css/ img/ js/ js/vendor/

Added: dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/java-programming-guide.html
==============================================================================
--- dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/java-programming-guide.html (added)
+++ dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/java-programming-guide.html Wed Sep 25 00:14:43 2013
@@ -0,0 +1,373 @@
+<!DOCTYPE html>
+<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
+<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
+<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
+    <head>
+        <meta charset="utf-8">
+        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+        <title>Java Programming Guide - Spark 0.8.0 Documentation</title>
+        <meta name="description" content="">
+
+        <link rel="stylesheet" href="css/bootstrap.min.css">
+        <style>
+            body {
+                padding-top: 60px;
+                padding-bottom: 40px;
+            }
+        </style>
+        <meta name="viewport" content="width=device-width">
+        <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
+        <link rel="stylesheet" href="css/main.css">
+
+        <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
+        
+        <link rel="stylesheet" href="css/pygments-default.css">
+
+        <!-- Google analytics script -->
+        <script type="text/javascript">
+          /*
+          var _gaq = _gaq || [];
+          _gaq.push(['_setAccount', 'UA-32518208-1']);
+          _gaq.push(['_trackPageview']);
+
+          (function() {
+            var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+            ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+            var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+          })();
+          */
+        </script>
+
+    </head>
+    <body>
+        <!--[if lt IE 7]>
+            <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
+        <![endif]-->
+
+        <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
+
+        <div class="navbar navbar-fixed-top" id="topbar">
+            <div class="navbar-inner">
+                <div class="container">
+                    <div class="brand"><a href="index.html">
+                      <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">0.8.0</span>
+                    </div>
+                    <ul class="nav">
+                        <!--TODO(andyk): Add class="active" attribute to li some how.-->
+                        <li><a href="index.html">Overview</a></li>
+
+                        <li class="dropdown">
+                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
+                            <ul class="dropdown-menu">
+                                <li><a href="quick-start.html">Quick Start</a></li>
+                                <li><a href="scala-programming-guide.html">Spark in Scala</a></li>
+                                <li><a href="java-programming-guide.html">Spark in Java</a></li>
+                                <li><a href="python-programming-guide.html">Spark in Python</a></li>
+                                <li class="divider"></li>
+                                <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
+                                <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
+                                <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
+                            </ul>
+                        </li>
+                        
+                        <li class="dropdown">
+                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
+                            <ul class="dropdown-menu">
+                                <li><a href="api/core/index.html">Spark Core for Java/Scala</a></li>
+                                <li><a href="api/pyspark/index.html">Spark Core for Python</a></li>
+                                <li class="divider"></li>
+                                <li><a href="api/streaming/index.html">Spark Streaming</a></li>
+                                <li><a href="api/mllib/index.html">MLlib (Machine Learning)</a></li>
+                                <li><a href="api/bagel/index.html">Bagel (Pregel on Spark)</a></li>
+                            </ul>
+                        </li>
+
+                        <li class="dropdown">
+                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
+                            <ul class="dropdown-menu">
+                                <li><a href="cluster-overview.html">Overview</a></li>
+                                <li><a href="ec2-scripts.html">Amazon EC2</a></li>
+                                <li><a href="spark-standalone.html">Standalone Mode</a></li>
+                                <li><a href="running-on-mesos.html">Mesos</a></li>
+                                <li><a href="running-on-yarn.html">YARN</a></li>
+                            </ul>
+                        </li>
+
+                        <li class="dropdown">
+                            <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
+                            <ul class="dropdown-menu">
+                                <li><a href="configuration.html">Configuration</a></li>
+                                <li><a href="monitoring.html">Monitoring</a></li>
+                                <li><a href="tuning.html">Tuning Guide</a></li>
+                                <li><a href="hadoop-third-party-distributions.html">Running with CDH/HDP</a></li>
+                                <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
+                                <li><a href="job-scheduling.html">Job Scheduling</a></li>
+                                <li class="divider"></li>
+                                <li><a href="building-with-maven.html">Building Spark with Maven</a></li>
+                                <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
+                            </ul>
+                        </li>
+                    </ul>
+                    <!--<p class="navbar-text pull-right"><span class="version-text">v0.8.0</span></p>-->
+                </div>
+            </div>
+        </div>
+
+        <div class="container" id="content">
+          <h1 class="title">Java Programming Guide</h1>
+
+          <p>The Spark Java API exposes all the Spark features available in the Scala version to Java.
+To learn the basics of Spark, we recommend reading through the
+<a href="scala-programming-guide.html">Scala programming guide</a> first; it should be
+easy to follow even if you don&#8217;t know Scala.
+This guide will show how to use the Spark features described there in Java.</p>
+
+<p>The Spark Java API is defined in the
+<a href="api/core/index.html#org.apache.spark.api.java.package"><code>org.apache.spark.api.java</code></a> package, and includes
+a <a href="api/core/index.html#org.apache.spark.api.java.JavaSparkContext"><code>JavaSparkContext</code></a> for
+initializing Spark and <a href="api/core/index.html#org.apache.spark.api.java.JavaRDD"><code>JavaRDD</code></a> classes,
+which support the same methods as their Scala counterparts but take Java functions and return
+Java data and collection types. The main differences have to do with passing functions to RDD
+operations (e.g. map) and handling RDDs of different types, as discussed next.</p>
+
+<h1 id="key-differences-in-the-java-api">Key Differences in the Java API</h1>
+
+<p>There are a few key differences between the Java and Scala APIs:</p>
+
+<ul>
+  <li>Java does not support anonymous or first-class functions, so functions must
+be implemented by extending the
+<a href="api/core/index.html#org.apache.spark.api.java.function.Function"><code>org.apache.spark.api.java.function.Function</code></a>,
+<a href="api/core/index.html#org.apache.spark.api.java.function.Function2"><code>Function2</code></a>, etc.
+classes.</li>
+  <li>To maintain type safety, the Java API defines specialized Function and RDD
+classes for key-value pairs and doubles. For example, 
+<a href="api/core/index.html#org.apache.spark.api.java.JavaPairRDD"><code>JavaPairRDD</code></a>
+stores key-value pairs.</li>
+  <li>RDD methods like <code>collect()</code> and <code>countByKey()</code> return Java collection types,
+such as <code>java.util.List</code> and <code>java.util.Map</code>.</li>
+  <li>Key-value pairs, which are simply written as <code>(key, value)</code> in Scala, are represented
+by the <code>scala.Tuple2</code> class, and need to be created using <code>new Tuple2&lt;K, V&gt;(key, value)</code>.</li>
+</ul>
+
+<h2 id="rdd-classes">RDD Classes</h2>
+
+<p>Spark defines additional operations on RDDs of key-value pairs and doubles, such
+as <code>reduceByKey</code>, <code>join</code>, and <code>stdev</code>.</p>
+
+<p>In the Scala API, these methods are automatically added using Scala&#8217;s
+<a href="http://www.scala-lang.org/node/130">implicit conversions</a> mechanism.</p>
+
+<p>In the Java API, the extra methods are defined in the
+<a href="api/core/index.html#org.apache.spark.api.java.JavaPairRDD"><code>JavaPairRDD</code></a>
+and <a href="api/core/index.html#org.apache.spark.api.java.JavaDoubleRDD"><code>JavaDoubleRDD</code></a>
+classes.  RDD methods like <code>map</code> are overloaded by specialized <code>PairFunction</code>
+and <code>DoubleFunction</code> classes, allowing them to return RDDs of the appropriate
+types.  Common methods like <code>filter</code> and <code>sample</code> are implemented by
+each specialized RDD class, so filtering a <code>PairRDD</code> returns a new <code>PairRDD</code>,
+etc. (this achieves the &#8220;same-result-type&#8221; principle used by the <a href="http://docs.scala-lang.org/overviews/core/architecture-of-scala-collections.html">Scala collections
+framework</a>).</p>
+
+<h2 id="function-classes">Function Classes</h2>
+
+<p>The following table lists the function classes used by the Java API.  Each
+class has a single abstract method, <code>call()</code>, that must be implemented.</p>
+
+<table class="table">
+<tr><th>Class</th><th>Function Type</th></tr>
+
+<tr><td>Function&lt;T, R&gt;</td><td>T =&gt; R </td></tr>
+<tr><td>DoubleFunction&lt;T&gt;</td><td>T =&gt; Double </td></tr>
+<tr><td>PairFunction&lt;T, K, V&gt;</td><td>T =&gt; Tuple2&lt;K, V&gt; </td></tr>
+
+<tr><td>FlatMapFunction&lt;T, R&gt;</td><td>T =&gt; Iterable&lt;R&gt; </td></tr>
+<tr><td>DoubleFlatMapFunction&lt;T&gt;</td><td>T =&gt; Iterable&lt;Double&gt; </td></tr>
+<tr><td>PairFlatMapFunction&lt;T, K, V&gt;</td><td>T =&gt; Iterable&lt;Tuple2&lt;K, V&gt;&gt; </td></tr>
+
+<tr><td>Function2&lt;T1, T2, R&gt;</td><td>T1, T2 =&gt; R (function of two arguments)</td></tr>
+</table>
+
+<h2 id="storage-levels">Storage Levels</h2>
+
+<p>RDD <a href="scala-programming-guide.html#rdd-persistence">storage level</a> constants, such as <code>MEMORY_AND_DISK</code>, are
+declared in the <a href="api/core/index.html#org.apache.spark.api.java.StorageLevels">org.apache.spark.api.java.StorageLevels</a> class. To
+define your own storage level, you can use StorageLevels.create(&#8230;). </p>
+
+<h1 id="other-features">Other Features</h1>
+
+<p>The Java API supports other Spark features, including
+<a href="scala-programming-guide.html#accumulators">accumulators</a>,
+<a href="scala-programming-guide.html#broadcast-variables">broadcast variables</a>, and
+<a href="scala-programming-guide.html#rdd-persistence">caching</a>.</p>
+
+<h1 id="example">Example</h1>
+
+<p>As an example, we will implement word count using the Java API.</p>
+
+<div class="highlight"><pre><code class="java"><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.*</span><span class="o">;</span>
+
+<span class="n">JavaSparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="k">new</span> <span class="n">JavaSparkContext</span><span class="o">(...);</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">lines</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="o">);</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">words</span> <span class="o">=</span> <span class="n">lines</span><span class="o">.</span><span class="na">flatMap</span><span class="o">(</span>
+  <span class="k">new</span> <span class="n">FlatMapFunction</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
+    <span class="kd">public</span> <span class="n">Iterable</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">s</span><span class="o">)</span> <span class="o">{</span>
+      <span class="k">return</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="n">s</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">));</span>
+    <span class="o">}</span>
+  <span class="o">}</span>
+<span class="o">);</span>
+</code></pre></div>
+
+<p>The word count program starts by creating a <code>JavaSparkContext</code>, which accepts
+the same parameters as its Scala counterpart.  <code>JavaSparkContext</code> supports the
+same data loading methods as the regular <code>SparkContext</code>; here, <code>textFile</code>
+loads lines from text files stored in HDFS.</p>
+
+<p>To split the lines into words, we use <code>flatMap</code> to split each line on
+whitespace.  <code>flatMap</code> is passed a <code>FlatMapFunction</code> that accepts a string and
+returns a <code>java.lang.Iterable</code> of strings.</p>
+
+<p>Here, the <code>FlatMapFunction</code> was created inline; another option is to subclass
+<code>FlatMapFunction</code> and pass an instance to <code>flatMap</code>:</p>
+
+<div class="highlight"><pre><code class="java"><span class="kd">class</span> <span class="nc">Split</span> <span class="kd">extends</span> <span class="n">FlatMapFunction</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;</span> <span class="o">{</span>
+  <span class="kd">public</span> <span class="n">Iterable</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">s</span><span class="o">)</span> <span class="o">{</span>
+    <span class="k">return</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="n">s</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">));</span>
+  <span class="o">}</span>
+<span class="o">}</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">words</span> <span class="o">=</span> <span class="n">lines</span><span class="o">.</span><span class="na">flatMap</span><span class="o">(</span><span class="k">new</span> <span class="n">Split</span><span class="o">());</span>
+</code></pre></div>
+
+<p>Continuing with the word count example, we map each word to a <code>(word, 1)</code> pair:</p>
+
+<div class="highlight"><pre><code class="java"><span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span>
+<span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;</span> <span class="n">ones</span> <span class="o">=</span> <span class="n">words</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+  <span class="k">new</span> <span class="n">PairFunction</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
+    <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;</span> <span class="n">call</span><span class="o">(</span><span class="n">String</span> <span class="n">s</span><span class="o">)</span> <span class="o">{</span>
+      <span class="k">return</span> <span class="k">new</span> <span class="nf">Tuple2</span><span class="o">(</span><span class="n">s</span><span class="o">,</span> <span class="mi">1</span><span class="o">);</span>
+    <span class="o">}</span>
+  <span class="o">}</span>
+<span class="o">);</span>
+</code></pre></div>
+
+<p>Note that <code>map</code> was passed a <code>PairFunction&lt;String, String, Integer&gt;</code> and
+returned a <code>JavaPairRDD&lt;String, Integer&gt;</code>.</p>
+
+<p>To finish the word count program, we will use <code>reduceByKey</code> to count the
+occurrences of each word:</p>
+
+<div class="highlight"><pre><code class="java"><span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;</span> <span class="n">counts</span> <span class="o">=</span> <span class="n">ones</span><span class="o">.</span><span class="na">reduceByKey</span><span class="o">(</span>
+  <span class="k">new</span> <span class="n">Function2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
+    <span class="kd">public</span> <span class="n">Integer</span> <span class="nf">call</span><span class="o">(</span><span class="n">Integer</span> <span class="n">i1</span><span class="o">,</span> <span class="n">Integer</span> <span class="n">i2</span><span class="o">)</span> <span class="o">{</span>
+      <span class="k">return</span> <span class="n">i1</span> <span class="o">+</span> <span class="n">i2</span><span class="o">;</span>
+    <span class="o">}</span>
+  <span class="o">}</span>
+<span class="o">);</span>
+</code></pre></div>
+
+<p>Here, <code>reduceByKey</code> is passed a <code>Function2</code>, which implements a function with
+two arguments.  The resulting <code>JavaPairRDD</code> contains <code>(word, count)</code> pairs.</p>
+
+<p>In this example, we explicitly showed each intermediate RDD.  It is also
+possible to chain the RDD transformations, so the word count example could also
+be written as:</p>
+
+<div class="highlight"><pre><code class="java"><span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;</span> <span class="n">counts</span> <span class="o">=</span> <span class="n">lines</span><span class="o">.</span><span class="na">flatMap</span><span class="o">(</span>
+    <span class="o">...</span>
+  <span class="o">).</span><span class="na">map</span><span class="o">(</span>
+    <span class="o">...</span>
+  <span class="o">).</span><span class="na">reduceByKey</span><span class="o">(</span>
+    <span class="o">...</span>
+  <span class="o">);</span>
+</code></pre></div>
+
+<p>There is no performance difference between these approaches; the choice is
+just a matter of style.</p>
+
+<h1 id="javadoc">Javadoc</h1>
+
+<p>We currently provide documentation for the Java API as Scaladoc, in the
+<a href="api/core/index.html#org.apache.spark.api.java.package"><code>org.apache.spark.api.java</code> package</a>, because
+some of the classes are implemented in Scala. The main downside is that the types and function
+definitions show Scala syntax (for example, <code>def reduce(func: Function2[T, T, T]): T</code> instead of
+<code>T reduce(Function2&lt;T, T, T&gt; func)</code>). 
+We hope to generate documentation with Java-style syntax in the future.</p>
+
+<h1 id="where-to-go-from-here">Where to Go from Here</h1>
+
+<p>Spark includes several sample programs using the Java API in
+<a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/examples"><code>examples/src/main/java</code></a>.  You can run them by passing the class name to the
+<code>run-example</code> script included in Spark; for example:</p>
+
+<pre><code>./run-example org.apache.spark.examples.JavaWordCount
+</code></pre>
+
+<p>Each example program prints usage help when run
+without any arguments.</p>
+
+            <!-- Main hero unit for a primary marketing message or call to action -->
+            <!--<div class="hero-unit">
+                <h1>Hello, world!</h1>
+                <p>This is a template for a simple marketing or informational website. It includes a large callout called the hero unit and three supporting pieces of content. Use it as a starting point to create something more unique.</p>
+                <p><a class="btn btn-primary btn-large">Learn more &raquo;</a></p>
+            </div>-->
+
+            <!-- Example row of columns -->
+            <!--<div class="row">
+                <div class="span4">
+                    <h2>Heading</h2>
+                    <p>Donec id elit non mi porta gravida at eget metus. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus. Etiam porta sem malesuada magna mollis euismod. Donec sed odio dui. </p>
+                    <p><a class="btn" href="#">View details &raquo;</a></p>
+                </div>
+                <div class="span4">
+                    <h2>Heading</h2>
+                    <p>Donec id elit non mi porta gravida at eget metus. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus. Etiam porta sem malesuada magna mollis euismod. Donec sed odio dui. </p>
+                    <p><a class="btn" href="#">View details &raquo;</a></p>
+               </div>
+                <div class="span4">
+                    <h2>Heading</h2>
+                    <p>Donec sed odio dui. Cras justo odio, dapibus ac facilisis in, egestas eget quam. Vestibulum id ligula porta felis euismod semper. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus.</p>
+                    <p><a class="btn" href="#">View details &raquo;</a></p>
+                </div>
+            </div>
+
+            <hr>-->
+
+            <footer>
+              <hr>
+              <p style="text-align: center; vertical-align: middle; color: #999;">
+                Apache Spark is an effort undergoing incubation at the Apache Software Foundation.
+                <a href="http://incubator.apache.org">
+                  <img style="margin-left: 20px;" src="img/incubator-logo.png" />
+                </a>
+              </p>
+            </footer>
+
+        </div> <!-- /container -->
+
+        <script src="js/vendor/jquery-1.8.0.min.js"></script>
+        <script src="js/vendor/bootstrap.min.js"></script>
+        <script src="js/main.js"></script>
+        
+        <!-- A script to fix internal hash links because we have an overlapping top bar.
+             Based on https://github.com/twitter/bootstrap/issues/193#issuecomment-2281510 -->
+        <script>
+          $(function() {
+            function maybeScrollToHash() {
+              if (window.location.hash && $(window.location.hash).length) {
+                var newTop = $(window.location.hash).offset().top - $('#topbar').height() - 5;
+                $(window).scrollTop(newTop);
+              }
+            }
+            $(window).bind('hashchange', function() {
+              maybeScrollToHash();
+            });
+            // Scroll now too in case we had opened the page on a hash, but wait 1 ms because some browsers
+            // will try to do *their* initial scroll after running the onReady handler.
+            setTimeout(function() { maybeScrollToHash(); }, 1)
+          })
+        </script>
+
+    </body>
+</html>

Added: dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/job-scheduling.html
==============================================================================
--- dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/job-scheduling.html (added)
+++ dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/job-scheduling.html Wed Sep 25 00:14:43 2013
@@ -0,0 +1,354 @@
+<!DOCTYPE html>
+<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
+<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
+<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
+    <head>
+        <meta charset="utf-8">
+        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+        <title>Job Scheduling - Spark 0.8.0 Documentation</title>
+        <meta name="description" content="">
+
+        <link rel="stylesheet" href="css/bootstrap.min.css">
+        <style>
+            body {
+                padding-top: 60px;
+                padding-bottom: 40px;
+            }
+        </style>
+        <meta name="viewport" content="width=device-width">
+        <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
+        <link rel="stylesheet" href="css/main.css">
+
+        <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
+        
+        <link rel="stylesheet" href="css/pygments-default.css">
+
+        <!-- Google analytics script -->
+        <script type="text/javascript">
+          /*
+          var _gaq = _gaq || [];
+          _gaq.push(['_setAccount', 'UA-32518208-1']);
+          _gaq.push(['_trackPageview']);
+
+          (function() {
+            var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+            ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+            var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+          })();
+          */
+        </script>
+
+    </head>
+    <body>
+        <!--[if lt IE 7]>
+            <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
+        <![endif]-->
+
+        <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
+
+        <div class="navbar navbar-fixed-top" id="topbar">
+            <div class="navbar-inner">
+                <div class="container">
+                    <div class="brand"><a href="index.html">
+                      <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">0.8.0</span>
+                    </div>
+                    <ul class="nav">
+                        <!--TODO(andyk): Add class="active" attribute to li some how.-->
+                        <li><a href="index.html">Overview</a></li>
+
+                        <li class="dropdown">
+                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
+                            <ul class="dropdown-menu">
+                                <li><a href="quick-start.html">Quick Start</a></li>
+                                <li><a href="scala-programming-guide.html">Spark in Scala</a></li>
+                                <li><a href="java-programming-guide.html">Spark in Java</a></li>
+                                <li><a href="python-programming-guide.html">Spark in Python</a></li>
+                                <li class="divider"></li>
+                                <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
+                                <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
+                                <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
+                            </ul>
+                        </li>
+                        
+                        <li class="dropdown">
+                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
+                            <ul class="dropdown-menu">
+                                <li><a href="api/core/index.html">Spark Core for Java/Scala</a></li>
+                                <li><a href="api/pyspark/index.html">Spark Core for Python</a></li>
+                                <li class="divider"></li>
+                                <li><a href="api/streaming/index.html">Spark Streaming</a></li>
+                                <li><a href="api/mllib/index.html">MLlib (Machine Learning)</a></li>
+                                <li><a href="api/bagel/index.html">Bagel (Pregel on Spark)</a></li>
+                            </ul>
+                        </li>
+
+                        <li class="dropdown">
+                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
+                            <ul class="dropdown-menu">
+                                <li><a href="cluster-overview.html">Overview</a></li>
+                                <li><a href="ec2-scripts.html">Amazon EC2</a></li>
+                                <li><a href="spark-standalone.html">Standalone Mode</a></li>
+                                <li><a href="running-on-mesos.html">Mesos</a></li>
+                                <li><a href="running-on-yarn.html">YARN</a></li>
+                            </ul>
+                        </li>
+
+                        <li class="dropdown">
+                            <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
+                            <ul class="dropdown-menu">
+                                <li><a href="configuration.html">Configuration</a></li>
+                                <li><a href="monitoring.html">Monitoring</a></li>
+                                <li><a href="tuning.html">Tuning Guide</a></li>
+                                <li><a href="hadoop-third-party-distributions.html">Running with CDH/HDP</a></li>
+                                <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
+                                <li><a href="job-scheduling.html">Job Scheduling</a></li>
+                                <li class="divider"></li>
+                                <li><a href="building-with-maven.html">Building Spark with Maven</a></li>
+                                <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
+                            </ul>
+                        </li>
+                    </ul>
+                    <!--<p class="navbar-text pull-right"><span class="version-text">v0.8.0</span></p>-->
+                </div>
+            </div>
+        </div>
+
+        <div class="container" id="content">
+          <h1 class="title">Job Scheduling</h1>
+
+          <ul id="markdown-toc">
+  <li><a href="#overview">Overview</a></li>
+  <li><a href="#scheduling-across-applications">Scheduling Across Applications</a></li>
+  <li><a href="#scheduling-within-an-application">Scheduling Within an Application</a>    <ul>
+      <li><a href="#fair-scheduler-pools">Fair Scheduler Pools</a></li>
+      <li><a href="#default-behavior-of-pools">Default Behavior of Pools</a></li>
+      <li><a href="#configuring-pool-properties">Configuring Pool Properties</a></li>
+    </ul>
+  </li>
+</ul>
+
+<h1 id="overview">Overview</h1>
+
+<p>Spark has several facilities for scheduling resources between computations. First, recall that, as described
+in the <a href="cluster-overview.html">cluster mode overview</a>, each Spark application (instance of SparkContext)
+runs an independent set of executor processes. The cluster managers that Spark runs on provide
+facilities for <a href="#scheduling-across-applications">scheduling across applications</a>. Second,
+<em>within</em> each Spark application, multiple &#8220;jobs&#8221; (Spark actions) may be running concurrently
+if they were submitted by different threads. This is common if your application is serving requests
+over the network; for example, the <a href="http://shark.cs.berkeley.edu">Shark</a> server works this way. Spark
+includes a <a href="#scheduling-within-an-application">fair scheduler</a> to schedule resources within each SparkContext.</p>
+
+<h1 id="scheduling-across-applications">Scheduling Across Applications</h1>
+
+<p>When running on a cluster, each Spark application gets an independent set of executor JVMs that only
+run tasks and store data for that application. If multiple users need to share your cluster, there are
+different options to manage allocation, depending on the cluster manager.</p>
+
+<p>The simplest option, available on all cluster managers, is <em>static partitioning</em> of resources. With
+this approach, each application is given a maximum amount of resources it can use, and holds onto them
+for its whole duration. This is the approach used in Spark&#8217;s <a href="spark-standalone.html">standalone</a>
+and <a href="running-on-yarn.html">YARN</a> modes, as well as the
+<a href="running-on-mesos.html#mesos-run-modes">coarse-grained Mesos mode</a>.
+Resource allocation can be configured as follows, based on the cluster type:</p>
+
+<ul>
+  <li><strong>Standalone mode:</strong> By default, applications submitted to the standalone mode cluster will run in
+FIFO (first-in-first-out) order, and each application will try to use all available nodes. You can limit
+the number of nodes an application uses by setting the <code>spark.cores.max</code> system property in it. This
+will allow multiple users/applications to run concurrently. For example, you might launch a long-running
+server that uses 10 cores, and allow users to launch shells that use 20 cores each.
+Finally, in addition to controlling cores, each application&#8217;s <code>spark.executor.memory</code> setting controls
+its memory use.</li>
+  <li><strong>Mesos:</strong> To use static partitioning on Mesos, set the <code>spark.mesos.coarse</code> system property to <code>true</code>,
+and optionally set <code>spark.cores.max</code> to limit each application&#8217;s resource share as in the standalone mode.
+You should also set <code>spark.executor.memory</code> to control the executor memory.</li>
+  <li><strong>YARN:</strong> The <code>--num-workers</code> option to the Spark YARN client controls how many workers it will allocate
+on the cluster, while <code>--worker-memory</code> and <code>--worker-cores</code> control the resources per worker.</li>
+</ul>
+
+<p>A second option available on Mesos is <em>dynamic sharing</em> of CPU cores. In this mode, each Spark application
+still has a fixed and independent memory allocation (set by <code>spark.executor.memory</code>), but when the
+application is not running tasks on a machine, other applications may run tasks on those cores. This mode
+is useful when you expect large numbers of not overly active applications, such as shell sessions from
+separate users. However, it comes with a risk of less predictable latency, because it may take a while for
+an application to gain back cores on one node when it has work to do. To use this mode, simply use a
+<code>mesos://</code> URL without setting <code>spark.mesos.coarse</code> to true.</p>
+
+<p>Note that none of the modes currently provide memory sharing across applications. If you would like to share
+data this way, we recommend running a single server application that can serve multiple requests by querying
+the same RDDs. For example, the <a href="http://shark.cs.berkeley.edu">Shark</a> JDBC server works this way for SQL
+queries. In future releases, in-memory storage systems such as <a href="http://tachyon-project.org">Tachyon</a> will
+provide another approach to share RDDs.</p>
+
+<h1 id="scheduling-within-an-application">Scheduling Within an Application</h1>
+
+<p>Inside a given Spark application (SparkContext instance), multiple parallel jobs can run simultaneously if
+they were submitted from separate threads. By &#8220;job&#8221;, in this section, we mean a Spark action (e.g. <code>save</code>,
+<code>collect</code>) and any tasks that need to run to evaluate that action. Spark&#8217;s scheduler is fully thread-safe
+and supports this use case to enable applications that serve multiple requests (e.g. queries for
+multiple users).</p>
+
+<p>By default, Spark&#8217;s scheduler runs jobs in FIFO fashion. Each job is divided into &#8220;stages&#8221; (e.g. map and
+reduce phases), and the first job gets priority on all available resources while its stages have tasks to
+launch, then the second job gets priority, etc. If the jobs at the head of the queue don&#8217;t need to use
+the whole cluster, later jobs can start to run right away, but if the jobs at the head of the queue are
+large, then later jobs may be delayed significantly.</p>
+
+<p>Starting in Spark 0.8, it is also possible to configure fair sharing between jobs. Under fair sharing,
+Spark assigns tasks between jobs in a &#8220;round robin&#8221; fashion, so that all jobs get a roughly equal share
+of cluster resources. This means that short jobs submitted while a long job is running can start receiving
+resources right away and still get good response times, without waiting for the long job to finish. This
+mode is best for multi-user settings.</p>
+
+<p>To enable the fair scheduler, simply set the <code>spark.scheduler.mode</code> system property to <code>FAIR</code> before creating
+a SparkContext:</p>
+
+<div class="highlight"><pre><code class="scala"><span class="nc">System</span><span class="o">.</span><span class="n">setProperty</span><span class="o">(</span><span class="s">&quot;spark.scheduler.mode&quot;</span><span class="o">,</span> <span class="s">&quot;FAIR&quot;</span><span class="o">)</span>
+</code></pre></div>
+
+<h2 id="fair-scheduler-pools">Fair Scheduler Pools</h2>
+
+<p>The fair scheduler also supports grouping jobs into <em>pools</em>, and setting different scheduling options
+(e.g. weight) for each pool. This can be useful to create a &#8220;high-priority&#8221; pool for more important jobs,
+for example, or to group the jobs of each user together and give <em>users</em> equal shares regardless of how
+many concurrent jobs they have instead of giving <em>jobs</em> equal shares. This approach is modeled after the
+<a href="http://hadoop.apache.org/docs/stable/fair_scheduler.html">Hadoop Fair Scheduler</a>.</p>
+
+<p>Without any intervention, newly submitted jobs go into a <em>default pool</em>, but jobs&#8217; pools can be set by
+adding the <code>spark.scheduler.pool</code> &#8220;local property&#8221; to the SparkContext in the thread that&#8217;s submitting them.
+This is done as follows:</p>
+
+<div class="highlight"><pre><code class="scala"><span class="c1">// Assuming context is your SparkContext variable</span>
+<span class="n">context</span><span class="o">.</span><span class="n">setLocalProperty</span><span class="o">(</span><span class="s">&quot;spark.scheduler.pool&quot;</span><span class="o">,</span> <span class="s">&quot;pool1&quot;</span><span class="o">)</span>
+</code></pre></div>
+
+<p>After setting this local property, <em>all</em> jobs submitted within this thread (by calls in this thread
+to <code>RDD.save</code>, <code>count</code>, <code>collect</code>, etc.) will use this pool name. The setting is per-thread to make
+it easy to have a thread run multiple jobs on behalf of the same user. If you&#8217;d like to clear the
+pool that a thread is associated with, simply call:</p>
+
+<div class="highlight"><pre><code class="scala"><span class="n">context</span><span class="o">.</span><span class="n">setLocalProperty</span><span class="o">(</span><span class="s">&quot;spark.scheduler.pool&quot;</span><span class="o">,</span> <span class="kc">null</span><span class="o">)</span>
+</code></pre></div>
+
+<h2 id="default-behavior-of-pools">Default Behavior of Pools</h2>
+
+<p>By default, each pool gets an equal share of the cluster (also equal in share to each job in the default
+pool), but inside each pool, jobs run in FIFO order. For example, if you create one pool per user, this
+means that each user will get an equal share of the cluster, and that each user&#8217;s queries will run in
+order instead of later queries taking resources from that user&#8217;s earlier ones.</p>
+
+<h2 id="configuring-pool-properties">Configuring Pool Properties</h2>
+
+<p>Specific pools&#8217; properties can also be modified through a configuration file. Each pool supports three
+properties:</p>
+
+<ul>
+  <li><code>schedulingMode</code>: This can be FIFO or FAIR, to control whether jobs within the pool queue up behind
+each other (the default) or share the pool&#8217;s resources fairly.</li>
+  <li><code>weight</code>: This controls the pool&#8217;s share of the cluster relative to other pools. By default, all pools
+have a weight of 1. If you give a specific pool a weight of 2, for example, it will get twice as many
+resources as other active pools. Setting a high weight such as 1000 also makes it possible to implement
+<em>priority</em> between pools&#8212;in essence, the weight-1000 pool will always get to launch tasks first
+whenever it has jobs active.</li>
+  <li><code>minShare</code>: Apart from an overall weight, each pool can be given a <em>minimum share</em> (as a number of
+CPU cores) that the administrator would like it to have. The fair scheduler always attempts to meet
+all active pools&#8217; minimum shares before redistributing extra resources according to the weights.
+The <code>minShare</code> property can therefore be another way to ensure that a pool can always get up to a
+certain number of resources (e.g. 10 cores) quickly without giving it a high priority for the rest
+of the cluster. By default, each pool&#8217;s <code>minShare</code> is 0.</li>
+</ul>
+
+<p>The pool properties can be set by creating an XML file, similar to <code>conf/fairscheduler.xml.template</code>,
+and setting the <code>spark.scheduler.allocation.file</code> property:</p>
+
+<div class="highlight"><pre><code class="scala"><span class="nc">System</span><span class="o">.</span><span class="n">setProperty</span><span class="o">(</span><span class="s">&quot;spark.scheduler.allocation.file&quot;</span><span class="o">,</span> <span class="s">&quot;/path/to/file&quot;</span><span class="o">)</span>
+</code></pre></div>
+
+<p>The format of the XML file is simply a <code>&lt;pool&gt;</code> element for each pool, with different elements
+within it for the various settings. For example:</p>
+
+<div class="highlight"><pre><code class="xml"><span class="cp">&lt;?xml version=&quot;1.0&quot;?&gt;</span>
+<span class="nt">&lt;allocations&gt;</span>
+  <span class="nt">&lt;pool</span> <span class="na">name=</span><span class="s">&quot;production&quot;</span><span class="nt">&gt;</span>
+    <span class="nt">&lt;schedulingMode&gt;</span>FAIR<span class="nt">&lt;/schedulingMode&gt;</span>
+    <span class="nt">&lt;weight&gt;</span>1<span class="nt">&lt;/weight&gt;</span>
+    <span class="nt">&lt;minShare&gt;</span>2<span class="nt">&lt;/minShare&gt;</span>
+  <span class="nt">&lt;/pool&gt;</span>
+  <span class="nt">&lt;pool</span> <span class="na">name=</span><span class="s">&quot;test&quot;</span><span class="nt">&gt;</span>
+    <span class="nt">&lt;schedulingMode&gt;</span>FIFO<span class="nt">&lt;/schedulingMode&gt;</span>
+    <span class="nt">&lt;weight&gt;</span>2<span class="nt">&lt;/weight&gt;</span>
+    <span class="nt">&lt;minShare&gt;</span>3<span class="nt">&lt;/minShare&gt;</span>
+  <span class="nt">&lt;/pool&gt;</span>
+<span class="nt">&lt;/allocations&gt;</span>
+</code></pre></div>
+
+<p>A full example is also available in <code>conf/fairscheduler.xml.template</code>. Note that any pools not
+configured in the XML file will simply get default values for all settings (scheduling mode FIFO,
+weight 1, and minShare 0).</p>
+
+            <!-- Main hero unit for a primary marketing message or call to action -->
+            <!--<div class="hero-unit">
+                <h1>Hello, world!</h1>
+                <p>This is a template for a simple marketing or informational website. It includes a large callout called the hero unit and three supporting pieces of content. Use it as a starting point to create something more unique.</p>
+                <p><a class="btn btn-primary btn-large">Learn more &raquo;</a></p>
+            </div>-->
+
+            <!-- Example row of columns -->
+            <!--<div class="row">
+                <div class="span4">
+                    <h2>Heading</h2>
+                    <p>Donec id elit non mi porta gravida at eget metus. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus. Etiam porta sem malesuada magna mollis euismod. Donec sed odio dui. </p>
+                    <p><a class="btn" href="#">View details &raquo;</a></p>
+                </div>
+                <div class="span4">
+                    <h2>Heading</h2>
+                    <p>Donec id elit non mi porta gravida at eget metus. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus. Etiam porta sem malesuada magna mollis euismod. Donec sed odio dui. </p>
+                    <p><a class="btn" href="#">View details &raquo;</a></p>
+               </div>
+                <div class="span4">
+                    <h2>Heading</h2>
+                    <p>Donec sed odio dui. Cras justo odio, dapibus ac facilisis in, egestas eget quam. Vestibulum id ligula porta felis euismod semper. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus.</p>
+                    <p><a class="btn" href="#">View details &raquo;</a></p>
+                </div>
+            </div>
+
+            <hr>-->
+
+            <footer>
+              <hr>
+              <p style="text-align: center; vertical-align: middle; color: #999;">
+                Apache Spark is an effort undergoing incubation at the Apache Software Foundation.
+                <a href="http://incubator.apache.org">
+                  <img style="margin-left: 20px;" src="img/incubator-logo.png" alt="Apache Incubator" />
+                </a>
+              </p>
+            </footer>
+
+        </div> <!-- /container -->
+
+        <script src="js/vendor/jquery-1.8.0.min.js"></script>
+        <script src="js/vendor/bootstrap.min.js"></script>
+        <script src="js/main.js"></script>
+        
+        <!-- A script to fix internal hash links because we have an overlapping top bar.
+             Based on https://github.com/twitter/bootstrap/issues/193#issuecomment-2281510 -->
+        <script>
+          $(function() {
+            function maybeScrollToHash() {
+              if (window.location.hash && $(window.location.hash).length) {
+                var newTop = $(window.location.hash).offset().top - $('#topbar').height() - 5;
+                $(window).scrollTop(newTop);
+              }
+            }
+            $(window).bind('hashchange', function() {
+              maybeScrollToHash();
+            });
+            // Scroll now too in case we had opened the page on a hash, but wait 1 ms because some browsers
+            // will try to do *their* initial scroll after running the onReady handler.
+            setTimeout(function() { maybeScrollToHash(); }, 1)
+          })
+        </script>
+
+    </body>
+</html>

Added: dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/js/main.js
==============================================================================
--- dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/js/main.js (added)
+++ dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/js/main.js Wed Sep 25 00:14:43 2013
@@ -0,0 +1 @@
+

Propchange: dev/incubator/spark/spark-0.8.0-incubating-rc6-docs/js/main.js
------------------------------------------------------------------------------
    svn:executable = *