Posted to commits@zookeeper.apache.org by ph...@apache.org on 2008/09/25 03:05:15 UTC

svn commit: r698787 [3/4] - in /hadoop/zookeeper/trunk: docs/ docs/images/ src/docs/ src/docs/src/documentation/content/xdocs/ src/docs/src/documentation/resources/images/

Modified: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/index.xml
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/index.xml?rev=698787&r1=698786&r2=698787&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/index.xml (original)
+++ hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/index.xml Wed Sep 24 18:05:14 2008
@@ -20,20 +20,28 @@
 <document>
   
   <header>
-    <title>ZooKeeper Documentation</title>
+    <title>ZooKeeper: Because Coordinating Distributed Systems is a Zoo</title>
   </header>
   
   <body>
     <p>
-    The following documents provide concepts and procedures that will help you 
-    get started using ZooKeeper. If you have more questions, you can ask the 
-    <a href="ext:lists">mailing list</a> or browse the archives.
+ZooKeeper is a high-performance coordination service for distributed applications. It exposes common services - such as naming, configuration management, synchronization, and group services - in a simple interface so you don't have to write them from scratch. You can use it off-the-shelf to implement consensus, group management, leader election, and presence protocols. And you can build on it for your own specific needs.
+</p>
+
+<p>
+The following documents provide concepts and procedures to get you started using ZooKeeper. If you have more questions, please ask the <a href="ext:lists">mailing list</a> or browse the archives.
     </p>
     <ul>
-      <li><a href="ext:api/started">Getting Started</a></li>
-      <li><a href="ext:api/index">API Docs</a></li>
-      <li><a href="ext:wiki">Wiki</a></li>
-      <li><a href="ext:faq">FAQ</a></li>
+
+      <li><a href="zookeeperOver.html">Overview</a> - a bird's eye view of ZooKeeper, including design concepts and architecture</li>
+      <li><a href="zookeeperStarted.html">Getting Started</a> - a tutorial-style guide for developers to install, run, and program to ZooKeeper</li>
+      <li><a href="zookeeperProgrammers.html">Programmer's Guide</a> - an application developer's guide to ZooKeeper</li>
+      <li><a href="recipes.html">ZooKeeper Recipes</a> - a set of common, higher level solutions using ZooKeeper</li>
+      <li><a href="zookeeperAdmin.html">Administrator's Guide</a> - a guide for system administrators and anyone else who might deploy Zookeeer</li>
+      <li><a href="ext:api/index">API Docs</a> - the technical reference to ZooKeeper APIs</li>
+      <li><a href="ext:wiki">Wiki</a> - miscellaneous, informal ZooKeeper documentation, in Wiki format</li>
+      <li><a href="ext:faq">FAQ</a> - frequently asked questions</li>    
+
     </ul>
   </body>
   

Added: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/recipes.xml
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/recipes.xml?rev=698787&view=auto
==============================================================================
--- hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/recipes.xml (added)
+++ hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/recipes.xml Wed Sep 24 18:05:14 2008
@@ -0,0 +1,623 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Copyright 2002-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
+"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<book id="ar_Recipes">
+  <title>ZooKeeper Recipes and Solutions</title>
+
+  <bookinfo>
+    <legalnotice>
+      <para>Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License. You may
+      obtain a copy of the License at <ulink
+      url="http://www.apache.org/licenses/LICENSE-2.0">http://www.apache.org/licenses/LICENSE-2.0</ulink>.</para>
+
+      <para>Unless required by applicable law or agreed to in writing,
+      software distributed under the License is distributed on an "AS IS"
+      BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied. See the License for the specific language governing permissions
+      and limitations under the License.</para>
+    </legalnotice>
+
+    <abstract>
+      <para>This guide contains pseudocode and guidelines for using ZooKeeper to
+      solve common problems in distributed application coordination. It
+      discusses such problems as event handlers, queues, and locks.</para>
+
+      <para>$Revision: 1.6 $ $Date: 2008/09/19 03:46:18 $</para>
+    </abstract>
+  </bookinfo>
+
+  <chapter id="ch_recipes">
+    <title>A Guide to Creating Higher-level Constructs with ZooKeeper</title>
+
+    <para>In this article, you'll find guidelines for using
+    ZooKeeper to implement higher-order functions. All of them are conventions
+    implemented at the client and do not require special support from
+    ZooKeeper. Hopefully the community will capture these conventions in client-side libraries 
+    to ease their use and to encourage standardization.</para>
+
+    <para>One of the most interesting things about ZooKeeper is that even
+    though ZooKeeper uses <emphasis>asynchronous</emphasis> notifications, you
+    can use it to build <emphasis>synchronous</emphasis> consistency
+    primitives, such as queues and locks. As you will see, this is possible
+    because ZooKeeper imposes an overall order on updates, and has mechanisms
+    to expose this ordering.</para>
+
+    <para>Note that the recipes below attempt to employ best practices. In
+    particular, they avoid polling, timers or anything else that would result
+    in a "herd effect", causing bursts of traffic and limiting
+    scalability.</para>
+
+    <para>There are many useful functions that can be imagined but aren't
+    included here - revocable read-write priority locks, as just one example.
+    And some of the constructs mentioned here - locks, in particular -
+    illustrate certain points, even though you may find other constructs, such
+    as event handles or queues, a more practical means of performing the same
+    function. In general, the examples in this section are designed to
+    stimulate thought.</para>
+
+
+  <section id="sc_outOfTheBox">
+    <title>Out of the Box Applications: Name Service, Configuration, Group
+    Membership</title>
+
+    <para>Name service and configuration are two of the primary applications
+    of ZooKeeper. These two functions are provided directly by the ZooKeeper
+    API.</para>
+
+    <para>Another function directly provided by ZooKeeper is <emphasis>group
+    membership</emphasis>. The group is represented by a node. Members of the
+    group create ephemeral nodes under the group node. Nodes of the members
+    that fail abnormally will be removed automatically when ZooKeeper detects
+    the failure.</para>
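+
+    <para>As a minimal illustration, here is how a member might join a group
+    using the Java client. The connect string, session timeout, group path
+    ("/mygroup"), and member name are assumptions of this sketch, and the
+    group node is assumed to already exist; this is a sketch, not a hardened
+    implementation.</para>
+
+<programlisting>
+import java.util.List;
+import org.apache.zookeeper.*;
+
+public class GroupMember {
+    public static void main(String[] args) throws Exception {
+        // Connect with a no-op watcher; 3000 ms session timeout.
+        ZooKeeper zk = new ZooKeeper("localhost:2181", 3000, new Watcher() {
+            public void process(WatchedEvent event) { }
+        });
+
+        // Join: an ephemeral child that disappears if this session dies.
+        zk.create("/mygroup/" + args[0], new byte[0],
+                ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
+
+        // List current members and leave a watch for membership changes.
+        List members = zk.getChildren("/mygroup", true);
+        System.out.println("current members: " + members);
+    }
+}
+</programlisting>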
+  </section>
+
+  <section id="sc_recipes_eventHandles">
+    <title>Barriers</title>
+
+    <para>Distributed systems use <firstterm>barriers</firstterm> to block
+    processing of a set of nodes until a condition is met, at which time all
+    the nodes are allowed to proceed. Barriers are implemented in ZooKeeper by
+    designating a barrier node. The barrier is in place if the barrier node
+    exists. Here's the pseudo code:</para>
+
+    <orderedlist>
+      <listitem>
+        <para>Client calls the ZooKeeper API's <emphasis
+        role="bold">exists()</emphasis> function on the barrier node, with
+        <emphasis>watch</emphasis> set to true.</para>
+      </listitem>
+
+      <listitem>
+        <para>If <emphasis role="bold">exists()</emphasis> returns false, the
+        barrier is gone and the client proceeds.</para>
+      </listitem>
+
+      <listitem>
+        <para>Else, if <emphasis role="bold">exists()</emphasis> returns true,
+        the client waits for a watch event from ZooKeeper for the barrier
+        node.</para>
+      </listitem>
+
+      <listitem>
+        <para>When the watch event is triggered, the client reissues the
+        <emphasis role="bold">exists( )</emphasis> call, again waiting until
+        the barrier node is removed.</para>
+      </listitem>
+    </orderedlist>
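+
+    <para>Below is a minimal sketch of this loop in Java. It assumes the
+    handle was opened with a default watcher that calls notifyAll() on a
+    shared monitor whenever a watch event fires; that arrangement, and the
+    session parameters, are assumptions of the sketch rather than part of
+    the recipe.</para>
+
+<programlisting>
+import org.apache.zookeeper.*;
+
+public class Barrier implements Watcher {
+    private final Object mutex = new Object();
+    private final ZooKeeper zk;
+
+    public Barrier(String hosts) throws Exception {
+        // "this" is the default watcher for the whole session.
+        zk = new ZooKeeper(hosts, 3000, this);
+    }
+
+    public void process(WatchedEvent event) {
+        synchronized (mutex) { mutex.notifyAll(); }   // wake any waiter
+    }
+
+    // Steps 1-4: re-check exists() (re-arming the watch each time)
+    // until the barrier node is gone.
+    public void waitForBarrier(String barrier)
+            throws KeeperException, InterruptedException {
+        synchronized (mutex) {
+            while (zk.exists(barrier, true) != null) {
+                mutex.wait();   // woken by the NodeDeleted event
+            }
+        }
+    }
+}
+</programlisting>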
+
+    <para><remark>[tbd: maybe an illustration would be nice for each of the
+    recipes?]</remark></para>
+
+    <section id="sc_doubleBarriers">
+      <title>Double Barriers</title>
+
+      <para>Double barriers enable clients to synchronize the beginning and
+      the end of a computation. When enough processes have joined the barrier,
+      processes start their computation and leave the barrier once they have
+      finished. This recipe shows how to use a ZooKeeper node as a
+      barrier.</para>
+
+      <para>The pseudo code in this recipe represents the barrier node as
+      <emphasis>b</emphasis>. Every client process <emphasis>p</emphasis>
+      registers with the barrier node on entry and unregisters when it is
+      ready to leave. A client registers with the barrier node via the <emphasis
+      role="bold">Enter</emphasis> procedure below; it waits until
+      <emphasis>x</emphasis> client processes register before proceeding with
+      the computation. (The <emphasis>x</emphasis> here is up to you to
+      determine for your system.)</para>
+
+      <para><informaltable colsep="0" frame="none" rowsep="0">
+          <tgroup cols="2">
+            <tbody>
+              <row>
+                <entry align="center"><emphasis
+                role="bold">Enter</emphasis></entry>
+
+                <entry align="center"><emphasis
+                role="bold">Leave</emphasis></entry>
+              </row>
+
+              <row>
+                <entry align="left"><orderedlist>
+                    <listitem>
+                      <para>Create a name <emphasis><emphasis>n</emphasis> =
+                      <emphasis>b</emphasis>+“/”+<emphasis>p</emphasis></emphasis></para>
+                    </listitem>
+
+                    <listitem>
+                      <para>Set watch: <emphasis
+                      role="bold">exists(<emphasis>b</emphasis> + ‘‘/ready’’,
+                      true)</emphasis></para>
+                    </listitem>
+
+                    <listitem>
+                      <para>Create child: <emphasis role="bold">create(
+                      <emphasis>n</emphasis>, EPHEMERAL)</emphasis></para>
+                    </listitem>
+
+                    <listitem>
+                      <para><emphasis role="bold">L = getChildren(b,
+                      false)</emphasis></para>
+                    </listitem>
+
+                    <listitem>
+                      <para>if fewer children in L than<emphasis>
+                      x</emphasis>, wait for watch event <remark>[tbd: how do
+                      you wait?]</remark></para>
+                    </listitem>
+
+                    <listitem>
+                      <para>else <emphasis role="bold">create(b + ‘‘/ready’’,
+                      REGULAR)</emphasis></para>
+                    </listitem>
+                  </orderedlist></entry>
+
+                <entry><orderedlist>
+                    <listitem>
+                      <para><emphasis role="bold">L = getChildren(b,
+                      false)</emphasis></para>
+                    </listitem>
+
+                    <listitem>
+                      <para>if no children, exit</para>
+                    </listitem>
+
+                    <listitem>
+                      <para>if <emphasis>p</emphasis> is the only process node in
+                      L, delete(n) and exit</para>
+                    </listitem>
+
+                    <listitem>
+                      <para>if <emphasis>p</emphasis> is the lowest process
+                      node in L, wait on highest process node in L</para>
+                    </listitem>
+
+                    <listitem>
+                      <para>else <emphasis
+                      role="bold">delete(<emphasis>n</emphasis>) </emphasis>if
+                      still exists and wait on lowest process node in L</para>
+                    </listitem>
+
+                    <listitem>
+                      <para>goto 1</para>
+                    </listitem>
+                  </orderedlist></entry>
+              </row>
+            </tbody>
+          </tgroup>
+        </informaltable>On entering, all processes watch on a ready node and
+      create an ephemeral node as a child of the barrier node. Each process
+      but the last enters the barrier and waits for the ready node to appear
+      at line 5. The process that creates the xth node, the last process, will
+      see x nodes in the list of children and create the ready node, waking up
+      the other processes. Note that waiting processes wake up only when it is
+      time to exit, so waiting is efficient.</para>
+
+      <para>On exit, you can't use a flag such as <emphasis>ready</emphasis>
+      because you are watching for process nodes to go away. By using
+      ephemeral nodes, processes that fail after the barrier has been entered
+      do not prevent correct processes from finishing. When processes are
+      ready to leave, they need to delete their process nodes and wait for all
+      other processes to do the same.</para>
+
+      <para>Processes exit when there are no process nodes left as children of
+      <emphasis>b</emphasis>. However, as an efficiency, you can use the
+      lowest process node as the ready flag. All other processes that are
+      ready to exit watch for the lowest existing process node to go away, and
+      the owner of the lowest process watches for any other process node
+      (picking the highest for simplicity) to go away. This means that only a
+      single process wakes up on each node deletion except for the last node,
+      which wakes up everyone when it is removed.</para>
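+
+      <para>A rough Java rendering of the <emphasis>Enter</emphasis> column
+      might look as follows, reusing the handle-and-monitor arrangement from
+      the barrier sketch above (fields <emphasis>zk</emphasis> and
+      <emphasis>mutex</emphasis>); mapping the pseudocode's REGULAR flag to a
+      persistent znode is an assumption of this sketch.</para>
+
+<programlisting>
+// Fields zk and mutex as in the barrier sketch above.
+void enter(String b, String p, int x)
+        throws KeeperException, InterruptedException {
+    String n = b + "/" + p;
+    synchronized (mutex) {
+        zk.exists(b + "/ready", true);              // step 2: set watch
+        zk.create(n, new byte[0],                   // step 3: register
+                ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
+        if (zk.getChildren(b, false).size() &lt; x) {   // steps 4-5
+            while (zk.exists(b + "/ready", true) == null) {
+                mutex.wait();                       // wait for "ready"
+            }
+        } else {
+            zk.create(b + "/ready", new byte[0],    // step 6: wake everyone
+                    ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
+        }
+    }
+}
+</programlisting>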
+    </section>
+  </section>
+
+  <section id="sc_recipes_Queues">
+    <title>Queues</title>
+
+    <para>Distributed queues are a common data structure. To implement a
+    distributed queue in ZooKeeper, first designate a znode to hold the queue,
+    the queue node. The distributed clients put something into the queue by
+    calling create() with a pathname ending in "queue-", with the
+    <emphasis>sequence</emphasis> and <emphasis>ephemeral</emphasis> flags in
+    the create() call set to true. Because the <emphasis>sequence</emphasis>
+    flag is set, the new pathnames will have the form
+    _path-to-queue-node_/queue-X, where X is a monotonically increasing number. A
+    client that wants to remove an element from the queue calls ZooKeeper's <emphasis
+    role="bold">getChildren( )</emphasis> function, with
+    <emphasis>watch</emphasis> set to true on the queue node, and begins
+    processing nodes with the lowest number. The client does not need to issue
+    another <emphasis role="bold">getChildren( )</emphasis> until it exhausts
+    the list obtained from the first <emphasis role="bold">getChildren(
+    )</emphasis> call. If there are no children in the queue node, the
+    reader waits for a watch notification to check the queue again.</para>
+
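+    <para>A consumer loop along these lines might be sketched in Java as
+    follows. ZooKeeper zero-pads sequence suffixes, so a lexicographic sort
+    yields queue order; the handle-and-monitor arrangement (fields zk and
+    mutex) is the same assumption as in the barrier sketch above.</para>
+
+<programlisting>
+// Fields zk and mutex as in the barrier sketch above; also
+// imports java.util.Collections and java.util.List.
+byte[] take(String queue)
+        throws KeeperException, InterruptedException {
+    while (true) {
+        List children;
+        synchronized (mutex) {
+            // Watch the queue node only while it is empty.
+            while ((children = zk.getChildren(queue, true)).isEmpty()) {
+                mutex.wait();           // woken when a child appears
+            }
+        }
+        Collections.sort(children);     // zero-padded: sorts by sequence
+        for (Object child : children) {
+            String path = queue + "/" + child;
+            try {
+                byte[] data = zk.getData(path, false, null);
+                zk.delete(path, -1);    // claim the element
+                return data;
+            } catch (KeeperException.NoNodeException e) {
+                // Another consumer claimed it; try the next child.
+            }
+        }
+    }
+}
+</programlisting>
+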
+    <section id="sc_recipes_priorityQueues">
+      <title>Priority Queues</title>
+
+      <para>To implement a priority queue, you need only make two simple
+      changes to the generic <ulink url="#sc_recipes_Queues">queue
+      recipe</ulink> . First, to add to a queue, the pathname ends with
+      "queue-YY" where YY is the priority of the element with lower numbers
+      representing higher priority (just like UNIX). Second, when removing
+      from the queue a client uses an up-to-date children list meaning that
+      the client will invalidate previously obtained children lists if a watch
+      notification triggers for the queue node.</para>
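+
+      <para>For example, an element with priority YY might be enqueued as
+      below; the two-digit zero padding, which keeps lexicographic order
+      aligned with priority, is an assumption of this sketch.</para>
+
+<programlisting>
+// Field zk as above; data holds the element payload.
+String path = zk.create(queue + "/queue-"
+        + String.format("%02d", priority) + "-", data,
+        ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL);
+</programlisting>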
+    </section>
+  </section>
+
+  <section id="sc_recipes_Locks">
+    <title>Locks</title>
+
+    <para>Fully distributed locks that are globally synchronous - meaning that
+    at any snapshot in time no two clients think they hold the same lock - can
+    be implemented using ZooKeeper. As with priority queues, first define
+    a lock node.</para>
+
+    <para>Clients wishing to obtain a lock do the following:</para>
+
+    <orderedlist>
+      <listitem>
+        <para>Call <emphasis role="bold">create( )</emphasis> with a pathname
+        of "_locknode_/lock-" and the <emphasis>sequence</emphasis> and
+        <emphasis>ephemeral</emphasis> flags set.</para>
+      </listitem>
+
+      <listitem>
+        <para>Call <emphasis role="bold">getChildren( )</emphasis> on the lock
+        node <emphasis>without</emphasis> setting the watch flag (this is
+        important to avoid the herd effect).</para>
+      </listitem>
+
+      <listitem>
+        <para>If the pathname created in step <emphasis
+        role="bold">1</emphasis> has the lowest sequence number suffix, the
+        client has the lock and the client exits the protocol.</para>
+      </listitem>
+
+      <listitem>
+        <para>The client calls <emphasis role="bold">exists( )</emphasis> with
+        the watch flag set on the path in the lock directory with the next
+        lowest sequence number.</para>
+      </listitem>
+
+      <listitem>
+        <para>If <emphasis role="bold">exists( )</emphasis> returns false, go
+        to step <emphasis role="bold">2</emphasis>. Otherwise, wait for a
+        notification for the pathname from the previous step before going to
+        step <emphasis role="bold">2</emphasis>.</para>
+      </listitem>
+    </orderedlist>
+
+    <para>The unlock protocol is very simple: clients wishing to release a
+    lock simply delete the node they created in step 1.</para>
+
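+    <para>A compact Java sketch of steps 1 through 5 follows, again assuming
+    the handle-and-monitor arrangement (fields zk and mutex) from the barrier
+    sketch; releasing is just a delete of the node created in step 1.</para>
+
+<programlisting>
+// Fields zk and mutex as in the barrier sketch above; also
+// imports java.util.Collections and java.util.List.
+String lock(String locknode)
+        throws KeeperException, InterruptedException {
+    String me = zk.create(locknode + "/lock-", new byte[0],      // step 1
+            ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL);
+    String myName = me.substring(locknode.length() + 1);
+    while (true) {
+        List children = zk.getChildren(locknode, false);         // step 2
+        Collections.sort(children);          // zero-padded sequence order
+        int i = children.indexOf(myName);
+        if (i == 0) {
+            return me;                       // step 3: we hold the lock
+        }
+        String prev = locknode + "/" + children.get(i - 1);      // step 4
+        synchronized (mutex) {
+            if (zk.exists(prev, true) != null) {                 // step 5
+                mutex.wait();    // wait for the predecessor to go away
+            }
+        }
+    }
+}
+
+// To unlock: zk.delete(me, -1);
+</programlisting>
+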
+    <para>Here are a few things to notice:</para>
+
+    <itemizedlist>
+      <listitem>
+        <para>The removal of a node will only cause one client to wake up
+        since each node is watched by exactly one client. In this way, you
+        avoid the herd effect.</para>
+      </listitem>
+
+      <listitem>
+        <para>There is no polling or timeouts.</para>
+      </listitem>
+
+      <listitem>
+        <para>Because of the way you implement locking, it is easy to see the
+        amount of lock contention, break locks, debug locking problems,
+        etc.</para>
+      </listitem>
+    </itemizedlist>
+
+    <section>
+      <title>Shared Locks</title>
+
+      <para>You can implement shared locks with a few changes to the lock
+      protocol:</para>
+
+      <informaltable colsep="0" frame="none" rowsep="0">
+        <tgroup cols="2">
+          <tbody>
+            <row>
+              <entry align="center"><emphasis role="bold">Obtaining a read
+              lock:</emphasis></entry>
+
+              <entry align="center"><emphasis role="bold">Obtaining a write
+              lock:</emphasis></entry>
+            </row>
+
+            <row>
+              <entry align="left"><orderedlist>
+                  <listitem>
+                    <para>Call <emphasis role="bold">create( )</emphasis> to
+                    create a node with pathname
+                    "<parameter>_locknode_/read-</parameter>". This is the
+                    lock node used later in the protocol. Make sure to set both
+                    the <emphasis>sequence</emphasis> and
+                    <emphasis>ephemeral</emphasis> flags.</para>
+                  </listitem>
+
+                  <listitem>
+                    <para>Call <emphasis role="bold">getChildren( )</emphasis>
+                    on the lock node <emphasis>without</emphasis> setting the
+                    <emphasis>watch</emphasis> flag - this is important, as it
+                    avoids the herd effect.</para>
+                  </listitem>
+
+                  <listitem>
+                    <para>If there are no children with a pathname starting
+                    with "<parameter>write-</parameter>" and having a lower
+                    sequence number than the node created in step <emphasis
+                    role="bold">1</emphasis>, the client has the lock and can
+                    exit the protocol. </para>
+                  </listitem>
+
+                  <listitem>
+                    <para>Otherwise, call <emphasis role="bold">exists(
+                    )</emphasis>, with the <emphasis>watch</emphasis> flag set, on
+                    the node in the lock directory with the pathname starting with
+                    "<parameter>write-</parameter>" that has the next lowest
+                    sequence number.</para>
+                  </listitem>
+
+                  <listitem>
+                    <para>If <emphasis role="bold">exists( )</emphasis>
+                    returns <emphasis>false</emphasis>, go to step <emphasis
+                    role="bold">2</emphasis>.</para>
+                  </listitem>
+
+                  <listitem>
+                    <para>Otherwise, wait for a notification for the pathname
+                    from the previous step before going to step <emphasis
+                    role="bold">2</emphasis></para>
+                  </listitem>
+                </orderedlist></entry>
+
+              <entry><orderedlist>
+                  <listitem>
+                    <para>Call <emphasis role="bold">create( )</emphasis> to
+                    create a node with pathname
+                    "<parameter>_locknode_/write-</parameter>". This is the
+                    lock node spoken of later in the protocol. Make sure to
+                    set both <emphasis>sequence</emphasis> and
+                    <emphasis>ephemeral</emphasis> flags.</para>
+                  </listitem>
+
+                  <listitem>
+                    <para>Call <emphasis role="bold">getChildren( )
+                    </emphasis> on the lock node <emphasis>without</emphasis>
+                    setting the <emphasis>watch</emphasis> flag - this is
+                    important, as it avoids the herd effect.</para>
+                  </listitem>
+
+                  <listitem>
+                    <para>If there are no children with a lower sequence
+                    number than the node created in step <emphasis
+                    role="bold">1</emphasis>, the client has the lock and the
+                    client exits the protocol.</para>
+                  </listitem>
+
+                  <listitem>
+                    <para>Call <emphasis role="bold">exists( ),</emphasis>
+                    with <emphasis>watch</emphasis> flag set, on the node with
+                    the pathname that has the next lowest sequence
+                    number.</para>
+                  </listitem>
+
+                  <listitem>
+                    <para>If <emphasis role="bold">exists( )</emphasis>
+                    returns <emphasis>false</emphasis>, go to step <emphasis
+                    role="bold">2</emphasis>. Otherwise, wait for a
+                    notification for the pathname from the previous step
+                    before going to step <emphasis
+                    role="bold">2</emphasis>.</para>
+                  </listitem>
+                </orderedlist></entry>
+            </row>
+          </tbody>
+        </tgroup>
+      </informaltable>
+
+      <para><note>
+          <para>It might appear that this recipe creates a herd effect: when
+          there is a large group of clients waiting for a read lock, all of
+          them are notified more or less simultaneously when the
+          "<parameter>write-</parameter>" node with the lowest sequence number
+          is deleted. In fact, that's valid behavior: all those waiting reader
+          clients should be released, since they have the lock. The herd
+          effect refers to releasing a "herd" when in fact only a single
+          machine or a small number of machines can proceed. <remark>[tbd: maybe
+          helpful to indicate which step this refers to?]</remark></para>
+        </note></para>
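+
+      <para>As one small illustration, the admission test of step 3 in the
+      read-lock column can be sketched as below;
+      <emphasis>sequenceOf</emphasis> is a hypothetical helper that parses
+      the zero-padded suffix of a child name.</para>
+
+<programlisting>
+// Step 3 (read lock): blocked only by earlier "write-" nodes.
+boolean readAdmissible(List children, long mySeq) {
+    for (Object c : children) {
+        String name = (String) c;
+        if (name.startsWith("write-") &amp;&amp; sequenceOf(name) &lt; mySeq) {
+            return false;       // an earlier writer holds or waits
+        }
+    }
+    return true;
+}
+
+// Hypothetical helper: parse the suffix after the last dash.
+long sequenceOf(String name) {
+    return Long.parseLong(name.substring(name.lastIndexOf('-') + 1));
+}
+</programlisting>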
+    </section>
+
+    <section id="sc_recoverableSharedLocks">
+      <title>Recoverable Shared Locks</title>
+
+      <para>With minor modifications to the shared lock protocol, you can
+      make shared locks revocable:</para>
+
+      <para>In step <emphasis role="bold">1</emphasis>, of both obtain reader
+      and writer lock protocols, call <emphasis role="bold">getData(
+      )</emphasis> with <emphasis>watch</emphasis> set, immediately after the
+      call to <emphasis role="bold">create( )</emphasis>. If the client
+      subsequently receives notification for the node it created in step
+      <emphasis role="bold">1</emphasis>, it does another <emphasis
+      role="bold">getData( )</emphasis> on that node, with
+      <emphasis>watch</emphasis> set and looks for the string "unlock", which
+      signals to the client that it must release the lock. This is because,
+      according to this shared lock protocol, you can request that the client
+      holding the lock give it up by calling <emphasis role="bold">setData()
+      </emphasis> on the lock node, writing "unlock" to that node.</para>
+
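+      <para>Sketched in Java, the watch-and-check step might look like this;
+      <emphasis>releaseLock</emphasis> is a hypothetical routine that deletes
+      the lock node after any cleanup, and the field zk is as in the earlier
+      sketches.</para>
+
+<programlisting>
+// Called once after create(), and again on each notification
+// for our own lock node.
+void checkRevocation(String myLockNode)
+        throws KeeperException, InterruptedException {
+    byte[] data = zk.getData(myLockNode, true, null);  // re-arm the watch
+    if ("unlock".equals(new String(data))) {
+        releaseLock(myLockNode);  // hypothetical: clean up, then delete
+    }
+}
+
+// Revoker side: ask the holder to give up the lock.
+//   zk.setData(holderLockNode, "unlock".getBytes(), -1);
+</programlisting>
+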
+      <para>Note that this protocol requires the lock holder to consent to
+      releasing the lock. Such consent is important, especially if the lock
+      holder needs to do some processing before releasing the lock. Of course,
+      you can always implement <emphasis>Revocable Shared Locks with Freaking
+      Laser Beams</emphasis> by stipulating in your protocol that the revoker
+      is allowed to delete the lock node if after some length of time the lock
+      isn't deleted by the lock holder.</para>
+    </section>
+  </section>
+
+  <section id="sc_recipes_twoPhasedCommit">
+    <title>Two-phased Commit</title>
+
+    <para>A two-phase commit protocol is an algorithm that lets all clients in
+    a distributed system agree either to commit a transaction or abort.</para>
+
+    <para>In ZooKeeper, you can implement a two-phased commit by having a
+    coordinator create a transaction node, say "/app/Tx", and one child node
+    per participating site, say "/app/Tx/s_i". When the coordinator creates the
+    child node, it leaves the content undefined. Once each site involved in
+    the transaction receives the transaction from the coordinator, the site
+    reads each child node and sets a watch. Each site then processes the query
+    and votes "commit" or "abort" by writing to its respective node. Once the
+    write completes, the other sites are notified, and as soon as all sites
+    have all votes, they can decide either "abort" or "commit". Note that a
+    node can decide "abort" earlier if some site votes for "abort".</para>
+
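+    <para>A site's side of the protocol might be sketched as follows; the
+    path layout ("/app/Tx" with one child per site) follows the text above,
+    while the vote encoding and the field zk are assumptions carried over
+    from the earlier sketches.</para>
+
+<programlisting>
+// Watch every sibling's vote node, then cast our own vote.
+void vote(String txNode, String mySite, boolean commit)
+        throws KeeperException, InterruptedException {
+    for (Object s : zk.getChildren(txNode, false)) {
+        zk.getData(txNode + "/" + s, true, null);   // watch each vote node
+    }
+    byte[] ballot = (commit ? "commit" : "abort").getBytes();
+    zk.setData(txNode + "/" + mySite, ballot, -1);  // cast our vote
+}
+</programlisting>
+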
+    <para>An interesting aspect of this implementation is that the only role
+    of the coordinator is to decide upon the group of sites, to create the
+    ZooKeeper nodes, and to propagate the transaction to the corresponding
+    sites. In fact, even propagating the transaction can be done through
+    ZooKeeper by writing it in the transaction node.</para>
+
+    <para>There are two important drawbacks of the approach described above.
+    One is the message complexity, which is O(n²). The second is the
+    impossibility of detecting failures of sites through ephemeral nodes. To
+    detect the failure of a site using ephemeral nodes, it is necessary that
+    the site create the node.</para>
+
+    <para>To solve the first problem, you can have only the coordinator
+    notified of changes to the transaction nodes, and then notify the sites
+    once the coordinator reaches a decision. Note that this approach is scalable,
+    but it is slower too, as it requires all communication to go through the
+    coordinator.</para>
+
+    <para>To address the second problem, you can have the coordinator
+    propagate the transaction to the sites, and have each site create its
+    own ephemeral node.</para>
+  </section>
+
+  <section id="sc_leaderElection">
+    <title>Leader Election</title>
+
+    <para>A simple way of doing leader election with ZooKeeper is to use the
+    <emphasis role="bold">SEQUENCE|EPHEMERAL</emphasis> flags when creating
+    znodes that represent "proposals" of clients. The idea is to have a znode,
+    say "/election", such that each znode creates a child znode "/election/n_"
+    with both flags SEQUENCE|EPHEMERAL. With the sequence flag, ZooKeeper
+    automatically appends a sequence number that is greater that any one
+    previously appended to a child of "/election". The process that created
+    the znode with the smallest appended sequence number is the leader.
+    </para>
+
+    <para>That's not all, though. It is important to watch for failures of the
+    leader, so that a new client arises as the new leader in case the
+    current leader fails. A trivial solution is to have all application
+    processes watch the current smallest znode, and check whether they
+    are the new leader when the smallest znode goes away (note that the
+    smallest znode will go away if the leader fails, because the node is
+    ephemeral). But this causes a herd effect: upon failure of the current
+    leader, all other processes receive a notification, and execute
+    getChildren on "/election" to obtain the current list of children of
+    "/election". If the number of clients is large, it causes a spike in the
+    number of operations that ZooKeeper servers have to process. To avoid the
+    herd effect, it is sufficient to watch for the next znode down in the
+    sequence of znodes. If a client receives a notification that the znode it
+    is watching is gone, then it becomes the new leader if there is
+    no smaller znode. Note that this avoids the herd effect by not having
+    all clients watching the same znode. </para>
+
+    <para>Here's the pseudo code:</para>
+
+    <para>Let ELECTION be a path of choice of the application. To volunteer to
+    be a leader: </para>
+
+    <orderedlist>
+      <listitem>
+        <para>Create znode z with path "ELECTION/n_" with both SEQUENCE and
+        EPHEMERAL flags;</para>
+      </listitem>
+
+      <listitem>
+        <para>Let C be the children of "ELECTION", and i be the sequence
+        number of z;</para>
+      </listitem>
+
+      <listitem>
+        <para>Watch for changes on "ELECTION/n_j", where j is the smallest
+        sequence number such that j &lt; i and n_j is a znode in C;</para>
+      </listitem>
+    </orderedlist>
+
+    <para>Upon receiving a notification of znode deletion: </para>
+
+    <orderedlist>
+      <listitem>
+        <para>Let C be the new set of children of ELECTION; </para>
+      </listitem>
+
+      <listitem>
+        <para>If z is the smallest node in C, then execute leader
+        procedure;</para>
+      </listitem>
+
+      <listitem>
+        <para>Otherwise, watch for changes on "ELECTION/n_j", where j is the
+        smallest sequence number such that j &lt; i and n_j is a znode in C;
+        </para>
+      </listitem>
+    </orderedlist>
+
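+    <para>The volunteering side of this pseudo code, with the
+    deletion-notification loop folded in, might be sketched in Java as below,
+    under the same handle-and-monitor assumptions (fields zk and mutex) as
+    the earlier recipes; <emphasis>leaderProcedure</emphasis> stands for the
+    application's hypothetical leader routine.</para>
+
+<programlisting>
+// Fields zk and mutex as in the barrier sketch; also imports
+// java.util.Collections and java.util.List.
+void volunteer(String election)
+        throws KeeperException, InterruptedException {
+    String z = zk.create(election + "/n_", new byte[0],
+            ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL);
+    String myName = z.substring(election.length() + 1);
+    while (true) {
+        List c = zk.getChildren(election, false);
+        Collections.sort(c);                 // zero-padded suffixes
+        int i = c.indexOf(myName);
+        if (i == 0) {
+            leaderProcedure();               // smallest znode: we lead
+            return;
+        }
+        synchronized (mutex) {
+            // Watch only the next znode down; no herd effect.
+            if (zk.exists(election + "/" + c.get(i - 1), true) != null) {
+                mutex.wait();
+            }
+        }
+    }
+}
+</programlisting>
+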
+    <para>Note that the znode having no preceding znode on the list of
+    children does not imply that the creator of this znode is aware that it is
+    the current leader. Applications may consider creating a separate znode
+    to acknowledge that the leader has executed the leader procedure.</para>
+  </section>
+  </chapter>
+</book>

Propchange: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/recipes.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/site.xml
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/site.xml?rev=698787&r1=698786&r2=698787&view=diff
==============================================================================
--- hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/site.xml (original)
+++ hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/site.xml Wed Sep 24 18:05:14 2008
@@ -32,12 +32,17 @@
 <site label="Hadoop" href="" xmlns="http://apache.org/forrest/linkmap/1.0">
 
   <docs label="Documentation"> 
-    <overview  label="Overview"           href="index.html" />
-    <started   label="Getting Started"    href="ext:api/started" />
-    <api       label="API Docs"           href="ext:api/index" />
-    <wiki      label="Wiki"               href="ext:wiki" />
-    <faq       label="FAQ"                href="ext:faq" />
-    <lists     label="Mailing Lists"      href="ext:lists" />
+    <welcome   label="Welcome"                href="index.html" />
+    <overview  label="Zookeeper Overview"     href="zookeeperOver.html" />
+    <started   label="Getting Started"        href="zookeeperStarted.html" />
+    <program   label="Programmer's Guide"     href="zookeeperProgrammers.html" />
+    <recipes   label="Recipes"		      href="recipes.html" />
+    <admin     label="Administrator's Guide"  href="zookeeperAdmin.html" />
+    <api       label="API Docs"               href="ext:api/index" />
+    <wiki      label="Wiki"                   href="ext:wiki" />
+    <faq       label="FAQ"                    href="ext:faq" />
+    <lists     label="Mailing Lists"          href="ext:lists" />
+    <other     label="Other Info"	      href="zookeeperOtherInfo.html" />
   </docs>
 
   <external-refs>

Added: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml?rev=698787&view=auto
==============================================================================
--- hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml (added)
+++ hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml Wed Sep 24 18:05:14 2008
@@ -0,0 +1,827 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Copyright 2002-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
+"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<book id="bk_Admin">
+  <title>ZooKeeper Administrator's Guide</title>
+
+  <subtitle>A Guide to Deployment and Administration</subtitle>
+
+  <bookinfo>
+    <legalnotice>
+      <para>Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License. You may
+      obtain a copy of the License at <ulink
+      url="http://www.apache.org/licenses/LICENSE-2.0">http://www.apache.org/licenses/LICENSE-2.0</ulink>.</para>
+
+      <para>Unless required by applicable law or agreed to in writing,
+      software distributed under the License is distributed on an "AS IS"
+      BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied. See the License for the specific language governing permissions
+      and limitations under the License.</para>
+    </legalnotice>
+
+    <abstract>
+      <para>This document contains information about deploying, administering
+      and maintaining ZooKeeper. It also discusses best practices and common
+      problems.</para>
+
+      <para>$Revision: 1.7 $ $Date: 2008/09/19 05:29:31 $</para>
+    </abstract>
+  </bookinfo>
+
+  <chapter id="ch_deployment">
+    <title>Deployment</title>
+
+    <para>This chapter contains information about deploying ZooKeeper and
+    covers these topics:</para>
+
+    <itemizedlist>
+      <listitem>
+        <para><xref linkend="sc_systemReq"/></para>
+      </listitem>
+
+      <listitem>
+        <para><xref linkend="sc_zkMulitServerSetup"/></para>
+      </listitem>
+
+      <listitem>
+        <para><xref linkend="sc_singleAndDevSetup"/></para>
+      </listitem>
+    </itemizedlist>
+
+    <para>The first two sections assume you are interested in installing
+    ZooKeeper in a production environment such as a datacenter. The final
+    section covers situations in which you are setting up ZooKeeper on a
+    limited basis - for evaluation, testing, or development - but not in a
+    production environment.</para>
+
+    <section id="sc_systemReq">
+      <title>System Requirements</title>
+
+      <para>ZooKeeper runs in Java, release 1.6 or greater, as a group of hosts
+      called a quorum. Three ZooKeeper hosts per quorum is the minimum
+      recommended quorum size. At Yahoo!, ZooKeeper is usually deployed on
+      dedicated RHEL boxes, with dual-core processors, 2GB of RAM, and 80GB
+      IDE hard drives.</para>
+    </section>
+
+    <section id="sc_zkMulitServerSetup">
+      <title>Clustered (Multi-Server) Setup</title>
+
+      <para>For reliable ZooKeeper service, you should deploy ZooKeeper in a
+      cluster known as a <firstterm>quorum</firstterm>. As long as a majority
+      of the quorum are up, the service will be available. Because ZooKeeper
+      requires a majority <remark>[tbd: why?]</remark>, it is best to use an
+      odd number of machines. For example, with four machines ZooKeeper can
+      only handle the failure of a single machine; if two machines fail, the
+      remaining two machines do not constitute a majority. However, with five
+      machines ZooKeeper can handle the failure of two machines. </para>
+
+      <para>Here are the steps to setting up a server that will be part of a
+      quorum. These steps should be performed on every host in the
+      quorum:</para>
+
+      <orderedlist>
+        <listitem>
+          <para>Install the Java JDK:</para>
+
+          <screen>$yinst -i jdk-1.6.0.00_3 -br test  <remark>[y! prop - replace with open equiv]</remark></screen>
+        </listitem>
+
+        <listitem>
+          <para>Set the Java heap size. This is very important to avoid
+          swapping, which will seriously degrade ZooKeeper performance. To
+          determine the correct value, run load tests, and make sure you are well
+          below the usage limit that would cause you to swap. Be conservative
+          - use a maximum heap size of 3GB for a 4GB machine. <remark>[tbd:
+          where would they do this? Environment variable,
+          etc?]</remark></para>
+        </listitem>
+
+        <listitem>
+          <para>Install the ZooKeeper Server Package:</para>
+
+          <screen>$ yinst install -nostart zookeeper_server <remark>[Y! prop - replace with open eq]</remark></screen>
+        </listitem>
+
+        <listitem>
+          <para>Create a configuration file. This file can be called anything.
+          Use the following settings as a starting point:</para>
+
+          <screen>
+tickTime=2000
+dataDir=/var/zookeeper/
+clientPort=2181
+initLimit=5
+syncLimit=2
+server.1=zoo1:2888
+server.2=zoo2:2888
+server.3=zoo3:2888</screen>
+
+          <para>You can find the meanings of these and other configuration
+          settings in the section <xref linkend="sc_configuration" />. A word
+          though about a few here:</para>
+
+          <para>Every machine that is part of the ZooKeeper quorum should know
+          about every other machine in the quorum. You accomplish this with
+          the series of lines of the form <emphasis
+          role="bold">server.id=host:port</emphasis>. The integers <emphasis
+          role="bold">host</emphasis> and <emphasis
+          role="bold">port</emphasis> are straightforward. You attribute the
+          server id to each machine by creating a file named
+          <filename>myid</filename>, one for each server, which resides in
+          that server's data directory, as specified by the configuration file
+          parameter <emphasis role="bold">dataDir</emphasis>. The myid file
+          consists of a single line containing only the text of that machine's
+          id. So <filename>myid</filename> of server 1 would contain the text
+          "1" and nothing else. The id must be unique within the
+          quorum.</para>
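+
+          <para>For example, on the machine listed as <emphasis
+          role="bold">server.1</emphasis>, with the <emphasis
+          role="bold">dataDir</emphasis> above, the file
+          <filename>/var/zookeeper/myid</filename> would contain just:</para>
+
+          <screen>1</screen>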
+        </listitem>
+
+        <listitem>
+          <para>Once your configuration file is set up, you can start
+          ZooKeeper:</para>
+
+          <screen>$ java -cp zookeeper-dev.jar:java/lib/log4j-1.2.15.jar:conf \
+        org.apache.zookeeper.server.quorum.QuorumPeerMain zoo.cfg</screen>
+        </listitem>
+
+        <listitem>
+          <para>Test your deployment by connecting to the hosts:</para>
+
+          <itemizedlist>
+            <listitem>
+              <para>In Java, you can run the following command to execute
+              simple operations:<remark> [tbd: also, maybe give some of those
+              simple operations?]</remark></para>
+
+              <screen>$ java -cp zookeeper.jar:java/lib/log4j-1.2.15.jar:conf \
+      org.apache.zookeeper.ZooKeeperMain 127.0.0.1:2181</screen>
+            </listitem>
+
+            <listitem>
+              <para>In C, you can compile either the single-threaded client or
+              the multithreaded client from the c subdirectory in the
+              ZooKeeper sources. This compiles the single-threaded
+              client:</para>
+
+              <screen>$ make cli_st</screen>
+
+              <para>And this compiles the multithreaded client:</para>
+
+              <screen>$ make cli_mt</screen>
+            </listitem>
+          </itemizedlist>
+
+          <para>Running either program gives you a shell in which to execute
+          simple file-system-like operations. <remark>[tbd: again, sample
+          operations?]</remark> To connect to ZooKeeper with the multithreaded
+          client, for example, you would run:</para>
+
+          <screen>$ cli_mt 127.0.0.1:2181</screen>
+        </listitem>
+      </orderedlist>
+    </section>
+
+    <section id="sc_singleAndDevSetup">
+      <title>Single Server and Developer Setup</title>
+
+      <para>If you want to set up ZooKeeper for development purposes, you will
+      probably want to set up a single server instance of ZooKeeper, and then
+      install either the Java or C client-side libraries and bindings on your
+      development machine.</para>
+
+      <para>The steps to setting up a single server instance are similar
+      to the above, except the configuration file is simpler. You can find the
+      complete instructions in the <ulink
+      url="zookeeperStarted.html#sc_InstallingSingleMode">Installing
+      and Running ZooKeeper in Single Server Mode</ulink> section of the
+      <ulink url="zookeeperStarted.html">ZooKeeper
+      Getting Started Guide</ulink>.</para>
+
+      <para>For information on installing the client side libraries, refer to
+      the <ulink
+      url="zookeeperProgrammers.html#Bindings">Bindings</ulink>
+      section of the <ulink
+      url="zookeeperProgrammers.html">Zookeeper
+      Programmer's Guide</ulink>.</para>
+    </section>
+  </chapter>
+
+  <chapter id="ch_administration">
+    <title>Administration</title>
+
+    <para>This chapter contains information about running and maintaining
+    ZooKeeper and covers these topics: <itemizedlist>
+        <listitem>
+          <para><xref linkend="sc_configuration"/></para>
+        </listitem>
+
+        <listitem>
+          <para><xref linkend="sc_zkCommands"/></para>
+        </listitem>
+
+        <listitem>
+          <para><xref linkend="sc_dataFileManagement"/></para>
+        </listitem>
+
+        <listitem>
+          <para><xref linkend="sc_commonProblems"/></para>
+        </listitem>
+
+        <listitem>
+          <para><xref linkend="sc_bestPractices"/></para>
+        </listitem>
+      </itemizedlist></para>
+
+
+      <section id="sc_configuration">
+        <title>Configuration Parameters</title>
+
+        <para>ZooKeeper's behavior is governed by the ZooKeeper configuration
+        file. This file is designed so that the exact same file can be used by
+        all the servers that make up a ZooKeeper server assuming the disk
+        layouts are the same. If servers use different configuration files,
+        care must be taken to ensure that the list of servers in all of the
+        different configuration files match.<remark> [tbd: reformat in
+        standard form, with legal values, etc]</remark></para>
+
+        <section id="sc_minimumConfiguration">
+          <title>Minimum Configuration</title>
+
+          <para>Here are the minimum configuration keywords that must be
+          defined in the configuration file:</para>
+
+          <variablelist>
+
+	    <varlistentry>
+              <term>clientPort</term>
+
+              <listitem>
+                <para>the port to listen for client connections; that is, the
+                port that clients attempt to connect to.</para>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>dataDir</term>
+
+              <listitem>
+                <para>the location where ZooKeeper will store the in-memory
+                database snapshots and, unless specified otherwise, the
+                transaction log of updates to the database.</para>
+
+                <note>
+                  <para>Be careful where you put the transaction log. A
+                  dedicated transaction log device is key to consistent good
+                  performance. Putting the log on a busy device will adversely
+                  affect performance.</para>
+                </note>
+              </listitem>
+            </varlistentry>
+	    
+	    <varlistentry id="id_tickTime">
+              <term>tickTime</term>
+
+              <listitem>
+                <para>the length of a single tick, which is the basic time
+                unit used by ZooKeeper, as measured in milliseconds. It is
+                used to regulate heartbeats and timeouts. For example, the
+                minimum session timeout will be two ticks.</para>
+              </listitem>
+            </varlistentry>
+	    
+          </variablelist>
+        </section>
+
+        <section id="sc_advancedConfiguration">
+          <title>Advanced Configuration</title>
+
+          <para>The configuration settings in this section are optional. You
+          can use them to further fine-tune the behavior of your ZooKeeper
+          servers. Some can also be set using Java system properties,
+          generally of the form <emphasis>zookeeper.keyword</emphasis>. The
+          exact system property, when available, is noted below.</para>
+
+          <variablelist>
+	  
+            <varlistentry>
+              <term>dataLogDir</term>
+
+              <listitem>
+                <para>(No Java system property)</para>
+
+                <para>This option will direct the machine to write the
+                transaction log to the <emphasis
+                role="bold">dataLogDir</emphasis> rather than the <emphasis
+                role="bold">dataDir</emphasis>. This allows a dedicated log
+                device to be used, and helps avoid competition between logging
+                and snapshots.</para>
+
+                <note>
+                  <para>Having a dedicated log device has a large impact on
+                  throughput and stable latencies. It is highly recommended to
+                  dedicate a log device and set <emphasis
+                  role="bold">dataLogDir</emphasis> to point to a directory on
+                  that device, and then make sure to point <emphasis
+                  role="bold">dataDir</emphasis> to a directory
+                  <emphasis>not</emphasis> residing on that device.</para>
+                </note>
+              </listitem>
+            </varlistentry>
+	    
+	     <varlistentry>
+              <term>globalOutstandingLimit</term>
+
+              <listitem>
+                <para>(Java system property: <emphasis
+                role="bold">zookeeper.globalOutstandingLimit.</emphasis>)</para>
+
+                <para>Clients can submit requests faster than ZooKeeper can
+                process them, especially if there are a lot of clients. To
+                prevent ZooKeeper from running out of memory due to queued
+                requests, ZooKeeper will throttle clients so that there are no
+                more than globalOutstandingLimit outstanding requests in the
+                system. The default limit is 1,000.</para>
+              </listitem>
+            </varlistentry>
+	    
+            <varlistentry>
+              <term>preAllocSize</term>
+
+              <listitem>
+                <para>(Java system property: <emphasis
+                role="bold">zookeeper.preAllocSize</emphasis>)</para>
+
+                <para>To avoid seeks, ZooKeeper allocates space in the
+                transaction log file in blocks of preAllocSize kilobytes. The
+                default block size is 64M. One reason for changing the size of
+                the blocks is to reduce the block size if snapshots are taken
+                more often. (Also, see <emphasis
+                role="bold">snapCount</emphasis>).</para>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>snapCount</term>
+
+              <listitem>
+                <para>(Java system property: <emphasis
+                role="bold">zookeeper.snapCount</emphasis>)</para>
+
+                <para>ZooKeeper logs transactions to a transaction log. After
+                snapCount transactions are written to a log file, a snapshot is
+                started and a new transaction log file is created. The default
+                snapCount is 10,000.</para>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>traceFile</term>
+
+              <listitem>
+                <para>(Java system property: <emphasis
+                role="bold">requestTraceFile</emphasis>)</para>
+
+                <para>If this option is defined, requests will be logged
+                to a trace file named traceFile.year.month.day. Use of this
+                option provides useful debugging information, but will impact
+                performance. (Note: The system property has no zookeeper
+                prefix, and the configuration variable name is different from
+                the system property. Yes - it's not consistent, and it's
+                annoying.<remark> [tbd: is there any explanation for
+                this?]</remark>)</para>
+              </listitem>
+            </varlistentry>
+
+          </variablelist>
+        </section>
+
+        <section id="sc_clusterOptions">
+          <title>Cluster Options</title>
+
+          <para>The options in this section are designed for use in quorums --
+          that is, when deploying clusters of servers.</para>
+
+          <variablelist>
+            <varlistentry>
+              <term>electionAlg</term>
+
+              <listitem>
+                <para>(No Java system property)</para>
+
+                <para>Election implementation to use. A value of "0"
+                corresponds to the original UDP-based version, "1" corresponds
+                to the non-authenticated UDP-based version of fast leader
+                election, "2" corresponds to the authenticated UDP-based
+                version of fast leader election, and "3" corresponds to the
+                TCP-based version of fast leader election.</para>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>electionPort</term>
+
+              <listitem>
+                <para>(No Java system property)</para>
+
+                <para>Port used for leader election. It is only used when the
+                election algorithm is not "0". When the election algorithm is
+                "0" a UDP port with the same port number as the port listed in
+                the <emphasis role="bold">server.num</emphasis> option will be
+                used. <remark>[tbd: should that be <emphasis
+                role="bold">server.id</emphasis>? Also, why isn't server.id
+                documented anywhere?]</remark></para>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>initLimit</term>
+
+              <listitem>
+                <para>(No Java system property)</para>
+
+                <para>Amount of time, in ticks (see <ulink
+                url="#id_tickTime">tickTime</ulink>), to allow followers to
+                connect and sync to a leader. Increase this value as needed
+                if the amount of data managed by ZooKeeper is large.</para>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>leaderServes</term>
+
+              <listitem>
+                <para>(Java system property: zookeeper.<emphasis
+                role="bold">leaderServes</emphasis>)</para>
+
+                <para>Leader accepts client connections. The default value is
+                "yes", which means that a leader will accept client
+                connections. The leader machine coordinates updates. For
+                higher update throughput, at the slight expense of read
+                throughput, the leader can be configured to not accept clients
+                and to focus on coordination.
+                <remark>[tbd: how do you specify which server is the
+                leader?]</remark></para>
+
+                <note>
+                  <para>Turning on leader selection is highly recommended when
+                  you have more than three ZooKeeper servers in a
+                  quorum.</para>
+                </note>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>server.x=[hostname]:nnnn, etc</term>
+
+              <listitem>
+                <para>(No Java system property)</para>
+
+                <para>servers making up the ZooKeeper quorum. When the server
+                starts up, it determines which server it is by looking for the
+                file <filename>myid</filename> in the data directory.<remark>
+                [tdb: should we mention somewhere about creating this file,
+                myid, in the setup procedure?]</remark> That file contains the
+                server number, in ASCII, and it should match <emphasis
+                role="bold">x</emphasis> in <emphasis
+                role="bold">server.x</emphasis> in the left hand side of this
+                setting.</para>
+
+                <para>The list of ZooKeeper servers used by the clients must
+                match the list of ZooKeeper servers that each ZooKeeper
+                server has.</para>
+
+                <para>The port numbers <emphasis role="bold">nnnn</emphasis>
+                in this setting are the <emphasis>electionPort</emphasis>
+                numbers of the servers (as opposed to clientPorts).
+                <remark>[tbd: is the next sentence explanation an of what the
+                election port or is it a description of a special case?]
+                </remark>If you want to test multiple servers on a single
+                machine, the individual choices of electionPort for each
+                server can be defined in each server's config files using the
+                line electionPort=xxxx to avoid clashes.</para>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>syncLimit</term>
+
+              <listitem>
+                <para>(No Java system property)</para>
+
+                <para>Amount of time, in ticks (see <ulink
+                url="#id_tickTime">tickTime</ulink>), to allow followers to
+                sync with ZooKeeper. If followers fall too far behind a
+                leader, they will be dropped.</para>
+              </listitem>
+            </varlistentry>
+          </variablelist>
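+
+          <para>As an illustration of these options, here is a minimal
+          sample configuration for a three server ensemble. The host names
+          and the data directory path are hypothetical:</para>
+
+          <screen>tickTime=2000
+dataDir=/var/zookeeper
+clientPort=2181
+initLimit=5
+syncLimit=2
+server.1=zoo1:2888
+server.2=zoo2:2888
+server.3=zoo3:2888</screen>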
+
+        </section>
+
+        <section>
+          <title>Unsafe Options</title>
+
+          <para>The following options can be useful, but be careful when you
+          use them. The risk of each is explained along with the explanation
+          of what the variable does.</para>
+
+          <variablelist>
+	  
+	  <varlistentry>
+              <term>forceSync</term>
+
+              <listitem>
+                <para>(Java system property: <emphasis
+                role="bold">zookeeper.forceSync</emphasis>)</para>
+
+                <para>Requires updates to be synced to the media of the
+                transaction log before finishing processing the update. If
+                this option is set to no, ZooKeeper will not require updates
+                to be synced to the media. This can reduce update latency,
+                but at the risk of losing acknowledged updates if a server
+                crashes or loses power, which violates ZooKeeper's durability
+                guarantee.</para>
+              </listitem>
+            </varlistentry>
+
+            <varlistentry>
+              <term>jute.maxbuffer:</term>
+
+              <listitem>
+                <para>(Java system property:<emphasis role="bold">
+                jute.maxbuffer</emphasis>)</para>
+
+                <para>This option can only be set as a Java system property.
+                There is no zookeeper prefix on it. It specifies the maximum
+                size of the data that can be stored in a znode. The default is
+                0xfffff, or just under 1M. If this option is changed, the
+                system property must be set on all servers and clients;
+                otherwise problems will arise. This is really a sanity check.
+                ZooKeeper is designed to store data on the order of kilobytes
+                in size.</para>
+              </listitem>
+            </varlistentry>
+	    
+            <varlistentry>
+              <term>skipACL</term>
+
+              <listitem>
+                <para>(Java system property: <emphasis
+                role="bold">zookeeper.skipACL</emphasis>)</para>
+
+                <para>Skips ACL checks on all operations processed by the
+                server. This results in a boost in throughput, but opens up
+                full access to the data tree to everyone. An example of
+                setting these options follows this list.</para>
+              </listitem>
+            </varlistentry>
+
+            
+          </variablelist>
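+
+          <para>These options are passed to the server as Java system
+          properties on its command line. The following sketch is
+          illustrative only; the classpath and configuration file name are
+          assumptions, so use whatever your installation's start script
+          uses:</para>
+
+          <screen>$ java -Djute.maxbuffer=524288 -Dzookeeper.forceSync=yes \
+     -cp zookeeper.jar:conf org.apache.zookeeper.server.quorum.QuorumPeerMain zoo.cfg</screen>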
+        </section>
+      </section>
+
+      <section id="sc_zkCommands">
+        <title>ZooKeeper Commands: The Four Letter Words</title>
+
+        <para>ZooKeeper responds to a small set of commands, each composed
+        of four letters. You issue the commands to ZooKeeper via telnet or
+        nc, at the client port.</para>
+
+        <variablelist>
+	
+	    <varlistentry>
+            <term>dump</term>
+
+            <listitem>
+              <para>Lists the outstanding sessions and ephemeral nodes. This
+              only works on the leader.</para>
+            </listitem>
+          </varlistentry>
+	  
+	    <varlistentry>
+            <term>kill</term>
+
+            <listitem>
+              <para>Shuts down the server. This must be issued from the
+              machine the ZooKeeper server is running on.</para>
+            </listitem>
+          </varlistentry>
+	  
+          <varlistentry>
+            <term>ruok</term>
+
+            <listitem>
+              <para>Tests if server is running in a non-error state. The
+              server will respond with imok if it is running. Otherwise it
+              will not respond at all.</para>
+            </listitem>
+          </varlistentry>
+
+          <varlistentry>
+            <term>stat</term>
+
+            <listitem>
+              <para>Lists statistics about performance and connected
+              clients.</para>
+            </listitem>
+          </varlistentry>
+        </variablelist>
+
+        <para>Here's an example of the <emphasis role="bold">ruok</emphasis>
+        command:</para>
+
+        <screen>$ echo ruok | nc 127.0.0.1 5111
+
+imok
+</screen>
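+
+        <para>The other commands work the same way. For example, to view
+        server statistics (the exact output varies with release and server
+        state):</para>
+
+        <screen>$ echo stat | nc 127.0.0.1 5111</screen>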
+      </section>
+
+      <section id="sc_monitoring">
+        <title>Monitoring</title>
+
+        <para>The four letter words described above provide a simple hook
+        for external monitoring of a ZooKeeper deployment: a healthy server
+        answers <emphasis role="bold">ruok</emphasis> with imok, and
+        <emphasis role="bold">stat</emphasis> reports latency and connection
+        statistics.</para>
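+
+        <para>As a sketch, a periodic health check might look like the
+        following; the host, port, and alert address are placeholders:</para>
+
+        <screen>#!/bin/sh
+# Alert if the server does not answer ruok with imok.
+ans=`echo ruok | nc 127.0.0.1 2181`
+if [ "$ans" != "imok" ]; then
+    echo "ZooKeeper server is not responding" | mail -s "zk alert" admin@example.com
+fi</screen>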
+      </section>
+
+    <section id="sc_dataFileManagement">
+      <title>Data File Management</title>
+
+      <para>ZooKeeper stores its data in a data directory and its transaction
+      log in a transaction log directory. By default these two directories are
+      the same. The server can (and should) be configured to store the
+      transaction log files in a separate directory from the data files.
+      Throughput increases and latency decreases when transaction logs reside
+      on a dedicated log device.</para>
+
+      <section>
+        <title>The Data Directory</title>
+
+        <para>This directory has two files in it:</para>
+
+        <itemizedlist>
+          <listitem>
+            <para><filename>myid</filename> - contains a single integer in
+            human readable ASCII text that represents the server id.</para>
+          </listitem>
+
+          <listitem>
+            <para><filename>snapshot.&lt;zxid&gt;</filename> - holds the fuzzy
+            snapshot of a data tree.</para>
+          </listitem>
+        </itemizedlist>
+
+        <para>Each ZooKeeper server has a unique id. This id is used in two
+        places: the <filename>myid</filename> file and the configuration file.
+        The <filename>myid</filename> file identifies the server that
+        corresponds to the given data directory. The configuration file lists
+        the contact information for each server identified by its server id.
+        When a ZooKeeper server instance starts, it reads its id from the
+        <filename>myid</filename> file and then, using that id, reads from the
+        configuration file, looking up the port on which it should
+        listen.</para>
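+
+        <para>For example, if a server's entry in the configuration file is
+        <emphasis role="bold">server.1</emphasis>, you would create its
+        <filename>myid</filename> file like this (the data directory path is
+        illustrative):</para>
+
+        <screen>$ echo 1 &gt; /var/zookeeper/myid</screen>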
+
+        <para>The <filename>snapshot</filename> files stored in the data
+        directory are fuzzy snapshots in the sense that during the time the
+        ZooKeeper server is taking the snapshot, updates are occurring to the
+        data tree. The suffix of the <filename>snapshot</filename> file names
+        is the <emphasis>zxid</emphasis>, the ZooKeeper transaction id, of the
+        last committed transaction at the start of the snapshot. Thus, the
+        snapshot includes a subset of the updates to the data tree that
+        occurred while the snapshot was in process. The snapshot, then, may
+        not correspond to any data tree that actually existed, and for this
+        reason we refer to it as a fuzzy snapshot. Still, ZooKeeper can
+        recover using this snapshot because it takes advantage of the
+        idempotent nature of its updates. By replaying the transaction log
+        against fuzzy snapshots ZooKeeper gets the state of the system at the
+        end of the log.</para>
+      </section>
+
+      <section>
+        <title>The Log Directory</title>
+
+        <para>The Log Directory contains the ZooKeeper transaction logs.
+        Before any update takes place, ZooKeeper ensures that the transaction
+        that represents the update is written to non-volatile storage. A new
+        log file is started each time a snapshot is begun. The log file's
+        suffix is the first zxid written to that log.</para>
+      </section>
+
+      <section>
+        <title>File Management</title>
+
+        <para>The format of snapshot and log files does not change between
+        standalone ZooKeeper servers and different configurations of
+        replicated ZooKeeper servers. Therefore, you can pull these files from
+        a running replicated ZooKeeper server to a development machine with a
+        standalone ZooKeeper server for troubleshooting.</para>
+
+        <para>Using older log and snapshot files, you can look at the previous
+        state of ZooKeeper servers and even restore that state. The
+        LogFormatter class allows an administrator to look at the transactions
+        in a log.</para>
+
+        <para>The ZooKeeper server creates snapshot and log files, but never
+        deletes them. The retention policy of the data and log files is
+        implemented outside of the ZooKeeper server. The server itself only
+        needs the latest complete fuzzy snapshot and the log files from the
+        start of that snapshot. The PurgeTxnLog utility implements a simple
+        retention policy that administrators can use.</para>
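+
+        <para>Both tools are run as Java classes from the ZooKeeper jar. The
+        following sketch is illustrative; the jar names, paths, and arguments
+        are assumptions, so check the usage message each class prints for
+        your release:</para>
+
+        <screen>$ java -cp zookeeper.jar:log4j.jar:conf \
+     org.apache.zookeeper.server.LogFormatter /var/zookeeper/log.100000001
+$ java -cp zookeeper.jar:log4j.jar:conf \
+     org.apache.zookeeper.server.PurgeTxnLog /var/zookeeper /var/zookeeper</screen>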
+      </section>
+    </section>
+
+    <section id="sc_commonProblems">
+      <title>Things to Avoid</title>
+
+      <para>Here are some common problems you can avoid by configuring
+      ZooKeeper correctly:</para>
+
+      <variablelist>
+        <varlistentry>
+          <term>inconsistent lists of servers</term>
+
+          <listitem>
+            <para>The list of ZooKeeper servers used by the clients must match
+            the list of ZooKeeper servers that each ZooKeeper server has.
+            Things work okay if the client list is a subset of the real list,
+            but things will really act strange if clients have a list of
+            ZooKeeper servers that are in different ZooKeeper clusters. Also,
+            the server lists in each ZooKeeper server configuration file
+            should be consistent with one another.</para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <term>incorrect placement of transaction log</term>
+
+          <listitem>
+            <para>The most performance critical part of ZooKeeper is the
+            transaction log. ZooKeeper syncs transactions to media before it
+            returns a response. A dedicated transaction log device is key to
+            consistent good performance. Putting the log on a busy device will
+            adversely affect performance. If you only have one storage device,
+            put trace files on NFS and increase the snapshotCount; it doesn't
+            eliminate the problem, but it should mitigate it.</para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <term>incorrect Java heap size</term>
+
+          <listitem>
+            <para>You should take special care to set your Java max heap size
+            correctly. In particular, you should not create a situation in
+            which ZooKeeper swaps to disk. The disk is death to ZooKeeper.
+            Everything is ordered, so if processing one request swaps to
+            disk, all other queued requests will probably do the same. DON'T
+            SWAP.</para>
+
+            <para>Be conservative in your estimates: if you have 4G of RAM, do
+            not set the Java max heap size to 6G or even 4G. For example, it
+            is more likely you would use a 3G heap for a 4G machine, as the
+            operating system and the cache also need memory. The best and only
+            recommended practice for estimating the heap size your system
+            needs is to run load tests, and then make sure you are well below
+            the usage limit that would cause the system to swap. An example
+            follows this list.</para>
+          </listitem>
+        </varlistentry>
+      </variablelist>
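+
+      <para>For instance, on a machine with 4G of RAM you might cap the
+      server's heap at 3G. The start command below is a sketch; the
+      classpath and configuration file name are assumptions:</para>
+
+      <screen>$ java -Xmx3000m -cp zookeeper.jar:conf \
+     org.apache.zookeeper.server.quorum.QuorumPeerMain zoo.cfg</screen>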
+    </section>
+
+    <section id="sc_bestPractices">
+      <title>Best Practices</title>
+
+      <para>For best results, take note of the following list of good
+      ZooKeeper practices: put the transaction log on its own device, size
+      the Java heap so the server never swaps, keep the server lists
+      consistent across all servers and clients, and use a separate
+      retention tool such as PurgeTxnLog to clean up old snapshots and
+      logs.</para>
+    </section>
+  </chapter>
+</book>

Propchange: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperOtherInfo.xml
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperOtherInfo.xml?rev=698787&view=auto
==============================================================================
--- hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperOtherInfo.xml (added)
+++ hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperOtherInfo.xml Wed Sep 24 18:05:14 2008
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Copyright 2002-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN"
+"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<book id="bk_OtherInfo">
+  <title>ZooKeeper</title>
+
+  <bookinfo>
+    <legalnotice>
+      <para>Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License. You may
+      obtain a copy of the License at <ulink
+      url="http://www.apache.org/licenses/LICENSE-2.0">http://www.apache.org/licenses/LICENSE-2.0</ulink>.</para>
+
+      <para>Unless required by applicable law or agreed to in writing,
+      software distributed under the License is distributed on an "AS IS"
+      BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied. See the License for the specific language governing permissions
+      and limitations under the License.</para>
+    </legalnotice>
+
+    <abstract>
+      <para> currently empty </para>
+    </abstract>
+  </bookinfo>
+
+  <chapter id="ch_placeholder">
+    <title>Other Info</title>
+    <para> currently empty </para>
+  </chapter>
+</book>

Propchange: hadoop/zookeeper/trunk/src/docs/src/documentation/content/xdocs/zookeeperOtherInfo.xml
------------------------------------------------------------------------------
    svn:eol-style = native