You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by ph...@apache.org on 2017/11/09 18:14:00 UTC
svn commit: r1814761 [37/45] - in /zookeeper/site/trunk: content/
content/doc/r3.4.11/ content/doc/r3.4.11/api/ content/doc/r3.4.11/api/org/
content/doc/r3.4.11/api/org/apache/ content/doc/r3.4.11/api/org/apache/jute/
content/doc/r3.4.11/api/org/apache...
Added: zookeeper/site/trunk/content/doc/r3.4.11/recipes.html
URL: http://svn.apache.org/viewvc/zookeeper/site/trunk/content/doc/r3.4.11/recipes.html?rev=1814761&view=auto
==============================================================================
--- zookeeper/site/trunk/content/doc/r3.4.11/recipes.html (added)
+++ zookeeper/site/trunk/content/doc/r3.4.11/recipes.html Thu Nov 9 18:13:55 2017
@@ -0,0 +1,974 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.9">
+<meta name="Forrest-skin-name" content="pelt">
+<title>ZooKeeper Recipes and Solutions</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+ |breadtrail
+ +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a> > <a href="http://zookeeper.apache.org/">ZooKeeper</a> > <a href="http://zookeeper.apache.org/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+ |header
+ +-->
+<div class="header">
+<!--+
+ |start group logo
+ +-->
+<div class="grouplogo">
+<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
+</div>
+<!--+
+ |end group logo
+ +-->
+<!--+
+ |start Project Logo
+ +-->
+<div class="projectlogo">
+<a href="http://zookeeper.apache.org/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="ZooKeeper: distributed coordination"></a>
+</div>
+<!--+
+ |end Project Logo
+ +-->
+<!--+
+ |start Search
+ +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="zookeeper.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">
+ <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+ |end search
+ +-->
+<!--+
+ |start Tabs
+ +-->
+<ul id="tabs">
+<li>
+<a class="unselected" href="http://zookeeper.apache.org/">Project</a>
+</li>
+<li>
+<a class="unselected" href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/">Wiki</a>
+</li>
+<li class="current">
+<a class="selected" href="index.html">ZooKeeper 3.4 Documentation</a>
+</li>
+</ul>
+<!--+
+ |end Tabs
+ +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+ |start Subtabs
+ +-->
+<div id="level2tabs"></div>
+<!--+
+ |end Endtabs
+ +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+// --></script>
+</div>
+<!--+
+ |breadtrail
+ +-->
+<div class="breadtrail">
+
+
+ </div>
+<!--+
+ |start Menu, mainarea
+ +-->
+<!--+
+ |start Menu
+ +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Overview</div>
+<div id="menu_1.1" class="menuitemgroup">
+<div class="menuitem">
+<a href="index.html">Welcome</a>
+</div>
+<div class="menuitem">
+<a href="zookeeperOver.html">Overview</a>
+</div>
+<div class="menuitem">
+<a href="zookeeperStarted.html">Getting Started</a>
+</div>
+<div class="menuitem">
+<a href="releasenotes.html">Release Notes</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_selected_1.2', 'skin/')" id="menu_selected_1.2Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Developer</div>
+<div id="menu_selected_1.2" class="selectedmenuitemgroup" style="display: block;">
+<div class="menuitem">
+<a href="api/index.html">API Docs</a>
+</div>
+<div class="menuitem">
+<a href="zookeeperProgrammers.html">Programmer's Guide</a>
+</div>
+<div class="menuitem">
+<a href="javaExample.html">Java Example</a>
+</div>
+<div class="menuitem">
+<a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a>
+</div>
+<div class="menupage">
+<div class="menupagetitle">Recipes</div>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">BookKeeper</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="bookkeeperStarted.html">Getting started</a>
+</div>
+<div class="menuitem">
+<a href="bookkeeperOverview.html">Overview</a>
+</div>
+<div class="menuitem">
+<a href="bookkeeperConfig.html">Setup guide</a>
+</div>
+<div class="menuitem">
+<a href="bookkeeperProgrammer.html">Programmer's guide</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Admin & Ops</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="zookeeperAdmin.html">Administrator's Guide</a>
+</div>
+<div class="menuitem">
+<a href="zookeeperQuotas.html">Quota Guide</a>
+</div>
+<div class="menuitem">
+<a href="zookeeperJMX.html">JMX</a>
+</div>
+<div class="menuitem">
+<a href="zookeeperObservers.html">Observers Guide</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Contributor</div>
+<div id="menu_1.5" class="menuitemgroup">
+<div class="menuitem">
+<a href="zookeeperInternals.html">ZooKeeper Internals</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.6', 'skin/')" id="menu_1.6Title" class="menutitle">Miscellaneous</div>
+<div id="menu_1.6" class="menuitemgroup">
+<div class="menuitem">
+<a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER">Wiki</a>
+</div>
+<div class="menuitem">
+<a href="https://cwiki.apache.org/confluence/display/ZOOKEEPER/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="http://zookeeper.apache.org/mailing_lists.html">Mailing Lists</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+ |alternative credits
+ +-->
+<div id="credit2"></div>
+</div>
+<!--+
+ |end Menu
+ +-->
+<!--+
+ |start content
+ +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="recipes.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+ PDF</a>
+</div>
+<h1>ZooKeeper Recipes and Solutions</h1>
+<div id="front-matter">
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#ch_recipes">A Guide to Creating Higher-level Constructs with ZooKeeper</a>
+<ul class="minitoc">
+<li>
+<a href="#sc_outOfTheBox">Out of the Box Applications: Name Service, Configuration, Group
+ Membership</a>
+</li>
+<li>
+<a href="#sc_recipes_eventHandles">Barriers</a>
+<ul class="minitoc">
+<li>
+<a href="#sc_doubleBarriers">Double Barriers</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#sc_recipes_Queues">Queues</a>
+<ul class="minitoc">
+<li>
+<a href="#sc_recipes_priorityQueues">Priority Queues</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#sc_recipes_Locks">Locks</a>
+<ul class="minitoc">
+<li>
+<a href="#Shared+Locks">Shared Locks</a>
+</li>
+<li>
+<a href="#sc_recoverableSharedLocks">Recoverable Shared Locks</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#sc_recipes_twoPhasedCommit">Two-phased Commit</a>
+</li>
+<li>
+<a href="#sc_leaderElection">Leader Election</a>
+</li>
+</ul>
+</li>
+</ul>
+</div>
+</div>
+
+
+
+
+
+<a name="ch_recipes"></a>
+<h2 class="h3">A Guide to Creating Higher-level Constructs with ZooKeeper</h2>
+<div class="section">
+<p>In this article, you'll find guidelines for using
+ ZooKeeper to implement higher order functions. All of them are conventions
+ implemented at the client and do not require special support from
+ ZooKeeper. Hopfully the community will capture these conventions in client-side libraries
+ to ease their use and to encourage standardization.</p>
+<p>One of the most interesting things about ZooKeeper is that even
+ though ZooKeeper uses <em>asynchronous</em> notifications, you
+ can use it to build <em>synchronous</em> consistency
+ primitives, such as queues and locks. As you will see, this is possible
+ because ZooKeeper imposes an overall order on updates, and has mechanisms
+ to expose this ordering.</p>
+<p>Note that the recipes below attempt to employ best practices. In
+ particular, they avoid polling, timers or anything else that would result
+ in a "herd effect", causing bursts of traffic and limiting
+ scalability.</p>
+<p>There are many useful functions that can be imagined that aren't
+ included here - revocable read-write priority locks, as just one example.
+ And some of the constructs mentioned here - locks, in particular -
+ illustrate certain points, even though you may find other constructs, such
+ as event handles or queues, a more practical means of performing the same
+ function. In general, the examples in this section are designed to
+ stimulate thought.</p>
+<a name="sc_outOfTheBox"></a>
+<h3 class="h4">Out of the Box Applications: Name Service, Configuration, Group
+ Membership</h3>
+<p>Name service and configuration are two of the primary applications
+ of ZooKeeper. These two functions are provided directly by the ZooKeeper
+ API.</p>
+<p>Another function directly provided by ZooKeeper is <em>group
+ membership</em>. The group is represented by a node. Members of the
+ group create ephemeral nodes under the group node. Nodes of the members
+ that fail abnormally will be removed automatically when ZooKeeper detects
+ the failure.</p>
+<a name="sc_recipes_eventHandles"></a>
+<h3 class="h4">Barriers</h3>
+<p>Distributed systems use <em>barriers</em>
+ to block processing of a set of nodes until a condition is met
+ at which time all the nodes are allowed to proceed. Barriers are
+ implemented in ZooKeeper by designating a barrier node. The
+ barrier is in place if the barrier node exists. Here's the
+ pseudo code:</p>
+<ol>
+
+<li>
+
+<p>Client calls the ZooKeeper API's <strong>exists()</strong> function on the barrier node, with
+ <em>watch</em> set to true.</p>
+
+</li>
+
+
+<li>
+
+<p>If <strong>exists()</strong> returns false, the
+ barrier is gone and the client proceeds</p>
+
+</li>
+
+
+<li>
+
+<p>Else, if <strong>exists()</strong> returns true,
+ the clients wait for a watch event from ZooKeeper for the barrier
+ node.</p>
+
+</li>
+
+
+<li>
+
+<p>When the watch event is triggered, the client reissues the
+ <strong>exists( )</strong> call, again waiting until
+ the barrier node is removed.</p>
+
+</li>
+
+</ol>
+<a name="sc_doubleBarriers"></a>
+<h4>Double Barriers</h4>
+<p>Double barriers enable clients to synchronize the beginning and
+ the end of a computation. When enough processes have joined the barrier,
+ processes start their computation and leave the barrier once they have
+ finished. This recipe shows how to use a ZooKeeper node as a
+ barrier.</p>
+<p>The pseudo code in this recipe represents the barrier node as
+ <em>b</em>. Every client process <em>p</em>
+ registers with the barrier node on entry and unregisters when it is
+ ready to leave. A node registers with the barrier node via the <strong>Enter</strong> procedure below, it waits until
+ <em>x</em> client process register before proceeding with
+ the computation. (The <em>x</em> here is up to you to
+ determine for your system.)</p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+
+
+<tr>
+
+<td><strong>Enter</strong></td>
+
+ <td><strong>Leave</strong></td>
+
+</tr>
+
+
+<tr>
+
+<td>
+<ol>
+
+<li>
+
+<p>Create a name <em><em>n</em> =
+ <em>b</em>+“/”+<em>p</em></em>
+</p>
+
+</li>
+
+
+<li>
+
+<p>Set watch: <strong>exists(<em>b</em> + ‘‘/ready’’,
+ true)</strong>
+</p>
+
+</li>
+
+
+<li>
+
+<p>Create child: <strong>create(
+ <em>n</em>, EPHEMERAL)</strong>
+</p>
+
+</li>
+
+
+<li>
+
+<p>
+<strong>L = getChildren(b,
+ false)</strong>
+</p>
+
+</li>
+
+
+<li>
+
+<p>if fewer children in L than<em>
+ x</em>, wait for watch event</p>
+
+</li>
+
+
+<li>
+
+<p>else <strong>create(b + ‘‘/ready’’,
+ REGULAR)</strong>
+</p>
+
+</li>
+
+</ol>
+</td>
+
+ <td>
+<ol>
+
+<li>
+
+<p>
+<strong>L = getChildren(b,
+ false)</strong>
+</p>
+
+</li>
+
+
+<li>
+
+<p>if no children, exit</p>
+
+</li>
+
+
+<li>
+
+<p>if <em>p</em> is only process node in
+ L, delete(n) and exit</p>
+
+</li>
+
+
+<li>
+
+<p>if <em>p</em> is the lowest process
+ node in L, wait on highest process node in L</p>
+
+</li>
+
+
+<li>
+
+<p>else <strong>delete(<em>n</em>) </strong>if
+ still exists and wait on lowest process node in L</p>
+
+</li>
+
+
+<li>
+
+<p>goto 1</p>
+
+</li>
+
+</ol>
+</td>
+
+</tr>
+
+
+</table>
+<p>On entering, all processes watch on a ready node and
+ create an ephemeral node as a child of the barrier node. Each process
+ but the last enters the barrier and waits for the ready node to appear
+ at line 5. The process that creates the xth node, the last process, will
+ see x nodes in the list of children and create the ready node, waking up
+ the other processes. Note that waiting processes wake up only when it is
+ time to exit, so waiting is efficient.
+ </p>
+<p>On exit, you can't use a flag such as <em>ready</em>
+ because you are watching for process nodes to go away. By using
+ ephemeral nodes, processes that fail after the barrier has been entered
+ do not prevent correct processes from finishing. When processes are
+ ready to leave, they need to delete their process nodes and wait for all
+ other processes to do the same.</p>
+<p>Processes exit when there are no process nodes left as children of
+ <em>b</em>. However, as an efficiency, you can use the
+ lowest process node as the ready flag. All other processes that are
+ ready to exit watch for the lowest existing process node to go away, and
+ the owner of the lowest process watches for any other process node
+ (picking the highest for simplicity) to go away. This means that only a
+ single process wakes up on each node deletion except for the last node,
+ which wakes up everyone when it is removed.</p>
+<a name="sc_recipes_Queues"></a>
+<h3 class="h4">Queues</h3>
+<p>Distributed queues are a common data structure. To implement a
+ distributed queue in ZooKeeper, first designate a znode to hold the queue,
+ the queue node. The distributed clients put something into the queue by
+ calling create() with a pathname ending in "queue-", with the
+ <em>sequence</em> and <em>ephemeral</em> flags in
+ the create() call set to true. Because the <em>sequence</em>
+ flag is set, the new pathnames will have the form
+ _path-to-queue-node_/queue-X, where X is a monotonic increasing number. A
+ client that wants to be removed from the queue calls ZooKeeper's <strong>getChildren( )</strong> function, with
+ <em>watch</em> set to true on the queue node, and begins
+ processing nodes with the lowest number. The client does not need to issue
+ another <strong>getChildren( )</strong> until it exhausts
+ the list obtained from the first <strong>getChildren(
+ )</strong> call. If there are are no children in the queue node, the
+ reader waits for a watch notification to check the queue again.</p>
+<div class="note">
+<div class="label">Note</div>
+<div class="content">
+
+<p>There now exists a Queue implementation in ZooKeeper
+ recipes directory. This is distributed with the release --
+ src/recipes/queue directory of the release artifact.
+ </p>
+
+</div>
+</div>
+<a name="sc_recipes_priorityQueues"></a>
+<h4>Priority Queues</h4>
+<p>To implement a priority queue, you need only make two simple
+ changes to the generic <a href="#sc_recipes_Queues">queue
+ recipe</a> . First, to add to a queue, the pathname ends with
+ "queue-YY" where YY is the priority of the element with lower numbers
+ representing higher priority (just like UNIX). Second, when removing
+ from the queue, a client uses an up-to-date children list meaning that
+ the client will invalidate previously obtained children lists if a watch
+ notification triggers for the queue node.</p>
+<a name="sc_recipes_Locks"></a>
+<h3 class="h4">Locks</h3>
+<p>Fully distributed locks that are globally synchronous, meaning at
+ any snapshot in time no two clients think they hold the same lock. These
+ can be implemented using ZooKeeeper. As with priority queues, first define
+ a lock node.</p>
+<div class="note">
+<div class="label">Note</div>
+<div class="content">
+
+<p>There now exists a Lock implementation in ZooKeeper
+ recipes directory. This is distributed with the release --
+ src/recipes/lock directory of the release artifact.
+ </p>
+
+</div>
+</div>
+<p>Clients wishing to obtain a lock do the following:</p>
+<ol>
+
+<li>
+
+<p>Call <strong>create( )</strong> with a pathname
+ of "_locknode_/lock-" and the <em>sequence</em> and
+ <em>ephemeral</em> flags set.</p>
+
+</li>
+
+
+<li>
+
+<p>Call <strong>getChildren( )</strong> on the lock
+ node <em>without</em> setting the watch flag (this is
+ important to avoid the herd effect).</p>
+
+</li>
+
+
+<li>
+
+<p>If the pathname created in step <strong>1</strong> has the lowest sequence number suffix, the
+ client has the lock and the client exits the protocol.</p>
+
+</li>
+
+
+<li>
+
+<p>The client calls <strong>exists( )</strong> with
+ the watch flag set on the path in the lock directory with the next
+ lowest sequence number.</p>
+
+</li>
+
+
+<li>
+
+<p>if <strong>exists( )</strong> returns false, go
+ to step <strong>2</strong>. Otherwise, wait for a
+ notification for the pathname from the previous step before going to
+ step <strong>2</strong>.</p>
+
+</li>
+
+</ol>
+<p>The unlock protocol is very simple: clients wishing to release a
+ lock simply delete the node they created in step 1.</p>
+<p>Here are a few things to notice:</p>
+<ul>
+
+<li>
+
+<p>The removal of a node will only cause one client to wake up
+ since each node is watched by exactly one client. In this way, you
+ avoid the herd effect.</p>
+
+</li>
+
+</ul>
+<ul>
+
+<li>
+
+<p>There is no polling or timeouts.</p>
+
+</li>
+
+</ul>
+<ul>
+
+<li>
+
+<p>Because of the way you implement locking, it is easy to see the
+ amount of lock contention, break locks, debug locking problems,
+ etc.</p>
+
+</li>
+
+</ul>
+<a name="Shared+Locks"></a>
+<h4>Shared Locks</h4>
+<p>You can implement shared locks by with a few changes to the lock
+ protocol:</p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+
+
+<tr>
+
+<td><strong>Obtaining a read
+ lock:</strong></td>
+
+ <td><strong>Obtaining a write
+ lock:</strong></td>
+
+</tr>
+
+
+<tr>
+
+<td>
+<ol>
+
+<li>
+
+<p>Call <strong>create( )</strong> to
+ create a node with pathname
+ "<span class="codefrag filename">_locknode_/read-</span>". This is the
+ lock node use later in the protocol. Make sure to set both
+ the <em>sequence</em> and
+ <em>ephemeral</em> flags.</p>
+
+</li>
+
+
+<li>
+
+<p>Call <strong>getChildren( )</strong>
+ on the lock node <em>without</em> setting the
+ <em>watch</em> flag - this is important, as it
+ avoids the herd effect.</p>
+
+</li>
+
+
+<li>
+
+<p>If there are no children with a pathname starting
+ with "<span class="codefrag filename">write-</span>" and having a lower
+ sequence number than the node created in step <strong>1</strong>, the client has the lock and can
+ exit the protocol. </p>
+
+</li>
+
+
+<li>
+
+<p>Otherwise, call <strong>exists(
+ )</strong>, with <em>watch</em> flag, set on
+ the node in lock directory with pathname staring with
+ "<span class="codefrag filename">write-</span>" having the next lowest
+ sequence number.</p>
+
+</li>
+
+
+<li>
+
+<p>If <strong>exists( )</strong>
+ returns <em>false</em>, goto step <strong>2</strong>.</p>
+
+</li>
+
+
+<li>
+
+<p>Otherwise, wait for a notification for the pathname
+ from the previous step before going to step <strong>2</strong>
+</p>
+
+</li>
+
+</ol>
+</td>
+
+ <td>
+<ol>
+
+<li>
+
+<p>Call <strong>create( )</strong> to
+ create a node with pathname
+ "<span class="codefrag filename">_locknode_/write-</span>". This is the
+ lock node spoken of later in the protocol. Make sure to
+ set both <em>sequence</em> and
+ <em>ephemeral</em> flags.</p>
+
+</li>
+
+
+<li>
+
+<p>Call <strong>getChildren( )
+ </strong> on the lock node <em>without</em>
+ setting the <em>watch</em> flag - this is
+ important, as it avoids the herd effect.</p>
+
+</li>
+
+
+<li>
+
+<p>If there are no children with a lower sequence
+ number than the node created in step <strong>1</strong>, the client has the lock and the
+ client exits the protocol.</p>
+
+</li>
+
+
+<li>
+
+<p>Call <strong>exists( ),</strong>
+ with <em>watch</em> flag set, on the node with
+ the pathname that has the next lowest sequence
+ number.</p>
+
+</li>
+
+
+<li>
+
+<p>If <strong>exists( )</strong>
+ returns <em>false</em>, goto step <strong>2</strong>. Otherwise, wait for a
+ notification for the pathname from the previous step
+ before going to step <strong>2</strong>.</p>
+
+</li>
+
+</ol>
+</td>
+
+</tr>
+
+
+</table>
+<div class="note">
+<div class="label">Note</div>
+<div class="content">
+
+<p>It might appear that this recipe creates a herd effect:
+ when there is a large group of clients waiting for a read
+ lock, and all getting notified more or less simultaneously
+ when the "<span class="codefrag filename">write-</span>" node with the lowest
+ sequence number is deleted. In fact. that's valid behavior:
+ as all those waiting reader clients should be released since
+ they have the lock. The herd effect refers to releasing a
+ "herd" when in fact only a single or a small number of
+ machines can proceed.
+ </p>
+
+</div>
+</div>
+<a name="sc_recoverableSharedLocks"></a>
+<h4>Recoverable Shared Locks</h4>
+<p>With minor modifications to the Shared Lock protocol, you make
+ shared locks revocable by modifying the shared lock protocol:</p>
+<p>In step <strong>1</strong>, of both obtain reader
+ and writer lock protocols, call <strong>getData(
+ )</strong> with <em>watch</em> set, immediately after the
+ call to <strong>create( )</strong>. If the client
+ subsequently receives notification for the node it created in step
+ <strong>1</strong>, it does another <strong>getData( )</strong> on that node, with
+ <em>watch</em> set and looks for the string "unlock", which
+ signals to the client that it must release the lock. This is because,
+ according to this shared lock protocol, you can request the client with
+ the lock give up the lock by calling <strong>setData()
+ </strong> on the lock node, writing "unlock" to that node.</p>
+<p>Note that this protocol requires the lock holder to consent to
+ releasing the lock. Such consent is important, especially if the lock
+ holder needs to do some processing before releasing the lock. Of course
+ you can always implement <em>Revocable Shared Locks with Freaking
+ Laser Beams</em> by stipulating in your protocol that the revoker
+ is allowed to delete the lock node if after some length of time the lock
+ isn't deleted by the lock holder.</p>
+<a name="sc_recipes_twoPhasedCommit"></a>
+<h3 class="h4">Two-phased Commit</h3>
+<p>A two-phase commit protocol is an algorithm that lets all clients in
+ a distributed system agree either to commit a transaction or abort.</p>
+<p>In ZooKeeper, you can implement a two-phased commit by having a
+ coordinator create a transaction node, say "/app/Tx", and one child node
+ per participating site, say "/app/Tx/s_i". When coordinator creates the
+ child node, it leaves the content undefined. Once each site involved in
+ the transaction receives the transaction from the coordinator, the site
+ reads each child node and sets a watch. Each site then processes the query
+ and votes "commit" or "abort" by writing to its respective node. Once the
+ write completes, the other sites are notified, and as soon as all sites
+ have all votes, they can decide either "abort" or "commit". Note that a
+ node can decide "abort" earlier if some site votes for "abort".</p>
+<p>An interesting aspect of this implementation is that the only role
+ of the coordinator is to decide upon the group of sites, to create the
+ ZooKeeper nodes, and to propagate the transaction to the corresponding
+ sites. In fact, even propagating the transaction can be done through
+ ZooKeeper by writing it in the transaction node.</p>
+<p>There are two important drawbacks of the approach described above.
+ One is the message complexity, which is O(n²). The second is the
+ impossibility of detecting failures of sites through ephemeral nodes. To
+ detect the failure of a site using ephemeral nodes, it is necessary that
+ the site create the node.</p>
+<p>To solve the first problem, you can have only the coordinator
+ notified of changes to the transaction nodes, and then notify the sites
+ once coordinator reaches a decision. Note that this approach is scalable,
+ but it's is slower too, as it requires all communication to go through the
+ coordinator.</p>
+<p>To address the second problem, you can have the coordinator
+ propagate the transaction to the sites, and have each site creating its
+ own ephemeral node.</p>
+<a name="sc_leaderElection"></a>
+<h3 class="h4">Leader Election</h3>
+<p>A simple way of doing leader election with ZooKeeper is to use the
+ <strong>SEQUENCE|EPHEMERAL</strong> flags when creating
+ znodes that represent "proposals" of clients. The idea is to have a znode,
+ say "/election", such that each znode creates a child znode "/election/n_"
+ with both flags SEQUENCE|EPHEMERAL. With the sequence flag, ZooKeeper
+ automatically appends a sequence number that is greater that any one
+ previously appended to a child of "/election". The process that created
+ the znode with the smallest appended sequence number is the leader.
+ </p>
+<p>That's not all, though. It is important to watch for failures of the
+ leader, so that a new client arises as the new leader in the case the
+ current leader fails. A trivial solution is to have all application
+ processes watching upon the current smallest znode, and checking if they
+ are the new leader when the smallest znode goes away (note that the
+ smallest znode will go away if the leader fails because the node is
+ ephemeral). But this causes a herd effect: upon of failure of the current
+ leader, all other processes receive a notification, and execute
+ getChildren on "/election" to obtain the current list of children of
+ "/election". If the number of clients is large, it causes a spike on the
+ number of operations that ZooKeeper servers have to process. To avoid the
+ herd effect, it is sufficient to watch for the next znode down on the
+ sequence of znodes. If a client receives a notification that the znode it
+ is watching is gone, then it becomes the new leader in the case that there
+ is no smaller znode. Note that this avoids the herd effect by not having
+ all clients watching the same znode. </p>
+<p>Here's the pseudo code:</p>
+<p>Let ELECTION be a path of choice of the application. To volunteer to
+ be a leader: </p>
+<ol>
+
+<li>
+
+<p>Create znode z with path "ELECTION/n_" with both SEQUENCE and
+ EPHEMERAL flags;</p>
+
+</li>
+
+
+<li>
+
+<p>Let C be the children of "ELECTION", and i be the sequence
+ number of z;</p>
+
+</li>
+
+
+<li>
+
+<p>Watch for changes on "ELECTION/n_j", where j is the largest
+ sequence number such that j < i and n_j is a znode in C;</p>
+
+</li>
+
+</ol>
+<p>Upon receiving a notification of znode deletion: </p>
+<ol>
+
+<li>
+
+<p>Let C be the new set of children of ELECTION; </p>
+
+</li>
+
+
+<li>
+
+<p>If z is the smallest node in C, then execute leader
+ procedure;</p>
+
+</li>
+
+
+<li>
+
+<p>Otherwise, watch for changes on "ELECTION/n_j", where j is the
+ largest sequence number such that j < i and n_j is a znode in C;
+ </p>
+
+</li>
+
+</ol>
+<p>Note that the znode having no preceding znode on the list of
+ children does not imply that the creator of this znode is aware that it is
+ the current leader. Applications may consider creating a separate znode
+ to acknowledge that the leader has executed the leader procedure. </p>
+</div>
+
+<p align="right">
+<font size="-2"></font>
+</p>
+</div>
+<!--+
+ |end content
+ +-->
+<div class="clearboth"> </div>
+</div>
+<div id="footer">
+<!--+
+ |start bottomstrip
+ +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+// --></script>
+</div>
+<div class="copyright">
+ Copyright ©
+ <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<!--+
+ |end bottomstrip
+ +-->
+</div>
+</body>
+</html>
Added: zookeeper/site/trunk/content/doc/r3.4.11/recipes.pdf
URL: http://svn.apache.org/viewvc/zookeeper/site/trunk/content/doc/r3.4.11/recipes.pdf?rev=1814761&view=auto
==============================================================================
Binary file - no diff available.
Propchange: zookeeper/site/trunk/content/doc/r3.4.11/recipes.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream