You are viewing a plain text version of this content. The canonical link for it is here.
Posted to derby-commits@db.apache.org by jt...@apache.org on 2005/04/12 00:22:09 UTC

svn commit: r160972 - in incubator/derby/site/trunk: build/site/ build/site/papers/ build/site/releases/ src/documentation/content/xdocs/ src/documentation/content/xdocs/papers/

Author: jta
Date: Mon Apr 11 15:22:05 2005
New Revision: 160972

URL: http://svn.apache.org/viewcvs?view=rev&rev=160972
Log:
Committed optimizer writeup by Dibyendu Majumdar <di...@mazumdar.demon.co.uk>

Added:
    incubator/derby/site/trunk/build/site/papers/optimizer.html
    incubator/derby/site/trunk/src/documentation/content/xdocs/papers/optimizer.xml
Modified:
    incubator/derby/site/trunk/build/site/linkmap.html
    incubator/derby/site/trunk/build/site/papers/ApacheConUs04.html
    incubator/derby/site/trunk/build/site/papers/JDBCImplementation.html
    incubator/derby/site/trunk/build/site/papers/MiscPresentations.html
    incubator/derby/site/trunk/build/site/papers/derby_arch.html
    incubator/derby/site/trunk/build/site/papers/derby_htw.html
    incubator/derby/site/trunk/build/site/papers/derby_web.html
    incubator/derby/site/trunk/build/site/papers/fortune_tut.html
    incubator/derby/site/trunk/build/site/papers/index.html
    incubator/derby/site/trunk/build/site/papers/logformats.html
    incubator/derby/site/trunk/build/site/papers/pageformats.html
    incubator/derby/site/trunk/build/site/papers/recovery.html
    incubator/derby/site/trunk/build/site/releases/release-10.0.2.1.html
    incubator/derby/site/trunk/src/documentation/content/xdocs/papers/index.xml
    incubator/derby/site/trunk/src/documentation/content/xdocs/site.xml

Modified: incubator/derby/site/trunk/build/site/linkmap.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/linkmap.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/linkmap.html (original)
+++ incubator/derby/site/trunk/build/site/linkmap.html Mon Apr 11 15:22:05 2005
@@ -177,6 +177,9 @@
 <a title="" href="papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>
@@ -435,6 +438,10 @@
       
 <li>
 <a href="papers/recovery.html">Logging &amp; Recovery</a>&nbsp;&nbsp;&nbsp;_________________________&nbsp;&nbsp;<em>recover</em>
+</li>
+      
+<li>
+<a href="papers/optimizer.html">Optimizer</a>&nbsp;&nbsp;&nbsp;_________________________&nbsp;&nbsp;<em>optimizer</em>
 </li>
       
 <li>

Modified: incubator/derby/site/trunk/build/site/papers/ApacheConUs04.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/ApacheConUs04.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/ApacheConUs04.html (original)
+++ incubator/derby/site/trunk/build/site/papers/ApacheConUs04.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/JDBCImplementation.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/JDBCImplementation.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/JDBCImplementation.html (original)
+++ incubator/derby/site/trunk/build/site/papers/JDBCImplementation.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/MiscPresentations.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/MiscPresentations.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/MiscPresentations.html (original)
+++ incubator/derby/site/trunk/build/site/papers/MiscPresentations.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/derby_arch.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/derby_arch.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/derby_arch.html (original)
+++ incubator/derby/site/trunk/build/site/papers/derby_arch.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/derby_htw.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/derby_htw.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/derby_htw.html (original)
+++ incubator/derby/site/trunk/build/site/papers/derby_htw.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/derby_web.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/derby_web.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/derby_web.html (original)
+++ incubator/derby/site/trunk/build/site/papers/derby_web.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/fortune_tut.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/fortune_tut.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/fortune_tut.html (original)
+++ incubator/derby/site/trunk/build/site/papers/fortune_tut.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/index.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/index.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/index.html (original)
+++ incubator/derby/site/trunk/build/site/papers/index.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>
@@ -242,6 +245,9 @@
 <li>Logging &amp; Recovery: <a href="recovery.html">Derby Logging and Recovery</a>
 </li>
         
+<li>Optimizer: <a href="optimizer.html">Derby Optimizer Design</a>
+</li>
+        
 <li>Type System: <a class="external" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Derby Type System</a>
         
 </li>
@@ -250,7 +256,7 @@
 </div>
 
 
-<a name="N1004D"></a><a name="Presentations"></a>
+<a name="N10053"></a><a name="Presentations"></a>
 <h2 class="boxed">Presentations</h2>
 <div class="section">
 <ul>
@@ -309,7 +315,7 @@
 </div>
 
 
-<a name="N100B3"></a><a name="Instruction"></a>
+<a name="N100B9"></a><a name="Instruction"></a>
 <h2 class="boxed">Instruction</h2>
 <div class="section">
 <ul>
@@ -338,7 +344,7 @@
 </div>
 
 
-<a name="N100D2"></a><a name="How+to+Contribute+Papers"></a>
+<a name="N100D8"></a><a name="How+to+Contribute+Papers"></a>
 <h2 class="boxed">How to Contribute Papers</h2>
 <div class="section">
 <p>
@@ -411,7 +417,7 @@
 
 
 <p>
-<em>Last Updated: March 30, 2005</em>
+<em>Last Updated: April 11, 2005</em>
 </p>
 
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/logformats.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/logformats.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/logformats.html (original)
+++ incubator/derby/site/trunk/build/site/papers/logformats.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Added: incubator/derby/site/trunk/build/site/papers/optimizer.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/optimizer.html?view=auto&rev=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/optimizer.html (added)
+++ incubator/derby/site/trunk/build/site/papers/optimizer.html Mon Apr 11 15:22:05 2005
@@ -0,0 +1,522 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.6">
+<meta name="Forrest-skin-name" content="pelt">
+<title>Derby Optimizer Design</title>
+<link type="text/css" href="../skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="../skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="../skin/print.css" rel="stylesheet">
+<link type="text/css" href="../skin/profile.css" rel="stylesheet">
+<script src="../skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="../skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="../skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="../">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">apache</a> &gt; <a href="http://incubator.apache.org/">incubator</a><script src="../skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://incubator.apache.org"><img class="logoImage" alt="" src="../images/apache-incubator-logo.png" title=""></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://incubator.apache.org/derby/"><img class="logoImage" alt="Derby" src="../images/derby-logo.jpg" title="Derby is a zero admin java RDBMS."></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google:');" value="Search the site with google:" size="25" name="q" id="query" type="text">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li>
+<a class="base-not-selected" href="../index.html">Home</a>
+</li>
+<li>
+<a class="base-not-selected" href="../derby_comm.html">Community</a>
+</li>
+<li>
+<a class="base-not-selected" href="../manuals/index.html">Manuals</a>
+</li>
+<li class="current">
+<a class="base-selected" href="../papers/index.html">Papers</a>
+</li>
+<li>
+<a class="base-not-selected" href="../integrate/index.html">Integration</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript" language="JavaScript"><!--
+              document.write("Published: " + document.lastModified);
+              //  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+             
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_1.1', '../skin/')" id="menu_1.1Title" class="menutitle">About</div>
+<div id="menu_1.1" class="menuitemgroup">
+<div class="menuitem">
+<a title="" href="../papers/index.html">Index</a>
+</div>
+<div class="menuitem">
+<a title="" href="../papers/derby_web.html">Derby Web Site</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_selected_1.2', '../skin/')" id="menu_selected_1.2Title" class="menutitle" style="background-image: url('../skin/images/chapter_open.gif');">Derby Engine</div>
+<div id="menu_selected_1.2" class="selectedmenuitemgroup" style="display: block;">
+<div class="menuitem">
+<a title="" href="../papers/derby_arch.html">Architecture</a>
+</div>
+<div class="menuitem">
+<a title="" href="../papers/pageformats.html">Disk Page Format</a>
+</div>
+<div class="menuitem">
+<a title="" href="../papers/derby_htw.html">How Things Work</a>
+</div>
+<div class="menuitem">
+<a title="" href="../papers/Intersect-design.html">Intersect &amp; Except</a>
+</div>
+<div class="menuitem">
+<a title="" href="../papers/JDBCImplementation.html">JDBC</a>
+</div>
+<div class="menuitem">
+<a title="" href="../papers/logformats.html">Log Format</a>
+</div>
+<div class="menuitem">
+<a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
+</div>
+<div class="menupage">
+<div class="menupagetitle">Optimizer</div>
+</div>
+<div class="menuitem">
+<a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', '../skin/')" id="menu_1.3Title" class="menutitle">Presentations</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a title="" href="../papers/MiscPresentations.html">Colorado 2004</a>
+</div>
+<div class="menuitem">
+<a title="" href="../papers/ApacheConUs04.html">ApacheCon US '04</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', '../skin/')" id="menu_1.4Title" class="menutitle">Instruction</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a title="" href="../papers/fortune_tut.html">Embedded Tutorial</a>
+</div>
+<div class="menuitem">
+<a title="" href="http://objectinnovations.com/CourseOutlines/168.html">Object Innovations JDBC Course</a>
+</div>
+<div class="menuitem">
+<a title="" href="http://www.ibm.com/developerworks/edu/dm-dw-dm-0412kubasta-i.html?S_TACT=104AHW11&S_CMP=LIB">Cloudscape Detective</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="../skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div id="skinconf-txtlink"></div>
+<div class="trail">
+	        Font size: 
+	          &nbsp;<input value="Reset" class="resetfont" title="Reset text" onclick="ndeSetTextSize('reset'); return false;" type="button">      
+	          &nbsp;<input value="-a" class="smallerfont" title="Shrink text" onclick="ndeSetTextSize('decr'); return false;" type="button">
+	          &nbsp;<input value="+a" class="biggerfont" title="Enlarge text" onclick="ndeSetTextSize('incr'); return false;" type="button">
+</div>
+<h1>Derby Optimizer Design</h1>
+<div class="abstract">This document describes the Derby Optimizer. This is a work-in-progress 
+        derived from Javadoc comments and from explanations Jeffrey Lichtman and 
+        others posted to the Derby lists. Please post questions, comments, and 
+        corrections to derby-dev@db.apache.org. </div>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#overview">Overview</a>
+</li>
+<li>
+<a href="#Example+of+a+5-way+Join">Example of a 5-way Join</a>
+</li>
+<li>
+<a href="#Potential+Improvements+to+the+Optimizer">Potential Improvements to the Optimizer</a>
+</li>
+</ul>
+</div> 
+      
+<a name="N1000F"></a><a name="overview"></a>
+<h2 class="boxed">Overview</h2>
+<div class="section">
+<div class="frame note">
+<div class="label">Note</div>
+<div class="content">Jeffrey Lichtman wrote the original implementation of the optimizer.
+          What follows below is a description of the optimizer he posted to the
+          Derby mailing list.
+        </div>
+</div>
+<p> The optimizer currently only considers left-deep trees. That is, when 
+          looking at joins, it doesn't consider join nodes to the right of other 
+          join nodes - the join trees it considers are skewed to the left. I thought 
+          this would be a good first implementation of the optimizer. Bushy trees 
+          (where the join tree can take any shape) are harder to implement but 
+          are often useful for big multi-way decision-support joins. </p>
+<p> During optimization the join order is represented as an array of Optimizables. 
+          The first element in the array is the leftmost table in the join tree, 
+          and the successive elements in the array are the joining tables in the 
+          left-deep tree. </p>
+<p> The optimizer does an exhaustive search of query plans, going through 
+          all the join orders and, for each join order, all of the access paths 
+          for that order. It remembers the cheapest complete plan it has found 
+          so far, and does cost-based search space pruning. That is, it doesn't 
+          consider plans that are more expensive that the best plan found so far. 
+        </p>
+<p> The optimizer goes through the search space depth-first. In other 
+          words, it adds a table to the join order, then goes through all the 
+          access paths for that table (choosing the cheapest one) before going 
+          on to the next position in the join order. When it gets all the way 
+          to the end of the join order, with all of the tables accounted for, 
+          it considers whether the plan it is looking at is the best one found 
+          so far. If so, it remembers it. Eventually, it exhausts all the tables 
+          to consider at a given depth in the join order, and it has to back up 
+          to the previous position in the join order and consider the next table 
+          there. </p>
+<p> Every time the optimizer adds a table to the prospective join order, 
+          it figures out which parts of the WHERE clause can be evaluated at that 
+          position in the order and pushes these expressions down to that position 
+          so they can be used there. For example, if you have a five-way join 
+          of tables t1, t2, t3, t4 and t5, and the current permutation being considered 
+          is t3 - t1 - t2 (with t3 as the outermost table and t2 as the innermost), 
+          it can evaluate the join "t3.c1 = t2.c2" at the third position in the 
+          join order, so when it adds table t2 it pushes the expression down to 
+          that level in the order. Later, when it removes t2 from the join order 
+          to consider some other table there, it pulls the expression back out 
+          of the join order. </p>
+<p> getNextPermutation() does the adding (and deletion) of tables to the 
+          join order under consideration by the optimizer. getNextDecoratedPermutation() 
+          goes through all the access paths for the current permutation (in the 
+          current implementation, it only has to consider the access paths for 
+          the innermost table in the join order, as the search is done depth-first). 
+          You can think of a decorated permutation as a join order with things 
+          like indexes and join strategies "decorating" the nodes. </p>
+<p> The Optimizable interface represents anything in the query that can 
+          have an access path. In practice an Optimizable is usually a base table, 
+          but it can be anything that appears in a FROM list (for example, standard 
+          SQL allows a UNION in the FROM list). Different Optimizables have different 
+          choices for access paths. The optimizer uses the nextAccessPath() method 
+          on Optimizable to step through the different access paths. These include 
+          different join strategies (such as nested loop and hash join) and different 
+          conglomerates (the base table and the different indexes). Sometimes 
+          the Optimizable has to decide whether a given access path is feasible 
+          (for example, hash join is only feasible for equijoins). </p>
+<p> I'm leaving out a whole lot of details, like how the optimizer does 
+          costing for sort avoidance and how it handles join order dependencies 
+         (e.g. when an "exists" or "in" subquery is flattened to an existence join, 
+         the join order musn't be inverted under the current implementation). </p>
+</div>
+      
+<a name="N10031"></a><a name="Example+of+a+5-way+Join"></a>
+<h2 class="boxed">Example of a 5-way Join</h2>
+<div class="section">
+<p> The optimizer looks at so many 
+          potential plans in a five-way join that it's not feasible to show all 
+          of them in an manually-written explanation. </p>
+<p> Let's take the following query: </p>
+<pre class="code"> 
+select *
+from t1, t2, t3, t4, t5
+where t1.c1 = t2.c2
+and    t1.c3 = t3.c4
+and    t3.c5 = t4.c6
+and    t4.c7 = t5.c8
+and    t1.c9 = 1
+and    t3.c10 = 2
+and    t5.c11 = 3          
+        </pre>
+<p> One possible way to execute this query is to take the tables in the 
+          order of the FROM clause. For each row in a table, join it with the 
+          matching rows from the next table to form a set of joined row. The plan 
+          would look something like this (I hope the formatting doesn't get screwed 
+          up): </p>
+<pre class="code"> 
+             JOIN
+             /      \
+          JOIN    t5
+           /      \
+        JOIN    t4
+        /     \
+    JOIN    t3
+   /       \
+t1       t2
+        </pre>
+<p> This is a left-deep tree. That is. it's skewed to the left. Let's 
+          assume for the sake of argument that each JOIN node is a nested-loop 
+          join. What this means is that each JOIN node gets a row from its left 
+          (outer) table and probes into its right (inner) table to find all the 
+          matching rows. For all but the leftmost JOIN node, the outer table is 
+          also a JOIN. So, at execution, this plan goes all the way down to the 
+          left, gets the first qualifying row from t1, then finds a matching row 
+          in t2. It then puts the matching rows from t1 and t2 together into a 
+          joined row and feeds it up to the JOIN node above it. This JOIN node 
+          uses its outer row to probe into t3 to find a matching row. When it 
+          finds such a row, it puts together its outer and inner rows into a joined 
+          row, which it feeds to the JOIN node above it. It keeps doing this all 
+          the way up the plan. When the top JOIN node finds a matching row in 
+          t5, it returns that row from the SELECT statement. </p>
+<p> More sophisticated optimizers consider "bushy" trees, which can take 
+          shapes other than the left-deep shape shown above. For example, it might 
+          consider a plan with the following join tree: </p>
+<pre class="code">
+          JOIN
+         /       \
+   JOIN       JOIN
+   /     \      /      \
+t1    t2    t3    JOIN
+                     /     \
+                    t4    t5
+        </pre>
+<p> As you can see, the tables are in the same order but the shape of 
+          the join tree is entirely different. As I mentioned in my original mail, 
+          bushy trees are harder to implement but they are good for some types 
+          of big decision-support queries. </p>
+<p> Because the Derby optimizer only models left-deep trees, it doesn't 
+          have to model the shape of the tree. All it has to model is the order 
+          of the tables in the tree (since the tree is always the same shape for 
+          a given number of tables). It does this the simple way: by using an 
+          array representing the assignment of tables to positions in the join 
+          order. </p>
+<p> The basic idea of a cost-based optimizer is to come up with an estimated 
+          cost for all the possible execution plans for a query and to choose 
+          the cheapest plan. The number of possible plans grows with the number 
+          of tables, indexes, join strategies, etc. Most optimizers do something 
+          to reduce the search space, so that for big queries the best plan (or 
+          a reasonable plan) is found in an acceptable length of time. One way 
+          the Derby optimizer prunes its search space is by skipping over plans 
+          that it knows will be more expensive than the best plan it's found so 
+          far. </p>
+<p> The optimizer does this by depth-first searching. That is, rather 
+          than coming up with a join order for all the tables in the query and 
+          then considering all the access paths for those tables, it adds one 
+          table at a time to the join order and figures out the best access path 
+          for that table (in its current spot in the join order) before going 
+          on to add another table to the join order. While doing this, it keeps 
+          track of the cost of the plan its considering so far. If, when it adds 
+          a table to the join order, it finds that this makes the current plan 
+          under consideration more costly than the best plan found so far, it 
+          abandons the consideration of that table in that position of the join 
+          order. By doing this, the optimizer can avoid considering many join 
+          orders. This is important when there are a lot of tables in the query, 
+          because the number of join orders is the factorial of the number of 
+          tables. </p>
+<p> For example, let's say that in the sample query given above, the optimizer 
+          has already found a complete plan with an estimated cost of 10000. Now 
+          suppose it is considering the following partial join order: </p>
+<p> (outer) t4 - t5 (inner) </p>
+<p> Let's say this partial plan has an estimated cost of 2000. Now suppose 
+          the optimizer considers placing the table t1 as the next table in the 
+          join order: </p>
+<p> (outer) t4 - t5 - t2 (inner) </p>
+<p> Note that the query has no direct join clause between t1 and either 
+          t4 or t5. The optimizer would go through all possible access paths for 
+          t2 in this context, and would see that with no useful qualification 
+          on the table it would have to do a full scan of t2 for every outer row 
+          resulting from the join of t4 and t5. If t2 is anything but a very small 
+          table, it could be expensive. Let's say the estimated total best cost 
+          for t2 in this position in the join order is 50000. That would make 
+          the total cost of the query equal to 52000, which is higher than the 
+          cost of the best plan found so far (10000). So it doesn't make sense 
+          to look at this join order any further. Rather than consider the following 
+          join orders: </p>
+<p> (outer) t4 - t5 - t2 - t1 - t3 (inner) </p>
+<p> (outer) t4 - t5 - t2 - t3 - t1 (inner) </p>
+<p> the optimizer abandons consideration of any plan starting with t4 
+          - t5 - t2. </p>
+</div>
+      
+<a name="N10077"></a><a name="Potential+Improvements+to+the+Optimizer"></a>
+<h2 class="boxed">Potential Improvements to the Optimizer</h2>
+<div class="section">
+<p> It's hard to consider the optimizer by itself. Many optimizer enhancements
+          would work with changes in other areas, especially execution. </p>
+<p> One area that I think needs work is hash joins. The current implementation 
+          uses an in-memory hash table. The optimizer avoids using the hash join 
+          strategy when it estimates that the hash table would use too much memory. 
+          There are a few problems with this: the optimizer doesn't really know 
+          how much memory is available, its estimates of memory usage are crude 
+          and possibly inaccurate, and it's possible for the query to fail if 
+          a hash table gets too big during execution. </p>
+<p> I would like to see hash tables that spill to disk. Ideally, the hash 
+          table should be an in-memory structure with a conglomerate as a backing 
+          store. I would want the backing store to be used only when necessary 
+          - that is, only when the hash table grows too big. The biggest problem 
+          with this idea is how to estimate the cost of building and maintaining 
+          the table. One approach could be to put a limit on the number of in-memory 
+          rows in the hash table and use a statistical formula for the cost of 
+          reading a row, using the number of in-memory rows versus the total number 
+          of rows to estimate the chances of finding a row in memory. </p>
+<p> Another approach could be to use weak references in managing the hash 
+          table (a weak reference is a Java construct that allows the garbage 
+          collector to nominate an object for garbage collection even when it 
+          has a reference to it). Weak references are useful for memory caches 
+          that adjust themselves to the memory available to the JVM. One of our 
+          original ideas for Derby (nee Cloudscape) is that it should be a low-maintenance 
+          DBMS, with little intervention required to keep a working system running. 
+          A self-managing cache could help with this - it would adjust itself 
+          to environments with different amounts of memory (although small-memory 
+          environments could hurt performance). I don't know how the optimizer 
+          would estimate the cost for building and maintaining a hash table in 
+          this case. </p>
+<p> I also think merge joins are worth considering, especially if nothing 
+          is done about hash joins. Merge joins are useful for many of the same 
+          types of queries as hash joins and, since they use the sorter (assuming 
+          the joined rows are not already ordered on the join colums) they can 
+          work even for large tables (because the sorter spills to disk if the 
+          data being sorted won't fit in memory). Merge joins can have a very 
+          low cost if the rows are already ordered (which they can be if there 
+          are indexes on the join columns). Merge joins can also work well with 
+          sort avoidance if the ordering for the merge is the same as for the 
+          ORDER BY clause. </p>
+<p> Switching gears, another problem is the cost of user-defined functions. 
+          What do you do with a query like this?: </p>
+<pre class="code"> 
+select *
+from t1, t2, t3
+where t1.c1 = user_defined_function(t2.c2)
+and t2.c3 = t3.c4
+        </pre>
+<p> If the user-defined function is cheap and there's an index on t1.c1, 
+          you may want to call the function for every row in t2 and use the result 
+          to probe into the index on t1. On the other hand, if the function is 
+          expensive, you may want to try to execute it as few times as possible, 
+          which could make it unfeasible to use it to probe into t1. Currently 
+          Derby has no modeling for the cost of user-defined functions and avoids 
+          pushing them down into the query plan (that is, it calculates user-defined 
+          functions as late as possible before returning the rows from the query). 
+        </p>
+<p> This may seem trivial, but keep in mind that a user-defined function 
+          can do anything, from something as simple as returning a constant to 
+          as complex as executing a query on another DBMS. It really can be important 
+          to know the cost of a user-defined function. </p>
+<p> One possible approach would be to have a way of telling the DBMS to 
+          execute the function and remember how long it takes, and then store 
+          this in a system catalog for the optimizer to use. Another approach 
+          would be to allow the user to register the cost of a function as low, 
+          medium or high. </p>
+<p> Switching gears again, another feature I think would be generically 
+          useful would be indexes on expressions (instead of just on columns). 
+          One potential use for this feature is case-independent searches (which 
+          can be done now, but which tend to be slow because the functions involved 
+          prevent the optimizer from using an index). The biggest problem here 
+          is the representation of index definitions in the SYSCONGLOMERATES system 
+          table (which assumes that an index column corresponds to a column in 
+          a base table). </p>
+<p> Another area for investigation is the flattening of subqueries to 
+          joins. Currently, the language compiler flattens IN and EXISTS subqueries 
+          to types of joins. This is good because it gives the optimizer more 
+          options in choosing query plans for these types of subqueries, and it 
+          also allows the optimizer to get better cost estimates (for complex 
+          technical reasons I won't go into here). There are other types of subqueries 
+          that could be flattened - for example, a NOT IN or NOT EXISTS subquery 
+          can often be flattened to an outer join. </p>
+<p> Another thing that could be done is to allow the optimizer to invert 
+          the join order of IN and EXISTS subqueries. As mentioned above, these 
+          subqueries are often flattened to joins. The joins are special, in that 
+          at execution it looks for only one match in the inner table per row 
+          from the outer table. This strategy requires that the table in the IN 
+          or EXISTS subquery remain inner to the joining table from outside the 
+          subquery. It would be possible to invert the join order if a sort were 
+          done on the subquery table to eliminate duplicate joining values. (Actually, 
+          I'm oversimplifying here, but I would have to write a treatise otherwise.) 
+        </p>
+</div>
+    
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+ Copyright &copy; 2004-2005 Apache Software Foundation</div>
+<div id="feedback">
+    Send feedback about the website to:
+  <a id="feedbackto" href="mailto:derby-dev@db.apache.org?subject=Feedback%C2%A0papers/optimizer.html">derby-dev@db.apache.org</a>
+</div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

Modified: incubator/derby/site/trunk/build/site/papers/pageformats.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/pageformats.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/pageformats.html (original)
+++ incubator/derby/site/trunk/build/site/papers/pageformats.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/papers/recovery.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/papers/recovery.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/papers/recovery.html (original)
+++ incubator/derby/site/trunk/build/site/papers/recovery.html Mon Apr 11 15:22:05 2005
@@ -141,6 +141,9 @@
 <div class="menupagetitle">Logging &amp; Recovery</div>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/build/site/releases/release-10.0.2.1.html
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/build/site/releases/release-10.0.2.1.html?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/build/site/releases/release-10.0.2.1.html (original)
+++ incubator/derby/site/trunk/build/site/releases/release-10.0.2.1.html Mon Apr 11 15:22:05 2005
@@ -177,6 +177,9 @@
 <a title="" href="../papers/recovery.html">Logging &amp; Recovery</a>
 </div>
 <div class="menuitem">
+<a title="" href="../papers/optimizer.html">Optimizer</a>
+</div>
+<div class="menuitem">
 <a title="" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Type System</a>
 </div>
 </div>

Modified: incubator/derby/site/trunk/src/documentation/content/xdocs/papers/index.xml
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/src/documentation/content/xdocs/papers/index.xml?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/src/documentation/content/xdocs/papers/index.xml (original)
+++ incubator/derby/site/trunk/src/documentation/content/xdocs/papers/index.xml Mon Apr 11 15:22:05 2005
@@ -26,6 +26,7 @@
 		Notes</a></li>
 	<li>Log Format: <a href="logformats.html">Derby Write Ahead Log Format</a></li>
 	<li>Logging &amp; Recovery: <a href="recovery.html">Derby Logging and Recovery</a></li>
+        <li>Optimizer: <a href="optimizer.html">Derby Optimizer Design</a></li>
         <li>Type System: <a href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html">Derby Type System</a>
         </li>
 </ul>
@@ -168,6 +169,6 @@
 
 </section>
 
-<p><em>Last Updated: March 30, 2005</em></p>
+<p><em>Last Updated: April 11, 2005</em></p>
 </body>
 </document>

Added: incubator/derby/site/trunk/src/documentation/content/xdocs/papers/optimizer.xml
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/src/documentation/content/xdocs/papers/optimizer.xml?view=auto&rev=160972
==============================================================================
--- incubator/derby/site/trunk/src/documentation/content/xdocs/papers/optimizer.xml (added)
+++ incubator/derby/site/trunk/src/documentation/content/xdocs/papers/optimizer.xml Mon Apr 11 15:22:05 2005
@@ -0,0 +1,292 @@
+<?xml version="1.0"?>
+  <!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+  <document> 
+    <header> 
+      <title>Derby Optimizer Design</title>
+      <abstract>This document describes the Derby Optimizer. This is a work-in-progress 
+        derived from Javadoc comments and from explanations Jeffrey Lichtman and 
+        others posted to the Derby lists. Please post questions, comments, and 
+        corrections to derby-dev@db.apache.org. </abstract>
+    </header>
+    <body> 
+      <section id="overview"> 
+        <title>Overview</title>
+        <note>Jeffrey Lichtman wrote the original implementation of the optimizer.
+          What follows below is a description of the optimizer he posted to the
+          Derby mailing list.
+        </note>
+        <p> The optimizer currently only considers left-deep trees. That is, when 
+          looking at joins, it doesn't consider join nodes to the right of other 
+          join nodes - the join trees it considers are skewed to the left. I thought 
+          this would be a good first implementation of the optimizer. Bushy trees 
+          (where the join tree can take any shape) are harder to implement but 
+          are often useful for big multi-way decision-support joins. </p>
+        <p> During optimization the join order is represented as an array of Optimizables. 
+          The first element in the array is the leftmost table in the join tree, 
+          and the successive elements in the array are the joining tables in the 
+          left-deep tree. </p>
+        <p> The optimizer does an exhaustive search of query plans, going through 
+          all the join orders and, for each join order, all of the access paths 
+          for that order. It remembers the cheapest complete plan it has found 
+          so far, and does cost-based search space pruning. That is, it doesn't 
+          consider plans that are more expensive that the best plan found so far. 
+        </p>
+        <p> The optimizer goes through the search space depth-first. In other 
+          words, it adds a table to the join order, then goes through all the 
+          access paths for that table (choosing the cheapest one) before going 
+          on to the next position in the join order. When it gets all the way 
+          to the end of the join order, with all of the tables accounted for, 
+          it considers whether the plan it is looking at is the best one found 
+          so far. If so, it remembers it. Eventually, it exhausts all the tables 
+          to consider at a given depth in the join order, and it has to back up 
+          to the previous position in the join order and consider the next table 
+          there. </p>
+        <p> Every time the optimizer adds a table to the prospective join order, 
+          it figures out which parts of the WHERE clause can be evaluated at that 
+          position in the order and pushes these expressions down to that position 
+          so they can be used there. For example, if you have a five-way join 
+          of tables t1, t2, t3, t4 and t5, and the current permutation being considered 
+          is t3 - t1 - t2 (with t3 as the outermost table and t2 as the innermost), 
+          it can evaluate the join "t3.c1 = t2.c2" at the third position in the 
+          join order, so when it adds table t2 it pushes the expression down to 
+          that level in the order. Later, when it removes t2 from the join order 
+          to consider some other table there, it pulls the expression back out 
+          of the join order. </p>
+        <p> getNextPermutation() does the adding (and deletion) of tables to the 
+          join order under consideration by the optimizer. getNextDecoratedPermutation() 
+          goes through all the access paths for the current permutation (in the 
+          current implementation, it only has to consider the access paths for 
+          the innermost table in the join order, as the search is done depth-first). 
+          You can think of a decorated permutation as a join order with things 
+          like indexes and join strategies "decorating" the nodes. </p>
+        <p> The Optimizable interface represents anything in the query that can 
+          have an access path. In practice an Optimizable is usually a base table, 
+          but it can be anything that appears in a FROM list (for example, standard 
+          SQL allows a UNION in the FROM list). Different Optimizables have different 
+          choices for access paths. The optimizer uses the nextAccessPath() method 
+          on Optimizable to step through the different access paths. These include 
+          different join strategies (such as nested loop and hash join) and different 
+          conglomerates (the base table and the different indexes). Sometimes 
+          the Optimizable has to decide whether a given access path is feasible 
+          (for example, hash join is only feasible for equijoins). </p>
+        <p> I'm leaving out a whole lot of details, like how the optimizer does 
+          costing for sort avoidance and how it handles join order dependencies 
+         (e.g. when an "exists" or "in" subquery is flattened to an existence join, 
+         the join order musn't be inverted under the current implementation). </p>
+      </section>
+      <section> 
+        <title>Example of a 5-way Join</title>
+        <p> The optimizer looks at so many 
+          potential plans in a five-way join that it's not feasible to show all 
+          of them in an manually-written explanation. </p>
+        <p> Let's take the following query: </p>
+        <source> 
+select *
+from t1, t2, t3, t4, t5
+where t1.c1 = t2.c2
+and    t1.c3 = t3.c4
+and    t3.c5 = t4.c6
+and    t4.c7 = t5.c8
+and    t1.c9 = 1
+and    t3.c10 = 2
+and    t5.c11 = 3          
+        </source>
+        <p> One possible way to execute this query is to take the tables in the 
+          order of the FROM clause. For each row in a table, join it with the 
+          matching rows from the next table to form a set of joined row. The plan 
+          would look something like this (I hope the formatting doesn't get screwed 
+          up): </p>
+        <source> 
+             JOIN
+             /      \
+          JOIN    t5
+           /      \
+        JOIN    t4
+        /     \
+    JOIN    t3
+   /       \
+t1       t2
+        </source>
+        <p> This is a left-deep tree. That is. it's skewed to the left. Let's 
+          assume for the sake of argument that each JOIN node is a nested-loop 
+          join. What this means is that each JOIN node gets a row from its left 
+          (outer) table and probes into its right (inner) table to find all the 
+          matching rows. For all but the leftmost JOIN node, the outer table is 
+          also a JOIN. So, at execution, this plan goes all the way down to the 
+          left, gets the first qualifying row from t1, then finds a matching row 
+          in t2. It then puts the matching rows from t1 and t2 together into a 
+          joined row and feeds it up to the JOIN node above it. This JOIN node 
+          uses its outer row to probe into t3 to find a matching row. When it 
+          finds such a row, it puts together its outer and inner rows into a joined 
+          row, which it feeds to the JOIN node above it. It keeps doing this all 
+          the way up the plan. When the top JOIN node finds a matching row in 
+          t5, it returns that row from the SELECT statement. </p>
+        <p> More sophisticated optimizers consider "bushy" trees, which can take 
+          shapes other than the left-deep shape shown above. For example, it might 
+          consider a plan with the following join tree: </p>
+        <source>
+          JOIN
+         /       \
+   JOIN       JOIN
+   /     \      /      \
+t1    t2    t3    JOIN
+                     /     \
+                    t4    t5
+        </source>
+        <p> As you can see, the tables are in the same order but the shape of 
+          the join tree is entirely different. As I mentioned in my original mail, 
+          bushy trees are harder to implement but they are good for some types 
+          of big decision-support queries. </p>
+        <p> Because the Derby optimizer only models left-deep trees, it doesn't 
+          have to model the shape of the tree. All it has to model is the order 
+          of the tables in the tree (since the tree is always the same shape for 
+          a given number of tables). It does this the simple way: by using an 
+          array representing the assignment of tables to positions in the join 
+          order. </p>
+        <p> The basic idea of a cost-based optimizer is to come up with an estimated 
+          cost for all the possible execution plans for a query and to choose 
+          the cheapest plan. The number of possible plans grows with the number 
+          of tables, indexes, join strategies, etc. Most optimizers do something 
+          to reduce the search space, so that for big queries the best plan (or 
+          a reasonable plan) is found in an acceptable length of time. One way 
+          the Derby optimizer prunes its search space is by skipping over plans 
+          that it knows will be more expensive than the best plan it's found so 
+          far. </p>
+        <p> The optimizer does this by depth-first searching. That is, rather 
+          than coming up with a join order for all the tables in the query and 
+          then considering all the access paths for those tables, it adds one 
+          table at a time to the join order and figures out the best access path 
+          for that table (in its current spot in the join order) before going 
+          on to add another table to the join order. While doing this, it keeps 
+          track of the cost of the plan its considering so far. If, when it adds 
+          a table to the join order, it finds that this makes the current plan 
+          under consideration more costly than the best plan found so far, it 
+          abandons the consideration of that table in that position of the join 
+          order. By doing this, the optimizer can avoid considering many join 
+          orders. This is important when there are a lot of tables in the query, 
+          because the number of join orders is the factorial of the number of 
+          tables. </p>
+        <p> For example, let's say that in the sample query given above, the optimizer 
+          has already found a complete plan with an estimated cost of 10000. Now 
+          suppose it is considering the following partial join order: </p>
+        <p> (outer) t4 - t5 (inner) </p>
+        <p> Let's say this partial plan has an estimated cost of 2000. Now suppose 
+          the optimizer considers placing the table t1 as the next table in the 
+          join order: </p>
+        <p> (outer) t4 - t5 - t2 (inner) </p>
+        <p> Note that the query has no direct join clause between t1 and either 
+          t4 or t5. The optimizer would go through all possible access paths for 
+          t2 in this context, and would see that with no useful qualification 
+          on the table it would have to do a full scan of t2 for every outer row 
+          resulting from the join of t4 and t5. If t2 is anything but a very small 
+          table, it could be expensive. Let's say the estimated total best cost 
+          for t2 in this position in the join order is 50000. That would make 
+          the total cost of the query equal to 52000, which is higher than the 
+          cost of the best plan found so far (10000). So it doesn't make sense 
+          to look at this join order any further. Rather than consider the following 
+          join orders: </p>
+        <p> (outer) t4 - t5 - t2 - t1 - t3 (inner) </p>
+        <p> (outer) t4 - t5 - t2 - t3 - t1 (inner) </p>
+        <p> the optimizer abandons consideration of any plan starting with t4 
+          - t5 - t2. </p>
+      </section>
+      <section> 
+        <title>Potential Improvements to the Optimizer</title>
+        <p> It's hard to consider the optimizer by itself. Many optimizer enhancements
+          would work with changes in other areas, especially execution. </p>
+        <p> One area that I think needs work is hash joins. The current implementation 
+          uses an in-memory hash table. The optimizer avoids using the hash join 
+          strategy when it estimates that the hash table would use too much memory. 
+          There are a few problems with this: the optimizer doesn't really know 
+          how much memory is available, its estimates of memory usage are crude 
+          and possibly inaccurate, and it's possible for the query to fail if 
+          a hash table gets too big during execution. </p>
+        <p> I would like to see hash tables that spill to disk. Ideally, the hash 
+          table should be an in-memory structure with a conglomerate as a backing 
+          store. I would want the backing store to be used only when necessary 
+          - that is, only when the hash table grows too big. The biggest problem 
+          with this idea is how to estimate the cost of building and maintaining 
+          the table. One approach could be to put a limit on the number of in-memory 
+          rows in the hash table and use a statistical formula for the cost of 
+          reading a row, using the number of in-memory rows versus the total number 
+          of rows to estimate the chances of finding a row in memory. </p>
+        <p> Another approach could be to use weak references in managing the hash 
+          table (a weak reference is a Java construct that allows the garbage 
+          collector to nominate an object for garbage collection even when it 
+          has a reference to it). Weak references are useful for memory caches 
+          that adjust themselves to the memory available to the JVM. One of our 
+          original ideas for Derby (nee Cloudscape) is that it should be a low-maintenance 
+          DBMS, with little intervention required to keep a working system running. 
+          A self-managing cache could help with this - it would adjust itself 
+          to environments with different amounts of memory (although small-memory 
+          environments could hurt performance). I don't know how the optimizer 
+          would estimate the cost for building and maintaining a hash table in 
+          this case. </p>
+        <p> I also think merge joins are worth considering, especially if nothing 
+          is done about hash joins. Merge joins are useful for many of the same 
+          types of queries as hash joins and, since they use the sorter (assuming 
+          the joined rows are not already ordered on the join colums) they can 
+          work even for large tables (because the sorter spills to disk if the 
+          data being sorted won't fit in memory). Merge joins can have a very 
+          low cost if the rows are already ordered (which they can be if there 
+          are indexes on the join columns). Merge joins can also work well with 
+          sort avoidance if the ordering for the merge is the same as for the 
+          ORDER BY clause. </p>
+        <p> Switching gears, another problem is the cost of user-defined functions. 
+          What do you do with a query like this?: </p>
+        <source> 
+select *
+from t1, t2, t3
+where t1.c1 = user_defined_function(t2.c2)
+and t2.c3 = t3.c4
+        </source>
+        <p> If the user-defined function is cheap and there's an index on t1.c1, 
+          you may want to call the function for every row in t2 and use the result 
+          to probe into the index on t1. On the other hand, if the function is 
+          expensive, you may want to try to execute it as few times as possible, 
+          which could make it unfeasible to use it to probe into t1. Currently 
+          Derby has no modeling for the cost of user-defined functions and avoids 
+          pushing them down into the query plan (that is, it calculates user-defined 
+          functions as late as possible before returning the rows from the query). 
+        </p>
+        <p> This may seem trivial, but keep in mind that a user-defined function 
+          can do anything, from something as simple as returning a constant to 
+          as complex as executing a query on another DBMS. It really can be important 
+          to know the cost of a user-defined function. </p>
+        <p> One possible approach would be to have a way of telling the DBMS to 
+          execute the function and remember how long it takes, and then store 
+          this in a system catalog for the optimizer to use. Another approach 
+          would be to allow the user to register the cost of a function as low, 
+          medium or high. </p>
+        <p> Switching gears again, another feature I think would be generically 
+          useful would be indexes on expressions (instead of just on columns). 
+          One potential use for this feature is case-independent searches (which 
+          can be done now, but which tend to be slow because the functions involved 
+          prevent the optimizer from using an index). The biggest problem here 
+          is the representation of index definitions in the SYSCONGLOMERATES system 
+          table (which assumes that an index column corresponds to a column in 
+          a base table). </p>
+        <p> Another area for investigation is the flattening of subqueries to 
+          joins. Currently, the language compiler flattens IN and EXISTS subqueries 
+          to types of joins. This is good because it gives the optimizer more 
+          options in choosing query plans for these types of subqueries, and it 
+          also allows the optimizer to get better cost estimates (for complex 
+          technical reasons I won't go into here). There are other types of subqueries 
+          that could be flattened - for example, a NOT IN or NOT EXISTS subquery 
+          can often be flattened to an outer join. </p>
+        <p> Another thing that could be done is to allow the optimizer to invert 
+          the join order of IN and EXISTS subqueries. As mentioned above, these 
+          subqueries are often flattened to joins. The joins are special, in that 
+          at execution it looks for only one match in the inner table per row 
+          from the outer table. This strategy requires that the table in the IN 
+          or EXISTS subquery remain inner to the joining table from outside the 
+          subquery. It would be possible to invert the join order if a sort were 
+          done on the subquery table to eliminate duplicate joining values. (Actually, 
+          I'm oversimplifying here, but I would have to write a treatise otherwise.) 
+        </p>
+      </section>
+    </body>
+    <footer> 
+      <legal></legal>
+    </footer>
+  </document>

Modified: incubator/derby/site/trunk/src/documentation/content/xdocs/site.xml
URL: http://svn.apache.org/viewcvs/incubator/derby/site/trunk/src/documentation/content/xdocs/site.xml?view=diff&r1=160971&r2=160972
==============================================================================
--- incubator/derby/site/trunk/src/documentation/content/xdocs/site.xml (original)
+++ incubator/derby/site/trunk/src/documentation/content/xdocs/site.xml Mon Apr 11 15:22:05 2005
@@ -31,6 +31,7 @@
       <jdbc      label="JDBC"                   href="JDBCImplementation.html"/>
       <log       label="Log Format"             href="logformats.html"/>
       <recover   label="Logging &amp; Recovery" href="recovery.html"/>
+      <optimizer label="Optimizer"              href="optimizer.html"/>
       <types     label="Type System" href="https://svn.apache.org/viewcvs.cgi/*checkout*/incubator/derby/code/trunk/java/engine/org/apache/derby/iapi/types/package.html"/>
   </engine>