Posted to common-commits@hadoop.apache.org by mc...@apache.org on 2006/02/13 08:25:37 UTC

svn commit: r377317 - in /lucene/hadoop/trunk/src/java/org/apache/hadoop: dfs/ fs/

Author: mc
Date: Sun Feb 12 23:25:35 2006
New Revision: 377317

URL: http://svn.apache.org/viewcvs?rev=377317&view=rev
Log:

  Add a bunch of updated comments and JavaDocs to
the Distributed File System package.


Modified:
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/BlockCommand.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/ClientProtocol.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DFSClient.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeInfo.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeProtocol.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DistributedFileSystem.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/NameNode.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/package.html
    lucene/hadoop/trunk/src/java/org/apache/hadoop/fs/FileSystem.java

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/BlockCommand.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/BlockCommand.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/BlockCommand.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/BlockCommand.java Sun Feb 12 23:25:35 2006
@@ -20,10 +20,13 @@
 import java.io.*;
 
 /****************************************************
- * A BlockCommand is an instruction to a datanode regarding
- * some blocks under its control
+ * A BlockCommand is an instruction to a datanode 
+ * regarding some blocks under its control.  It tells
+ * the DataNode to either invalidate a set of indicated
+ * blocks, or to copy a set of indicated blocks to 
+ * another DataNode.
  * 
- * @author Michael Cafarella
+ * @author Mike Cafarella
  ****************************************************/
 class BlockCommand implements Writable {
 

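The new comment describes a two-way command: invalidate blocks, or copy them elsewhere. A minimal dispatch sketch of how a DataNode might act on one; every accessor and helper below is an illustrative placeholder, since only the class's purpose appears in this diff:

    void processCommand(BlockCommand cmd) throws IOException {
        Block blocks[] = cmd.getBlocks();                // hypothetical accessor
        if (cmd.isInvalidateRequest()) {                 // hypothetical accessor
            // delete the indicated blocks from local storage
            for (int i = 0; i < blocks.length; i++) {
                deleteLocalBlock(blocks[i]);             // hypothetical helper
            }
        } else {
            // copy the indicated blocks to the indicated target DataNodes
            transferBlocks(blocks, cmd.getTargets());    // hypothetical helpers
        }
    }
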
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/ClientProtocol.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/ClientProtocol.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/ClientProtocol.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/ClientProtocol.java Sun Feb 12 23:25:35 2006
@@ -18,65 +18,98 @@
 import java.io.*;
 
 /**********************************************************************
- * Protocol that an DFS client uses to communicate with the NameNode.
- * It's used to manipulate the namespace, and obtain datanode info.
+ * ClientProtocol is used by a piece of DFS user code to communicate 
+ * with the NameNode.  User code can manipulate the directory namespace, 
+ * as well as open/close file streams, etc.
  *
  * @author Mike Cafarella
  **********************************************************************/
 interface ClientProtocol {
 
-    /**
-     * Open an existing file.  Get back block and datanode info
+    ///////////////////////////////////////
+    // File contents
+    ///////////////////////////////////////
+    /**
+     * Open an existing file, at the given name.  Returns block 
+     * and DataNode info.  The client will then have to contact
+     * each indicated DataNode to obtain the actual data.  There
+     * is no need to call close() or any other function after
+     * calling open().
      */
     public LocatedBlock[] open(String src) throws IOException;
 
     /**
-     * Create a new file.  Get back block and datanode info
+     * Create a new file.  Get back block and datanode info,
+     * which describes where the first block should be written.
+     *
+     * Successfully calling this method prevents any other 
+     * client from creating a file under the given name, but
+     * the caller must invoke complete() for the file to be
+     * added to the filesystem.
+     *
+     * Blocks have a maximum size.  Clients that intend to
+     * create multi-block files must also use reportWrittenBlock()
+     * and addBlock().
      */
     public LocatedBlock create(String src, String clientName, boolean overwrite) throws IOException;
 
     /**
-     * The client wants to write an additional block to the indicated
-     * filename (which must currently be open for writing).  Return
-     * block and datanode info.  A null response means the caller
-     * should attempt the call again.
+     * A client that has written a block of data can report completion
+     * back to the NameNode with reportWrittenBlock().  Clients cannot
+     * obtain an additional block until the previous one has either been 
+     * reported as written or abandoned.
      */
-    public LocatedBlock addBlock(String src) throws IOException;
+    public void reportWrittenBlock(LocatedBlock b) throws IOException;
 
     /**
-     * The client wants to report a block it has just successfully
-     * written to one or more datanodes.  Client-written blocks are
-     * always reported by the client, not by the datanode.
+     * If the client has not yet called reportWrittenBlock(), it can
+     * give up on it by calling abandonBlock().  The client can then
+     * either obtain a new block, or complete or abandon the file.
+     *
+     * Any partial writes to the block will be garbage-collected.
      */
-    public void reportWrittenBlock(LocatedBlock b) throws IOException;
+    public void abandonBlock(Block b, String src) throws IOException;
 
     /**
-     * The client wants to abandon writing to the indicated block,
-     * part of the indicated (currently-open) filename.
+     * A client that wants to write an additional block to the 
+     * indicated filename (which must currently be open for writing)
+     * should call addBlock().  
+     *
+     * addBlock() returns block and datanode info, just like the initial
+     * call to create().  
+     *
+     * A null response means the NameNode could not allocate a block,
+     * and that the caller should try again.
      */
-    public void abandonBlock(Block b, String src) throws IOException;
+    public LocatedBlock addBlock(String src) throws IOException;
 
     /**
-     * The client wants to abandon writing to the current file, and
-     * let anyone else grab it.
+     * A client that wants to abandon writing to the current file
+     * should call abandonFileInProgress().  After this call, any
+     * client can call create() to obtain the filename.
+     *
+     * Any blocks that have been written for the file will be 
+     * garbage-collected.
      */
     public void abandonFileInProgress(String src) throws IOException;
 
     /**
      * The client is done writing data to the given filename, and would 
-     * like to complete it.  Returns whether the file has been closed
-     * correctly (true) or whether caller should try again (false).
-     * (Because the namenode is waiting for a block to complete).
+     * like to complete it.  
+     *
+     * The function returns whether the file has been closed successfully.
+     * If the function returns false, the caller should try again.
+     *
+     * A call to complete() will not return true until all the file's
+     * blocks have been replicated the minimum number of times.  Thus,
+     * DataNode failures may cause a client to call complete() several
+     * times before succeeding.
      */
     public boolean complete(String src, String clientName) throws IOException;
-    
-    /**
-     * The client wants to read the indicated filename at a certain offset.
-     * Return a list of hostnames where the data can be found.  (Return
-     * a set of hostnames for every block.)
-     */
-    public String[][] getHints(String src, long start, long len) throws IOException;
 
+    ///////////////////////////////////////
+    // Namespace management
+    ///////////////////////////////////////
     /**
      * Rename an item in the fs namespace
      */
@@ -104,36 +137,71 @@
     public boolean mkdirs(String src) throws IOException;
 
     /**
-     * The client is trying to obtain a lock.  Return whether the lock has
-     * been seized correctly (true), or whether the client should try again
-     * (false).
+     * Get a listing of the indicated directory
+     */
+    public DFSFileInfo[] getListing(String src) throws IOException;
+
+    ///////////////////////////////////////
+    // System issues and management
+    ///////////////////////////////////////
+    /**
+     * getHints() returns a list of hostnames that store data for
+     * a specific file region.  It returns a set of hostnames for 
+     * every block within the indicated region.
+     *
+     * This function is very useful when writing code that considers
+     * data-placement when performing operations.  For example, the
+     * MapReduce system tries to schedule tasks on the same machines
+     * as the data-block the task processes. 
+     */
+    public String[][] getHints(String src, long start, long len) throws IOException;
+    /**
+     * obtainLock() is used for lock management.  It returns true if
+     * the lock has been seized correctly.  It returns false if the
+     * lock could not be obtained, and the client should try again.
+     *
+     * Locking is a part of most filesystems and is useful for a
+     * number of inter-process synchronization tasks.
      */
     public boolean obtainLock(String src, String clientName, boolean exclusive) throws IOException;
 
     /**
-     * The client wants to release a held lock.  Return whether the lock was
-     * correctly released (true), or whether the client should wait and try the 
-     * call again (false).
+     * releaseLock() is called if the client would like to release
+     * a held lock.  It returns true if the lock is correctly released.
+     * It returns false if the client should wait and try again.
      */
     public boolean releaseLock(String src, String clientName) throws IOException;
 
     /**
-     * The client machine wants to obtain a lease
+     * Client programs can cause stateful changes in the NameNode
+     * that affect other clients.  A client may obtain a file and 
+     * neither abandon nor complete it.  A client might hold a series
+     * of locks that prevent other clients from proceeding.
+     * Clearly, it would be bad if a client held a bunch of locks
+     * that it never gave up.  This can happen easily if the client
+     * dies unexpectedly.
+     *
+     * So, the NameNode will revoke the locks and live file-creates
+     * for clients that it thinks have died.  A client tells the
+     * NameNode that it is still alive by periodically calling
+     * renewLease().  If a certain amount of time passes since
+     * the last call to renewLease(), the NameNode assumes the
+     * client has died.
      */
     public void renewLease(String clientName) throws IOException;
 
     /**
-     * Get a listing of the indicated directory
-     */
-    public DFSFileInfo[] getListing(String src) throws IOException;
-
-    /**
      * Get a set of statistics about the filesystem.
+     * Right now, only two values are returned.
+     * [0] contains the total storage capacity of the system,
+     *     in bytes.
+     * [1] contains the available storage of the system, in bytes.
      */
     public long[] getStats() throws IOException;
 
     /**
      * Get a full report on the system's current datanodes.
+     * One DatanodeInfo object is returned for each DataNode.
      */
     public DatanodeInfo[] getDatanodeReport() throws IOException;
 }

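Taken together, the rewritten comments describe a complete write lifecycle: create() reserves the name, reportWrittenBlock() and addBlock() alternate for each block, and complete() retries until the blocks are minimally replicated. A condensed client sketch using only the signatures in this interface; moreDataToWrite(), writeBlockToDataNodes(), and the retry pacing are illustrative, and real programs should go through DFSClient rather than calling ClientProtocol directly:

    void writeFile(ClientProtocol namenode, String src, String clientName)
            throws IOException, InterruptedException {
        // Reserve the filename; no other client can create it now.
        LocatedBlock blk = namenode.create(src, clientName, false);
        while (moreDataToWrite()) {
            // Push up to one block of bytes straight to the DataNodes,
            // then report it written, a precondition for addBlock().
            writeBlockToDataNodes(blk);
            namenode.reportWrittenBlock(blk);
            if (moreDataToWrite()) {
                // A null response means the NameNode could not
                // allocate a block; try again.
                while ((blk = namenode.addBlock(src)) == null) {
                    Thread.sleep(400);
                }
            }
        }
        // complete() stays false until every block has been
        // replicated the minimum number of times.
        while (!namenode.complete(src, clientName)) {
            Thread.sleep(400);
        }
    }

Throughout, a background thread must keep calling renewLease(clientName), or the NameNode will revoke the in-progress create; DFSClient's leaseChecker daemon, visible in the next file, plays exactly this role.
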
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DFSClient.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DFSClient.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DFSClient.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DFSClient.java Sun Feb 12 23:25:35 2006
@@ -27,8 +27,15 @@
 import java.util.logging.*;
 
 /********************************************************
- * DFSClient can connect to a Hadoop Filesystem and perform basic file tasks.
- * Connects to a namenode daemon.
+ * DFSClient can connect to a Hadoop Filesystem and 
+ * perform basic file tasks.  It uses the ClientProtocol
+ * to communicate with a NameNode daemon, and connects 
+ * directly to DataNodes to read/write block data.
+ *
+ * Hadoop DFS users should obtain an instance of 
+ * DistributedFileSystem, which uses DFSClient to handle
+ * filesystem tasks.
+ *
  * @author Mike Cafarella, Tessa MacDuff
  ********************************************************/
 class DFSClient implements FSConstants {
@@ -41,7 +48,8 @@
     Daemon leaseChecker;
 
 
-    /** Create a new DFSClient connected to the given namenode server.
+    /** 
+     * Create a new DFSClient connected to the given namenode server.
      */
     public DFSClient(InetSocketAddress nameNodeAddr, Configuration conf) {
         this.namenode = (ClientProtocol) RPC.getProxy(ClientProtocol.class, nameNodeAddr, conf);

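Wiring one up takes little code, per the constructor shown above; the host and port here are placeholders:

    Configuration conf = new Configuration();
    DFSClient client = new DFSClient(
        new InetSocketAddress("namenode.example.com", 9000), conf);

As the new class comment says, though, end-user code should normally reach this class only through a DistributedFileSystem instance.
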
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DataNode.java Sun Feb 12 23:25:35 2006
@@ -25,14 +25,35 @@
 import java.util.logging.*;
 
 /**********************************************************
- * DataNode controls just one critical table:
- *   block-> BLOCK_SIZE stream of bytes
+ * DataNode is a class (and program) that stores a set of
+ * blocks for a DFS deployment.  A single deployment can
+ * have one or many DataNodes.  Each DataNode communicates
+ * regularly with a single NameNode.  It also communicates
+ * with client code and other DataNodes from time to time.
  *
- * This info is stored on disk (the NameNode is responsible for
- * asking other machines to replicate the data).  The DataNode
+ * DataNodes store a series of named blocks.  The DataNode
+ * allows client code to read these blocks, or to write new
+ * block data.  The DataNode may also, in response to instructions
+ * from its NameNode, delete blocks or copy blocks to/from other
+ * DataNodes.
+ *
+ * The DataNode maintains just one critical table:
+ *   block-> stream of bytes (of BLOCK_SIZE or less)
+ *
+ * This info is stored on a local disk.  The DataNode
  * reports the table's contents to the NameNode upon startup
  * and every so often afterwards.
  *
+ * DataNodes spend their lives in an endless loop of asking
+ * the NameNode for something to do.  A NameNode cannot connect
+ * to a DataNode directly; a NameNode simply returns values from
+ * functions invoked by a DataNode.
+ *
+ * DataNodes maintain an open server socket so that client code 
+ * or other DataNodes can read/write data.  The host/port for
+ * this server is reported to the NameNode, which then sends that
+ * information to clients or other DataNodes that might be interested.
+ *
  * @author Mike Cafarella
  **********************************************************/
 public class DataNode implements FSConstants, Runnable {
@@ -73,7 +94,8 @@
     private Configuration fConf;
 
     /**
-     * Create given a configuration and a dataDir.
+     * Create the DataNode given a configuration and a dataDir.
+     * 'dataDir' is where the blocks are stored.
      */
     public DataNode(Configuration conf, String datadir) throws IOException {
         this(InetAddress.getLocalHost().getHostName(), 
@@ -82,7 +104,8 @@
     }
 
     /**
-     * Needs a directory to find its data (and config info)
+     * A DataNode can also be created with configuration information
+     * explicitly given.
      */
     public DataNode(String machineName, File datadir, InetSocketAddress nameNodeAddr, Configuration conf) throws IOException {
         this.namenode = (DatanodeProtocol) RPC.getProxy(DatanodeProtocol.class, nameNodeAddr, conf);
@@ -112,6 +135,7 @@
     }
 
     /**
+     * Return the namenode's identifier
      */
     public String getNamenode() {
         //return namenode.toString();
@@ -132,7 +156,8 @@
     }
 
     /**
-     * Main loop for the DataNode.  Runs until shutdown.
+     * Main loop for the DataNode.  Runs until shutdown,
+     * forever calling remote NameNode functions.
      */
     public void offerService() throws Exception {
         long wakeups = 0;
@@ -243,7 +268,10 @@
     }
 
     /**
-     * Server used for receiving/sending a block of data
+     * Server used for receiving/sending a block of data.
+     * This is created to listen for requests from clients or 
+     * other DataNodes.  This small server does not use the 
+     * Hadoop IPC mechanism.
      */
     class DataXceiveServer implements Runnable {
         boolean shouldListen = true;
@@ -285,6 +313,7 @@
         }
 
         /**
+         * Read/write data from/to the DataXceiveServer.
          */
         public void run() {
             try {
@@ -582,7 +611,8 @@
     }
 
     /**
-     * Used for transferring a block of data
+     * Used for transferring a block of data.  This class
+     * sends a piece of data to another DataNode.
      */
     class DataTransfer implements Runnable {
         InetSocketAddress curTarget;
@@ -744,7 +774,7 @@
         "}";
   }
 
-  /**
+    /**
      */
     public static void main(String args[]) throws IOException {
         LogFormatter.setShowThreadIDs(true);

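The "endless loop" the comment describes condenses to a sketch built on the DatanodeProtocol signatures shown in the next file; localName, xmitsInProgress, the capacity helpers, the call ordering, and the sleep interval are all illustrative:

    while (shouldRun) {
        // Prove liveness and report storage status.
        namenode.sendHeartbeat(localName, totalCapacity(), remainingBytes());

        // Tell the NameNode about blocks newly written here.
        Block received[] = drainReceivedBlocks();
        if (received.length > 0) {
            namenode.blockReceived(localName, received);
        }

        // Give the NameNode its only chance to issue instructions.
        BlockCommand cmd = namenode.getBlockwork(localName, xmitsInProgress);
        if (cmd != null) {
            processCommand(cmd);   // invalidate or transfer, per the command
        }
        Thread.sleep(heartbeatInterval);
    }
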
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeInfo.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeInfo.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeInfo.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeInfo.java Sun Feb 12 23:25:35 2006
@@ -21,7 +21,9 @@
 import java.util.*;
 
 /**************************************************
- * DatanodeInfo tracks stats on a given node
+ * DatanodeInfo tracks stats on a given DataNode,
+ * such as available storage capacity, last update
+ * time, etc.
  *
  * @author Mike Cafarella
  **************************************************/

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeProtocol.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeProtocol.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeProtocol.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DatanodeProtocol.java Sun Feb 12 23:25:35 2006
@@ -19,17 +19,50 @@
 import java.io.*;
 
 /**********************************************************************
- * Protocol that an DFS datanode uses to communicate with the NameNode.
- * It's used to upload current load information and block records.
+ * Protocol that a DFS datanode uses to communicate with the NameNode.
+ * It's used to upload current load information and block reports.
+ *
+ * The only way a NameNode can communicate with a DataNode is by
+ * returning values from these functions.
  *
  * @author Michael Cafarella
  **********************************************************************/
 interface DatanodeProtocol {
-
+    /**
+     * sendHeartbeat() tells the NameNode that the DataNode is still
+     * alive and well.  Includes some status info, too.
+     */
     public void sendHeartbeat(String sender, long capacity, long remaining) throws IOException;
+
+    /**
+     * blockReport() tells the NameNode about all the locally-stored blocks.
+     * The NameNode returns an array of Blocks that have become obsolete
+     * and should be deleted.  This function is meant to upload *all*
+     * the locally-stored blocks.  It's invoked upon startup and then
+     * infrequently afterwards.
+     */
     public Block[] blockReport(String sender, Block blocks[]) throws IOException;
+    
+    /**
+     * blockReceived() allows the DataNode to tell the NameNode about
+     * recently-received block data.  For example, whenever client code
+     * writes a new Block here, or another DataNode copies a Block to
+     * this DataNode, it will call blockReceived().
+     */
     public void blockReceived(String sender, Block blocks[]) throws IOException;
+
+    /**
+     * errorReport() tells the NameNode about something that has gone
+     * awry.  Useful for debugging.
+     */
     public void errorReport(String sender, String msg) throws IOException;
 
+    /**
+     * The DataNode periodically calls getBlockwork().  It includes a
+     * small amount of status information, but mainly gives the NameNode
+     * a chance to return a "BlockCommand" object.  A BlockCommand tells
+     * the DataNode to invalidate local block(s), or to copy them to other 
+     * DataNodes, etc.
+     */
     public BlockCommand getBlockwork(String sender, int xmitsInProgress) throws IOException;
 }

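The blockReport() exchange deserves a concrete reading: the DataNode uploads its entire table, and the return value is a deletion list. A sketch, where getLocalBlocks() and deleteLocalBlock() are illustrative:

    Block stored[] = getLocalBlocks();                 // everything on local disk
    Block obsolete[] = namenode.blockReport(localName, stored);
    for (int i = 0; i < obsolete.length; i++) {
        deleteLocalBlock(obsolete[i]);                 // NameNode says: drop it
    }
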
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DistributedFileSystem.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DistributedFileSystem.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DistributedFileSystem.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/DistributedFileSystem.java Sun Feb 12 23:25:35 2006
@@ -26,8 +26,12 @@
 
 /****************************************************************
  * Implementation of the abstract FileSystem for the DFS system.
- * This is the distributed file system.  It can be distributed over
- * 1 or more machines 
+ * This object is the way end-user code interacts with a Hadoop
+ * DistributedFileSystem.
+ *
+ * It's substantially a wrapper around the DFSClient class, with
+ * a few extra functions.
+ *
  * @author Mike Cafarella
  *****************************************************************/
 public class DistributedFileSystem extends FileSystem {

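"Substantially a wrapper" means most methods forward to the embedded DFSClient. A sketch of the shape such a method might take; the method name, the dfs field, and the File-to-path conversion are illustrative, not the actual API:

    public String[][] getFileCacheHints(File f, long start, long len)
            throws IOException {
        // Delegate straight to the client, translating the File
        // into a DFS pathname string.
        return dfs.getHints(getPath(f), start, len);
    }
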
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Sun Feb 12 23:25:35 2006
@@ -24,7 +24,10 @@
 import java.util.logging.*;
 
 /***************************************************
- * The FSNamesystem tracks several important tables.
+ * FSNamesystem does the actual bookkeeping work for the
+ * NameNode.
+ *
+ * It tracks several important tables.
  *
  * 1)  valid fsname --> blocklist  (kept on disk, logged)
  * 2)  Set of all valid blocks (inverted #1)

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/NameNode.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/NameNode.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/NameNode.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/NameNode.java Sun Feb 12 23:25:35 2006
@@ -24,13 +24,34 @@
 import java.util.logging.*;
 
 /**********************************************************
- * NameNode controls two critical tables:
- *   1)  filename->blocksequence,version
- *   2)  block->machinelist
+ * NameNode serves as both directory namespace manager and
+ * "inode table" for the Hadoop DFS.  There is a single NameNode
+ * running in any DFS deployment.  (Well, except when there
+ * is a second backup/failover NameNode.)
+ *
+ * The NameNode controls two critical tables:
+ *   1)  filename->blocksequence (namespace)
+ *   2)  block->machinelist ("inodes")
  *
  * The first table is stored on disk and is very precious.
  * The second table is rebuilt every time the NameNode comes
  * up.
+ *
+ * 'NameNode' refers to both this class as well as the 'NameNode server'.
+ * The 'FSNamesystem' class actually performs most of the filesystem
+ * management.  The majority of the 'NameNode' class itself is concerned
+ * with exposing the IPC interface to the outside world, plus some
+ * configuration management.
+ *
+ * NameNode implements the ClientProtocol interface, which allows
+ * clients to ask for DFS services.  ClientProtocol is not
+ * designed for direct use by authors of DFS client code.  End-users
+ * should instead use the org.apache.hadoop.fs.FileSystem class.
+ *
+ * NameNode also implements the DatanodeProtocol interface, used by
+ * DataNode programs that actually store DFS data blocks.  These
+ * methods are invoked repeatedly and automatically by all the
+ * DataNodes in a DFS deployment.
  *
  * @author Mike Cafarella
  **********************************************************/

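The two critical tables are easiest to picture as plain maps; the representations below are a sketch, not FSNamesystem's actual data structures:

    TreeMap namespace = new TreeMap();  // filename -> Block[]  (on disk, logged)
    TreeMap blocksMap = new TreeMap();  // Block -> machine list (rebuilt at
                                        // startup from DataNode block reports)
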
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/package.html
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/package.html?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/package.html (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/package.html Sun Feb 12 23:25:35 2006
@@ -5,5 +5,12 @@
 org.apache.hadoop.fs.FileSystem}.  This is loosely modelled after
 Google's <a href="http://labs.google.com/papers/gfs.html">GFS</a>.</p>
 
+<p>The most important difference is that unlike GFS, Hadoop DFS files 
+have strictly one writer at any one time.  Bytes are always appended 
+to the end of the writer's stream.  There is no notion of "record appends"
+or "mutations" that are then checked or reordered.  Writers simply emit 
+a byte stream.  That byte stream is guaranteed to be stored in the 
+order written.</p>
+
 </body>
 </html>

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/fs/FileSystem.java
URL: http://svn.apache.org/viewcvs/lucene/hadoop/trunk/src/java/org/apache/hadoop/fs/FileSystem.java?rev=377317&r1=377316&r2=377317&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/fs/FileSystem.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/fs/FileSystem.java Sun Feb 12 23:25:35 2006
@@ -25,18 +25,19 @@
 import org.apache.hadoop.util.LogFormatter;
 
 /****************************************************************
- * An abstract base class for a fairly simple
- * distributed file system.
- * A Hadoop installation might consist
- * of multiple machines, which should swap files transparently.
- * This interface allows other Hadoop systems to find and place
- * files into the distributed Hadoop-controlled file world.
+ * An abstract base class for a fairly generic filesystem.  It
+ * may be implemented as a distributed filesystem, or as a "local"
+ * one that reflects the locally-connected disk.  The local version
+ * exists for small Hadoop instances and for testing.
+ *
  * <p>
- * A local implementation exists for testing and for small Hadoop instances.
- * <p>
- * The standard job of FileSystem is to take the location-
- * independent HadoopFile objects, and resolve them using local
- * knowledge and local instances of ShareGroup.
+ *
+ * All user code that may potentially use the Hadoop Distributed
+ * File System should be written to use a FileSystem object.  The
+ * Hadoop DFS is a multi-machine system that appears as a single
+ * disk.  It's useful because of its fault tolerance and potentially
+ * very large capacity.
+ * 
  * <p>
  * The local implementation is {@link LocalFileSystem} and distributed
  * implementation is {@link DistributedFileSystem}.
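
The practical upshot of the rewritten comment: application code should depend only on the abstract type, so the same logic runs against LocalFileSystem in tests and DistributedFileSystem in production. A sketch, assuming an open(File) accessor returning a standard input stream; the exact accessor is not shown in this diff:

    long countBytes(FileSystem fs, File f) throws IOException {
        InputStream in = fs.open(f);          // assumed accessor
        try {
            long total = 0;
            byte buf[] = new byte[4096];
            int n;
            while ((n = in.read(buf)) > 0) {
                total += n;                   // tally bytes actually read
            }
            return total;
        } finally {
            in.close();
        }
    }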