You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@mahout.apache.org by "Karl Wettin (JIRA)" <ji...@apache.org> on 2008/04/19 08:44:23 UTC

[jira] Updated: (MAHOUT-49) ParameterEnumerable

     [ https://issues.apache.org/jira/browse/MAHOUT-49?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Karl Wettin updated MAHOUT-49:
------------------------------

    Attachment: MAHOUT-49.txt


 * Parametered
 * Parameter 
 * ParameterUtils - prints settings to different output formats
 * AbstractParameter 
 * CompositeParameter - a class of some sort that can contain more parameters.
 * PathParameter
 * DoubleParameter, IntegerParameter, StringParameter

This is what implementations can look like:

{code:java}
public class HierarchialClusterDriver implements Tool, Parametered {

  /**
   * Prints the command line usage, a blank line and then the help text
   * that {@code ParameterUtils.help} produces for this driver to standard out.
   */
  private void help() {
    final String usage = "Usage: ./hadoop mahout.job HierarchialClusterDriver [default root path]";
    System.out.println(usage);
    System.out.println();
    System.out.println(ParameterUtils.help(this));
  }

  /**
   * Command line entry point: creates a driver, configures its parameters
   * using the simple class name followed by a dot as prefix, runs it through
   * {@code ToolRunner} and exits the JVM with the returned status.
   *
   * @param args command line arguments, handed unchanged to {@code ToolRunner.run}
   * @throws Exception whatever {@code ToolRunner.run} throws
   */
  public static void main(String[] args) throws Exception {
    String parameterPrefix = HierarchialClusterDriver.class.getSimpleName() + ".";

    HierarchialClusterDriver tool = new HierarchialClusterDriver();
    tool.configureParameters(parameterPrefix);

    int exitCode = ToolRunner.run(new Configuration(), tool, args);
    System.exit(exitCode);
  }

  // Every field below is assigned in configureParameters(String); none has an initializer here.

  // All parameters of this driver, in the order they were registered.
  private List<Parameter> parameters;
  // Root path; its value is the parent of the default value of most other path parameters.
  private Parameter<Path> defaultRootPath;
  private Parameter<Path> trainingInstancesPath;
  private Parameter<Path> trainingInstancePath;
  private Parameter<Path> treePath;
  // Default value is derived from treePath rather than from defaultRootPath.
  private Parameter<Path> treeInstancesPath;
  private Parameter<Double> instanceDistancePruneThreadshold;
  // Composite parameter: the distance measure contributes nested parameters of its own.
  private Parameter<DistanceMeasure> distanceMeasure;
  private Parameter<Path> closestInstanceOutputPath;
  private Parameter<Path> closestNodeOutputPath;
  private Parameter<Path> nodesOutputFile;
  protected Parameter<String> trainingInstanceVectorWritableClass;

  /**
   * Returns the list built by {@link #configureParameters(String)}.
   * The internal list itself is returned, not a copy.
   * NOTE(review): the field has no initializer in the visible code, so this
   * presumably returns null until configureParameters has been called -- confirm.
   *
   * @return the parameters of this driver
   */
  public List<Parameter> getParameters() {
    return parameters;
  }

  /**
   * Creates all parameters of this driver and registers them, in declaration
   * order, in the list returned by {@code getParameters()}.
   *
   * Each parameter name is the given prefix followed by the field name. The
   * default values of the path parameters are resolved against the value that
   * {@code defaultRootPath} (or, for {@code treeInstancesPath}, {@code treePath})
   * holds at the time this method runs.
   *
   * After registration every parameter is asked to configure itself with its
   * own name plus a dot as prefix, which lets composite parameters such as
   * {@code distanceMeasure} name their nested parameters.
   *
   * @param prefix text prepended to every parameter name, e.g. {@code "HierarchialClusterDriver."}
   */
  public void configureParameters(String prefix) {
    // Eleven parameters are registered below; size the list for all of them.
    parameters = new ArrayList<Parameter>(11);

    defaultRootPath = new PathParameter(
        new Path("/" + HierarchialClusterDriver.class.getSimpleName()),
        prefix + "defaultRootPath",
        "Path to where all files are located by default.");
    parameters.add(defaultRootPath);

    trainingInstancesPath = new PathParameter(
        new Path(defaultRootPath.get(), "trainingInstances"),
        prefix + "trainingInstancesPath",
        "Path to file containing instances to be added to the tree.");
    parameters.add(trainingInstancesPath);

    trainingInstancePath = new PathParameter(
        new Path(defaultRootPath.get(), "trainingInstance"),
        prefix + "trainingInstancePath",
        "Path to temporay file containing the instance to be inserted and currently compared against instances in the tree.");
    parameters.add(trainingInstancePath);

    trainingInstanceVectorWritableClass = new StringParameter(
        DenseVector.class.getName(),
        prefix + "trainingInstanceVectorWritableClass",
        "VectorWritable class used to read and write trainingInstances and trainingInstance file.");
    parameters.add(trainingInstanceVectorWritableClass);

    distanceMeasure = new CompositeParameter<DistanceMeasure>(
        DistanceMeasure.class,
        new WeightedEuclideanDistanceMeasure(),
        prefix + "distanceMeasure",
        "Distance measure used to calcualte distance between instances.");
    parameters.add(distanceMeasure);

    treePath = new PathParameter(
        new Path(defaultRootPath.get(), "tree"),
        prefix + "treePath",
        "Path to directory containing persistent tree.");
    parameters.add(treePath);

    // Resolved against treePath, not against the root path.
    treeInstancesPath = new PathParameter(
        new Path(treePath.get(), "instances"),
        prefix + "treeInstancesPath",
        "Path to temporay file containing all instances currently in tree.");
    parameters.add(treeInstancesPath);

    instanceDistancePruneThreadshold = new DoubleParameter(
        0.2d,
        prefix + "instanceDistancePruneThreadshold",
        "Instances will share the same leaf node if the distance between them is no more than this value.");
    parameters.add(instanceDistancePruneThreadshold);

    closestInstanceOutputPath = new PathParameter(
        new Path(defaultRootPath.get(), "closest_instance"),
        prefix + "closestInstanceOutputPath",
        "Path to temporay results directoy containing closest instance in tree.");
    parameters.add(closestInstanceOutputPath);

    closestNodeOutputPath = new PathParameter(
        new Path(defaultRootPath.get(), "closest_node"),
        prefix + "closestNodeOutputPath",
        "Path to temporay results directoy containing closest node in tree..");
    parameters.add(closestNodeOutputPath);

    nodesOutputFile = new PathParameter(
        new Path(defaultRootPath.get(), "nodes_from_leaf_to_root"),
        prefix + "nodesOutputFile",
        "Path to temporay file containing nodes between instance and root to be compared against training instance.");
    parameters.add(nodesOutputFile);

    // Let every parameter configure its nested parameters under its own name.
    for (Parameter parameter : parameters) {
      parameter.configureParameters(parameter.name() + ".");
    }
  }
{code}

Notice that the driver has a Parameter<DistanceMeasure>.

{code:java}
public abstract class WeightedDistanceMeasure extends AbstractDistanceMeasure {
  // Parameters of this measure; null until configureParameters(String) has run
  // (configure(JobConf) relies on that null check to configure lazily).
  protected List<Parameter> parameters;
  // Location of the weights file; its default value is null.
  protected Parameter<String> weightsFile;
  // Name of the class instantiated to read the weights file.
  protected Parameter<String> vectorWritableClass;
  // NOTE(review): presumably the weights loaded from weightsFile -- the code that fills it is not visible here.
  protected Vector weights;


  /**
   * Creates the two parameters of this distance measure and registers them.
   * Each parameter name is the given prefix followed by the field name.
   *
   * @param prefix text prepended to every parameter name
   */
  public void configureParameters(String prefix) {
    parameters = new ArrayList<Parameter>(2);

    // No default: a weights file is optional.
    weightsFile = new StringParameter(
        null,
        prefix + "weightsFile",
        "Path on DFS to a file containing the weights.");
    parameters.add(weightsFile);

    vectorWritableClass = new StringParameter(
        DenseVector.class.getName(),
        prefix + "vectorWritableClass",
        "VectorWritable class used to read file specified in parameter weightsFile.");
    parameters.add(vectorWritableClass);
  }

  /**
   * Returns the parameters created by {@code configureParameters(String)}.
   * The internal list itself is returned, not a copy; it is null if
   * neither configureParameters nor configure has assigned it yet.
   *
   * @return the parameters of this distance measure
   */
  public Collection<Parameter> getParameters() {
    return parameters;
  }

  public void configure(JobConf jobConf) {
    if (parameters == null) {
      configureParameters(WeightedDistanceMeasure.class.getName() + ".");
    }
    try {
      FileSystem fs = FileSystem.get(jobConf);
      if (weightsFile.get() != null) {
        VectorWritable writable = (VectorWritable) Class.forName(vectorWritableClass.get()).newInstance();
{code}

Here is the output from ParameterUtils.help(driver):

{noformat}
Usage: ./hadoop mahout.job HierarchialClusterDriver [default root path]

HierarchialClusterDriver.defaultRootPath                        Path to where all files are located by default. (default value '/HierarchialClusterDriver')
HierarchialClusterDriver.trainingInstancesPath                  Path to file containing instances to be added to the tree. (default value '/HierarchialClusterDriver/trainingInstances')
HierarchialClusterDriver.trainingInstancePath                   Path to temporay file containing the instance to be inserted and currently compared against instances in the tree. (default value '/HierarchialClusterDriver/trainingInstance')
HierarchialClusterDriver.trainingInstanceVectorWritableClass    VectorWritable class used to read and write trainingInstances and trainingInstance file. (default value 'org.apache.mahout.matrix.DenseVector')
HierarchialClusterDriver.distanceMeasure                        Distance measure used to calcualte distance between instances. (default value 'org.apache.mahout.utils.WeightedEuclideanDistanceMeasure')
{noformat}
The next two lines are the composite parts of the parameter on the previous line (there is a TODO for that parameter: its default value should be shown as a string, not as an object).
{noformat}
HierarchialClusterDriver.distanceMeasure.weightsFile            Path on DFS to a file containing the weights.
HierarchialClusterDriver.distanceMeasure.vectorWritableClass    VectorWritable class used to read file specified in parameter weightsFile. (default value 'org.apache.mahout.matrix.DenseVector')
HierarchialClusterDriver.treePath                               Path to directory containing persistent tree. (default value '/HierarchialClusterDriver/tree')
HierarchialClusterDriver.treeInstancesPath                      Path to temporay file containing all instances currently in tree. (default value '/HierarchialClusterDriver/tree/instances')
HierarchialClusterDriver.instanceDistancePruneThreadshold       Instances will share the same leaf node if the distance between them is no more than this value. (default value '0.2')
HierarchialClusterDriver.closestInstanceOutputPath              Path to temporay results directoy containing closest instance in tree. (default value '/HierarchialClusterDriver/closest_instance')
HierarchialClusterDriver.closestNodeOutputPath                  Path to temporay results directoy containing closest node in tree.. (default value '/HierarchialClusterDriver/closest_node')
HierarchialClusterDriver.nodesOutputFile                        Path to temporay file containing nodes between instance and root to be compared against training instance. (default value '/HierarchialClusterDriver/nodes_from_leaf_to_root')
{noformat}

And this is the output from ParameterUtils.conf(driver):

{noformat}
# Path to where all files are located by default.
HierarchialClusterDriver.defaultRootPath = /HierarchialClusterDriver

# Path to file containing instances to be added to the tree.
HierarchialClusterDriver.trainingInstancesPath = /HierarchialClusterDriver/trainingInstances

# Path to temporay file containing the instance to be inserted and currently compared against instances in the tree.
HierarchialClusterDriver.trainingInstancePath = /HierarchialClusterDriver/trainingInstance

# VectorWritable class used to read and write trainingInstances and trainingInstance file.
HierarchialClusterDriver.trainingInstanceVectorWritableClass = org.apache.mahout.matrix.DenseVector

# Distance measure used to calcualte distance between instances.
HierarchialClusterDriver.distanceMeasure = org.apache.mahout.utils.WeightedEuclideanDistanceMeasure

# Path on DFS to a file containing the weights.
HierarchialClusterDriver.distanceMeasure.weightsFile = 

# VectorWritable class used to read file specified in parameter weightsFile.
HierarchialClusterDriver.distanceMeasure.vectorWritableClass = org.apache.mahout.matrix.DenseVector

# Path to directory containing persistent tree.
HierarchialClusterDriver.treePath = /HierarchialClusterDriver/tree

# Path to temporay file containing all instances currently in tree.
HierarchialClusterDriver.treeInstancesPath = /HierarchialClusterDriver/tree/instances

# Instances will share the same leaf node if the distance between them is no more than this value.
HierarchialClusterDriver.instanceDistancePruneThreadshold = 0.2

# Path to temporay results directoy containing closest instance in tree.
HierarchialClusterDriver.closestInstanceOutputPath = /HierarchialClusterDriver/closest_instance

# Path to temporay results directoy containing closest node in tree..
HierarchialClusterDriver.closestNodeOutputPath = /HierarchialClusterDriver/closest_node

# Path to temporay file containing nodes between instance and root to be compared against training instance.
HierarchialClusterDriver.nodesOutputFile = /HierarchialClusterDriver/nodes_from_leaf_to_root
{noformat}

> ParameterEnumerable
> -------------------
>
>                 Key: MAHOUT-49
>                 URL: https://issues.apache.org/jira/browse/MAHOUT-49
>             Project: Mahout
>          Issue Type: New Feature
>            Reporter: Karl Wettin
>            Assignee: Karl Wettin
>         Attachments: MAHOUT-49.txt
>
>
> A utility package used to 
>  * configure class
>  * create default configuration files
>  * parse main method arguments
>  * produce human readable help
>  * getters and setters for value as object and string, for future generic reflection based GUI.

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.