Posted to common-user@hadoop.apache.org by Cyril Bogus <cy...@gmail.com> on 2013/03/29 16:54:55 UTC

FileSystem Error

Hi,

I am running a small Java program that basically writes a small input data set
to the Hadoop FileSystem, runs Mahout Canopy and KMeans clustering, and then
outputs the content of the data.

In my hadoop.properties I have included the core-site.xml definition so that
the Java program connects to my single node setup and uses the Hadoop file
system instead of the Java project's local file system (basically all writes
and reads are done on Hadoop and not on the local class path).

When I run the program, as soon as the Canopy (and even the KMeans)
configuration starts, it tries to look up the file on the class path instead
of the Hadoop FileSystem path where the proper files are located.

Is there a problem with the way I have my conf defined?

hadoop.properties:
fs.default.name=hdfs//mylocation
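
(For comparison, a fully formed value would also carry the scheme separator and
the NameNode host and port; the host and port below are placeholders, not values
from this setup:)

fs.default.name=hdfs://localhost:9000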

Program:

public class DataFileWriter {

    private static Properties props = new Properties();
    private static Configuration conf = new Configuration();

    /**
     * @param args
     * @throws ClassNotFoundException
     * @throws InterruptedException
     * @throws IOException
     */
    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {

        props.load(new FileReader(new File(
                "/home/cyril/workspace/Newer/src/hadoop.properties")));

        // TODO Auto-generated method stub
        FileSystem fs = null;
        SequenceFile.Writer writer;
        SequenceFile.Reader reader;

        conf.set("fs.default.name", props.getProperty("fs.default.name"));

        List<NamedVector> vectors = new LinkedList<NamedVector>();
        NamedVector v1 = new NamedVector(new DenseVector(new double[] { 0.1,
                0.2, 0.5 }), "Hello");
        vectors.add(v1);
        v1 = new NamedVector(new DenseVector(new double[] { 0.5, 0.1, 0.2 }),
                "Bored");
        vectors.add(v1);
        v1 = new NamedVector(new DenseVector(new double[] { 0.2, 0.5, 0.1 }),
                "Done");
        vectors.add(v1);
        // Write the data to SequenceFile
        try {
            fs = FileSystem.get(conf);

            Path path = new Path("testdata_seq/data");
            writer = new SequenceFile.Writer(fs, conf, path, Text.class,
                    VectorWritable.class);

            VectorWritable vec = new VectorWritable();
            for (NamedVector vector : vectors) {
                vec.set(vector);
                writer.append(new Text(vector.getName()), vec);
            }
            writer.close();

        } catch (Exception e) {
            System.out.println("ERROR: " + e);
        }

        Path input = new Path("testdata_seq/data");
        boolean runSequential = false;
        Path clustersOut = new Path("testdata_seq/clusters");
        Path clustersIn = new Path("testdata_seq/clusters/clusters-0-final");
        double convergenceDelta = 0;
        double clusterClassificationThreshold = 0;
        boolean runClustering = true;
        Path output = new Path("testdata_seq/output");
        int maxIterations = 12;
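        // Run Canopy to create the initial clusters, then refine them with KMeans.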
        CanopyDriver.run(conf, input, clustersOut, new EuclideanDistanceMeasure(),
                1, 1, 1, 1, 0, runClustering, clusterClassificationThreshold,
                runSequential);
        KMeansDriver.run(conf, input, clustersIn, output, new EuclideanDistanceMeasure(),
                convergenceDelta, maxIterations, runClustering,
                clusterClassificationThreshold, runSequential);

        reader = new SequenceFile.Reader(fs,
                new Path("testdata_seq/clusteredPoints/part-m-00000"), conf);

        IntWritable key = new IntWritable();
        WeightedVectorWritable value = new WeightedVectorWritable();
        while (reader.next(key, value)) {
          System.out.println(value.toString() + " belongs to cluster "
                             + key.toString());
        }
    }

}

Error Output:

.......
13/03/29 11:47:15 ERROR security.UserGroupInformation: PriviledgedActionException as:cyril cause:org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path does not exist: file:/home/cyril/workspace/Newer/testdata_seq/data
Exception in thread "main" org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path does not exist: file:/home/cyril/workspace/Newer/testdata_seq/data
    at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:235)
    at org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:55)
    at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:252)
    at org.apache.hadoop.mapred.JobClient.writeNewSplits(JobClient.java:962)
    at org.apache.hadoop.mapred.JobClient.writeSplits(JobClient.java:979)
    at org.apache.hadoop.mapred.JobClient.access$600(JobClient.java:174)
    at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:897)
    at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:850)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:416)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
    at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:850)
    at org.apache.hadoop.mapreduce.Job.submit(Job.java:500)
    at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:530)
    at org.apache.mahout.clustering.classify.ClusterClassificationDriver.classifyClusterMR(ClusterClassificationDriver.java:275)
    at org.apache.mahout.clustering.classify.ClusterClassificationDriver.run(ClusterClassificationDriver.java:135)
    at org.apache.mahout.clustering.canopy.CanopyDriver.clusterData(CanopyDriver.java:372)
    at org.apache.mahout.clustering.canopy.CanopyDriver.run(CanopyDriver.java:158)
    at DataFileWriter.main(DataFileWriter.java:85)




On another note: is there a command that would allow the program to overwrite
existing files in the filesystem? (I get errors if I don't delete the files
before running the program again.)
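
(A minimal sketch of one way to handle that, assuming the same conf as in the
program above: FileSystem.delete with recursive=true clears the old output
directories before the jobs are resubmitted. The directory list below just
mirrors the paths the program already uses.)

        // Remove the previous run's output so the jobs can be resubmitted cleanly.
        FileSystem fs = FileSystem.get(conf);
        for (String dir : new String[] { "testdata_seq/clusters",
                "testdata_seq/output", "testdata_seq/clusteredPoints" }) {
            Path old = new Path(dir);
            if (fs.exists(old)) {
                fs.delete(old, true); // recursive, like "hadoop fs -rmr"
            }
        }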

Thank you for a reply and I hope I have given all the necessary output. In
the meantime I will look into it.

Cyril

Re: FileSystem Error

Posted by Chris Harrington <ch...@heystaks.com>.

Re: FileSystem Error

Posted by Chris Harrington <ch...@heystaks.com>.
whoops sorry about the empty mail last time,

I have one last suggestion, though I'm not sure it'll work:

you could try putting the path names as hdfs://testdata_seq/clusters

apart from that I'm out of ideas
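
(If trying that, note that a fully qualified HDFS URI normally also carries the
NameNode host and port; the host, port and user directory below are placeholders,
not values from this thread:)

        Path clustersOut = new Path("hdfs://namenode:9000/user/cyril/testdata_seq/clusters");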

On 29 Mar 2013, at 17:05, Cyril Bogus wrote:

> Thank you again Chris.
> 
> Yes it is a typo.
> 
> After careful reading of the output, my program is exactly doing what you
> describe.
> I am trying to do everything in Hadoop fs but it is creating files on both
> hadoop fs and class fs and some files are missing. When I run AND copy the
> missing file from hadoop fs into the class file I get the proper output(no
> errors). And I also get the proper output when I do everything within the
> class file (by removing the property of conf).
> 
> But I am trying to automate everything to run on my three node cluster for
> testing within java. So I need to be able to do everything on Hadoop fs. I
> will look into setting up Mahout for a proper *conf *file.
> 
> - Cyril
> 
> 
> On Fri, Mar 29, 2013 at 12:34 PM, Chris Harrington <ch...@heystaks.com>wrote:
> 
>> Well then do all the various folders exist on the hadoop fs?
>> 
>> I also had a similar problem awhile ago where my program ran fine but then
>> I did something (no idea what) and hadoop started complaining. To fix it I
>> had to put everything on the hadoop fs. i.e. was move all <local fs path
>> to>/data to <hadoop fs path to>data
>> 
>> One more strange issue I ran into was where I had identically named
>> folders on both local and hdfs and it was looking in the wrong one.
>> 
>> I think that's all the causes I've run into, so if they're not the cause
>> then I'm out of ideas and hopefully someone else will be able to help.
>> 
>> also the missing colon is a typo right? hdfs//mylocation
>> 
>> On 29 Mar 2013, at 16:09, Cyril Bogus wrote:
>> 
>>> Thank you for the reply Chris,
>>> 
>>> I create and write fine on the file system. And the file is there when I
>>> check hadoop. So I do not think the problem is privileges. As I read it,
>>> the Canopy Driver is looking for the file under the Class file
>>> (/home/cyrille/DataWriter/src/testdata_seq/) instead of Hadoop's
>>> (/user/cyrille/) and the file is not there so it gives me the error that
>>> the file does not exists. But the file exists and was created fine "within
>>> the program with the same conf variable"
>>> 
>>> - Cyril
>>> 
>>> 
>>> On Fri, Mar 29, 2013 at 12:01 PM, Chris Harrington <chris@heystaks.com>wrote:
>>>
>>>>> security.UserGroupInformation:
>>>>> PriviledgedActionException as:cyril
>>>>
>>>> I'm not entirely sure but sounds like a permissions issue to me. check all
>>>> the files are owned by the user cyril and not root.
>>>> also did you start hadoop as root and run the program as cyril, hadoop
>>>> might also complain about that
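
(As a rough illustration of that ownership check from Java, assuming the fs and
the input path from the program above; FileStatus exposes the owner and group of
a file:)

        // Print who owns the input file on the filesystem the program actually used.
        FileStatus status = fs.getFileStatus(new Path("testdata_seq/data"));
        System.out.println(status.getOwner() + ":" + status.getGroup());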


Re: FileSystem Error

Posted by Dan Filimon <da...@gmail.com>.
Happy to help! :)


On Fri, Mar 29, 2013 at 9:38 PM, Cyril Bogus <cy...@gmail.com> wrote:

> THANK YOU SO MUCH DAN...
>
> It even solved another problem I was having with Sqoop who couldn't connect
> to the hdfs through Java Programming.

Re: FileSystem Error

Posted by Cyril Bogus <cy...@gmail.com>.
THANK YOU SO MUCH DAN...

It even solved another problem I was having with Sqoop, which couldn't connect
to HDFS through Java programming.


On Fri, Mar 29, 2013 at 3:30 PM, Dan Filimon <da...@gmail.com>wrote:

> Maybe this helps?
>
> http://www.opensourceconnections.com/2013/03/24/hdfs-debugging-wrong-fs-expected-file-exception/

Re: FileSystem Error

Posted by Dan Filimon <da...@gmail.com>.
Maybe this helps?
http://www.opensourceconnections.com/2013/03/24/hdfs-debugging-wrong-fs-expected-file-exception/
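
(A minimal sketch of one common fix for the "Wrong FS ... expected: file:///"
error quoted below, assuming a typical setup; the config file locations are
placeholders, not paths from this thread. Loading the cluster's own config files
gives fs.default.name its full hdfs://host:port form, so FileSystem.get(conf)
returns an HDFS filesystem instead of the local one:)

        // Point the Configuration at the cluster's config so FileSystem.get(conf)
        // resolves relative paths against HDFS rather than file:///.
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
        conf.addResource(new Path("/usr/local/hadoop/conf/hdfs-site.xml"));
        FileSystem fs = FileSystem.get(conf);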


On Fri, Mar 29, 2013 at 9:27 PM, Cyril Bogus <cy...@gmail.com> wrote:

> Kind of saw this coming since I felt like file:/// will be appended but
> here is the error I get if I do it
>
> ERROR: java.lang.IllegalArgumentException: Wrong FS:
> hdfs://super:54310/user/cyril/testdata_seq, expected: file:///
>
>
>
> On Fri, Mar 29, 2013 at 1:27 PM, Dan Filimon <dangeorge.filimon@gmail.com>wrote:
>
> > One thing that you could try is just using _absolute paths_ everywhere. So,
> > something on HDFS is hdfs://... whereas something on your local file system
> > is file://...
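
(For illustration, "absolute paths everywhere" would look roughly like the lines
below; the NameNode host and port are placeholders, the local path is the one
used earlier in the thread:)

        // Hypothetical fully qualified paths: one on HDFS, one on the local file system.
        Path hdfsInput = new Path("hdfs://namenode:9000/user/cyril/testdata_seq/data");
        Path localProps = new Path("file:///home/cyril/workspace/Newer/src/hadoop.properties");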
> > > > >>>               vec.set(vector);
> > > > >>>               writer.append(new Text(vector.getName()), vec);
> > > > >>>           }
> > > > >>>           writer.close();
> > > > >>>
> > > > >>>       } catch (Exception e) {
> > > > >>>           System.out.println("ERROR: " + e);
> > > > >>>       }
> > > > >>>
> > > > >>>       Path input = new Path("testdata_seq/data");
> > > > >>>       boolean runSequential = false;
> > > > >>>       Path clustersOut = new Path("testdata_seq/clusters");
> > > > >>>       Path clustersIn = new
> > > > >>> Path("testdata_seq/clusters/clusters-0-final");
> > > > >>>       double convergenceDelta = 0;
> > > > >>>       double clusterClassificationThreshold = 0;
> > > > >>>       boolean runClustering = true;
> > > > >>>       Path output = new Path("testdata_seq/output");
> > > > >>>       int maxIterations = 12;
> > > > >>>       CanopyDriver.run(conf, input, clustersOut, new
> > > > >>> EuclideanDistanceMeasure(), 1, 1, 1, 1, 0, runClustering,
> > > > >>> clusterClassificationThreshold, runSequential);
> > > > >>>       KMeansDriver.run(conf, input, clustersIn, output, new
> > > > >>> EuclideanDistanceMeasure(), convergenceDelta, maxIterations,
> > > > >> runClustering,
> > > > >>> clusterClassificationThreshold, runSequential);
> > > > >>>
> > > > >>>       reader = new SequenceFile.Reader(fs,
> > > > >>>               new
> > Path("testdata_seq/clusteredPoints/part-m-00000"),
> > > > >>> conf);
> > > > >>>
> > > > >>>       IntWritable key = new IntWritable();
> > > > >>>       WeightedVectorWritable value = new
> WeightedVectorWritable();
> > > > >>>       while (reader.next(key, value)) {
> > > > >>>         System.out.println(value.toString() + " belongs to
> cluster
> > "
> > > > >>>                            + key.toString());
> > > > >>>       }
> > > > >>>   }
> > > > >>>
> > > > >>> }
> > > > >>>
> > > > >>> Error Output:
> > > > >>>
> > > > >>> .......
> > > > >>> 13/03/29 11:47:15 ERROR security.UserGroupInformation:
> > > > >>> PriviledgedActionException as:cyril
> > > > >>>
> cause:org.apache.hadoop.mapreduce.lib.input.InvalidInputException:
> > > > Input
> > > > >>> path does not exist:
> > > file:/home/cyril/workspace/Newer/testdata_seq/data
> > > > >>> Exception in thread "main"
> > > > >>> org.apache.hadoop.mapreduce.lib.input.InvalidInputException:
> Input
> > > path
> > > > >>> does not exist:
> file:/home/cyril/workspace/Newer/testdata_seq/data
> > > > >>>   at
> > > > >>>
> > > > >>
> > > >
> > >
> >
> org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:235)
> > > > >>>   at
> > > > >>>
> > > > >>
> > > >
> > >
> >
> org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:55)
> > > > >>>   at
> > > > >>>
> > > > >>
> > > >
> > >
> >
> org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:252)
> > > > >>>   at
> > > > >>
> > org.apache.hadoop.mapred.JobClient.writeNewSplits(JobClient.java:962)
> > > > >>>   at
> > > org.apache.hadoop.mapred.JobClient.writeSplits(JobClient.java:979)
> > > > >>>   at
> > > org.apache.hadoop.mapred.JobClient.access$600(JobClient.java:174)
> > > > >>>   at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:897)
> > > > >>>   at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:850)
> > > > >>>   at java.security.AccessController.doPrivileged(Native Method)
> > > > >>>   at javax.security.auth.Subject.doAs(Subject.java:416)
> > > > >>>   at
> > > > >>>
> > > > >>
> > > >
> > >
> >
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
> > > > >>>   at
> > > > >>>
> > > >
> > org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:850)
> > > > >>>   at org.apache.hadoop.mapreduce.Job.submit(Job.java:500)
> > > > >>>   at
> > org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:530)
> > > > >>>   at
> > > > >>>
> > > > >>
> > > >
> > >
> >
> org.apache.mahout.clustering.classify.ClusterClassificationDriver.classifyClusterMR(ClusterClassificationDriver.java:275)
> > > > >>>   at
> > > > >>>
> > > > >>
> > > >
> > >
> >
> org.apache.mahout.clustering.classify.ClusterClassificationDriver.run(ClusterClassificationDriver.java:135)
> > > > >>>   at
> > > > >>>
> > > > >>
> > > >
> > >
> >
> org.apache.mahout.clustering.canopy.CanopyDriver.clusterData(CanopyDriver.java:372)
> > > > >>>   at
> > > > >>>
> > > > >>
> > > >
> > >
> >
> org.apache.mahout.clustering.canopy.CanopyDriver.run(CanopyDriver.java:158)
> > > > >>>   at DataFileWriter.main(DataFileWriter.java:85)
> > > > >>>
> > > > >>>
> > > > >>>
> > > > >>>
> > > > >>> On another note. Is there a command that would allow the program
> to
> > > > >>> overwrite existing files in the filesystem (I would get errors
> if I
> > > > don't
> > > > >>> delete the files before running the program again).
> > > > >>>
> > > > >>> Thank you for a reply and I hope I have given all the necessary
> > > output.
> > > > >> In
> > > > >>> the meantime I will look into it.
> > > > >>>
> > > > >>> Cyril
> > > > >>
> > > > >>
> > > >
> > > >
> > >
> >
>

Re: FileSystem Error

Posted by Cyril Bogus <cy...@gmail.com>.
Kind of saw this coming, since I felt like file:/// would be appended, but
here is the error I get if I do it:

ERROR: java.lang.IllegalArgumentException: Wrong FS:
hdfs://super:54310/user/cyril/testdata_seq, expected: file:///
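
For what it's worth, that exception usually means the Configuration never picked up an hdfs:// default filesystem, so relative paths fall back to file:///. A rough, untested sketch of two ways to check/fix that (the core-site.xml location below is only an example path):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DefaultFsCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Either point the conf at the cluster's real core-site.xml
        // (location here is just an example) ...
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        // ... or set the default filesystem explicitly, scheme included.
        // A value without the colon ("hdfs//...") seems to end up on the
        // local filesystem, which matches the behaviour seen in this thread.
        conf.set("fs.default.name", "hdfs://super:54310");

        // If this prints file:///, the drivers will resolve relative paths
        // under the local project directory, not under /user/cyril on HDFS.
        System.out.println("default FS: " + FileSystem.get(conf).getUri());
    }
}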




Re: FileSystem Error

Posted by Dan Filimon <da...@gmail.com>.
One thing that you could try is just using _absolute paths_ everywhere. So,
something on HDFS is hdfs://... whereas something on your local file system
is file://...
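
For example (untested; hostname and paths are just the ones mentioned earlier in this thread, adjust to your setup):

import org.apache.hadoop.fs.Path;

public class AbsolutePaths {
    public static void main(String[] args) {
        // Relative paths like "testdata_seq/data" resolve against whatever the
        // default filesystem happens to be; fully-qualified ones are unambiguous.
        Path hdfsInput  = new Path("hdfs://super:54310/user/cyril/testdata_seq/data");
        Path hdfsOutput = new Path("hdfs://super:54310/user/cyril/testdata_seq/output");
        Path localProps = new Path("file:///home/cyril/workspace/Newer/src/hadoop.properties");

        System.out.println(hdfsInput + " -> " + hdfsInput.toUri().getScheme());
        System.out.println(hdfsOutput + " -> " + hdfsOutput.toUri().getScheme());
        System.out.println(localProps + " -> " + localProps.toUri().getScheme());
    }
}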



Re: FileSystem Error

Posted by Cyril Bogus <cy...@gmail.com>.
Thank you again Chris.

Yes it is a typo.

After careful reading of the output, my program is doing exactly what you
describe.
I am trying to do everything on the Hadoop fs, but it is creating files on
both the Hadoop fs and the local (project) fs, and some files are missing.
When I run it AND copy the missing file from the Hadoop fs into the project
directory, I get the proper output (no errors). I also get the proper output
when I do everything within the project directory (by removing the property
from conf).

But I am trying to automate everything to run on my three-node cluster for
testing within Java, so I need to be able to do everything on the Hadoop fs.
I will look into setting up Mahout with a proper *conf* file.
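
For reference, a rough way to check which filesystem actually got the files before handing them to the drivers (untested sketch; namenode URI taken from the earlier error):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WhoHasTheFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://super:54310"); // example value

        Path data = new Path("testdata_seq/data");

        FileSystem hdfs  = FileSystem.get(conf);      // hdfs://super:54310
        FileSystem local = FileSystem.getLocal(conf); // file:///

        // If the writer and the drivers disagree on the filesystem, the file
        // shows up on one of these and the job fails looking on the other.
        System.out.println(hdfs.makeQualified(data)  + " exists: " + hdfs.exists(data));
        System.out.println(local.makeQualified(data) + " exists: " + local.exists(data));
    }
}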

- Cyril



Re: FileSystem Error

Posted by Chris Harrington <ch...@heystaks.com>.
Well then do all the various folders exist on the hadoop fs?

I also had a similar problem a while ago where my program ran fine, but then I did something (no idea what) and hadoop started complaining. To fix it I had to put everything on the hadoop fs, i.e. move everything from <local fs path to>/data to <hadoop fs path to>data.
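
Something along these lines should do that move from Java (untested sketch; the namenode URI and paths are just the ones mentioned earlier in this thread):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PushToHdfs {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://super:54310"); // example namenode

        FileSystem fs = FileSystem.get(conf);

        // Copy the locally generated sequence file up to HDFS so every
        // subsequent job reads it from the same place.
        Path local  = new Path("file:///home/cyril/workspace/Newer/testdata_seq/data");
        Path remote = new Path("/user/cyril/testdata_seq/data");
        fs.copyFromLocalFile(local, remote);

        System.out.println("copied to " + fs.makeQualified(remote));
    }
}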

One more strange issue I ran into was where I had identically named folders on both local and hdfs and it was looking in the wrong one.

I think that's all the causes I've run into, so if they're not the cause then I'm out of ideas and hopefully someone else will be able to help.

Also, the missing colon is a typo, right? hdfs//mylocation



Re: FileSystem Error

Posted by Cyril Bogus <cy...@gmail.com>.
Thank you for the reply Chris,

I create and write fine on the file system, and the file is there when I
check hadoop, so I do not think the problem is privileges. As I read it,
the CanopyDriver is looking for the file under the project directory
(/home/cyrille/DataWriter/src/testdata_seq/) instead of Hadoop's
(/user/cyrille/), and the file is not there, so it gives me the error that
the file does not exist. But the file exists and was created fine "within
the program with the same conf variable".
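
A rough way to see where a relative path will resolve with a given conf (untested sketch; namenode URI taken from the earlier error):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WhereDoesItResolve {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://super:54310"); // example value

        FileSystem fs = FileSystem.get(conf);

        // makeQualified shows exactly where a relative path ends up, e.g.
        // hdfs://super:54310/user/cyril/testdata_seq/data versus
        // file:/home/cyril/workspace/Newer/testdata_seq/data.
        Path data = new Path("testdata_seq/data");
        System.out.println("resolves to: " + fs.makeQualified(data));
        System.out.println("exists on that FS? " + fs.exists(data));
    }
}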

- Cyril



Re: FileSystem Error

Posted by Chris Harrington <ch...@heystaks.com>.
> security.UserGroupInformation:
> PriviledgedActionException as:cyril

I'm not entirely sure, but it sounds like a permissions issue to me. Check that all the files are owned by the user cyril and not root.
Also, did you start hadoop as root and run the program as cyril? Hadoop might complain about that as well.
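
A quick way to check the ownership from Java would be something like this (untested sketch; namenode URI and path are just the ones mentioned in this thread):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OwnerCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://super:54310"); // example namenode

        FileSystem fs = FileSystem.get(conf);
        FileStatus status = fs.getFileStatus(new Path("/user/cyril/testdata_seq/data"));

        // If this prints root (or anyone other than the user running the job),
        // the PriviledgedActionException is very likely a permissions problem.
        System.out.println(status.getOwner() + ":" + status.getGroup()
                + " " + status.getPermission());
    }
}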

On 29 Mar 2013, at 15:54, Cyril Bogus wrote:

> Hi,
> 
> I am running a small java program that basically write a small input data
> to the Hadoop FileSystem, run a Mahout Canopy and Kmeans Clustering and
> then output the content of the data.
> 
> In my hadoop.properties I have included the core-site.xml definition for
> the Java program to connect to my single node setup so that I will not use
> the Java Project file system but hadoop instead (Basically all write and
> read are done on hadoop and not in the class file).
> 
> When I run the program, as soon as the Canopy (even the KMeans),
> configuration tries to lookup for the file in the class path instead of the
> Hadoop FileSystem path where the proper files are located.
> 
> Is there a problem with the way I have my conf defined?
> 
> hadoop.properties:
> fs.default.name=hdfs//mylocation
> 
> Program:
> 
> public class DataFileWriter {
> 
>    private static Properties props = new Properties();
>    private static Configuration conf = new Configuration();
> 
>    /**
>     * @param args
>     * @throws ClassNotFoundException
>     * @throws InterruptedException
>     * @throws IOException
>     */
>    public static void main(String[] args) throws IOException,
>            InterruptedException, ClassNotFoundException {
> 
>        props.load(new FileReader(new File(
>                "/home/cyril/workspace/Newer/src/hadoop.properties")));
> 
>        // TODO Auto-generated method stub
>        FileSystem fs = null;
>        SequenceFile.Writer writer;
>        SequenceFile.Reader reader;
> 
>        conf.set("fs.default.name", props.getProperty("fs.default.name"));
> 
>        List<NamedVector> vectors = new LinkedList<NamedVector>();
>        NamedVector v1 = new NamedVector(new DenseVector(new double[] { 0.1,
>                0.2, 0.5 }), "Hello");
>        vectors.add(v1);
>        v1 = new NamedVector(new DenseVector(new double[] { 0.5, 0.1, 0.2
> }),
>                "Bored");
>        vectors.add(v1);
>        v1 = new NamedVector(new DenseVector(new double[] { 0.2, 0.5, 0.1
> }),
>                "Done");
>        vectors.add(v1);
>        // Write the data to SequenceFile
>        try {
>            fs = FileSystem.get(conf);
> 
>            Path path = new Path("testdata_seq/data");
>            writer = new SequenceFile.Writer(fs, conf, path, Text.class,
>                    VectorWritable.class);
> 
>            VectorWritable vec = new VectorWritable();
>            for (NamedVector vector : vectors) {
>                vec.set(vector);
>                writer.append(new Text(vector.getName()), vec);
>            }
>            writer.close();
> 
>        } catch (Exception e) {
>            System.out.println("ERROR: " + e);
>        }
> 
>        Path input = new Path("testdata_seq/data");
>        boolean runSequential = false;
>        Path clustersOut = new Path("testdata_seq/clusters");
>        Path clustersIn = new
> Path("testdata_seq/clusters/clusters-0-final");
>        double convergenceDelta = 0;
>        double clusterClassificationThreshold = 0;
>        boolean runClustering = true;
>        Path output = new Path("testdata_seq/output");
>        int maxIterations = 12;
>        CanopyDriver.run(conf, input, clustersOut, new
> EuclideanDistanceMeasure(), 1, 1, 1, 1, 0, runClustering,
> clusterClassificationThreshold, runSequential);
>        KMeansDriver.run(conf, input, clustersIn, output, new
> EuclideanDistanceMeasure(), convergenceDelta, maxIterations, runClustering,
> clusterClassificationThreshold, runSequential);
> 
>        reader = new SequenceFile.Reader(fs,
>                new Path("testdata_seq/clusteredPoints/part-m-00000"),
> conf);
> 
>        IntWritable key = new IntWritable();
>        WeightedVectorWritable value = new WeightedVectorWritable();
>        while (reader.next(key, value)) {
>          System.out.println(value.toString() + " belongs to cluster "
>                             + key.toString());
>        }
>    }
> 
> }
> 
> Error Output:
> 
> .......
> 13/03/29 11:47:15 ERROR security.UserGroupInformation:
> PriviledgedActionException as:cyril
> cause:org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input
> path does not exist: file:/home/cyril/workspace/Newer/testdata_seq/data
> Exception in thread "main"
> org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path
> does not exist: file:/home/cyril/workspace/Newer/testdata_seq/data
>    at
> org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:235)
>    at
> org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:55)
>    at
> org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:252)
>    at org.apache.hadoop.mapred.JobClient.writeNewSplits(JobClient.java:962)
>    at org.apache.hadoop.mapred.JobClient.writeSplits(JobClient.java:979)
>    at org.apache.hadoop.mapred.JobClient.access$600(JobClient.java:174)
>    at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:897)
>    at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:850)
>    at java.security.AccessController.doPrivileged(Native Method)
>    at javax.security.auth.Subject.doAs(Subject.java:416)
>    at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
>    at
> org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:850)
>    at org.apache.hadoop.mapreduce.Job.submit(Job.java:500)
>    at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:530)
>    at
> org.apache.mahout.clustering.classify.ClusterClassificationDriver.classifyClusterMR(ClusterClassificationDriver.java:275)
>    at
> org.apache.mahout.clustering.classify.ClusterClassificationDriver.run(ClusterClassificationDriver.java:135)
>    at
> org.apache.mahout.clustering.canopy.CanopyDriver.clusterData(CanopyDriver.java:372)
>    at
> org.apache.mahout.clustering.canopy.CanopyDriver.run(CanopyDriver.java:158)
>    at DataFileWriter.main(DataFileWriter.java:85)
> 
> 
> 
> 
> On another note. Is there a command that would allow the program to
> overwrite existing files in the filesystem (I would get errors if I don't
> delete the files before running the program again).
> 
> Thank you for a reply and I hope I have given all the necessary output. In
> the meantime I will look into it.
> 
> Cyril


Re: FileSystem Error

Posted by Azuryy Yu <az...@gmail.com>.
Use hadoop jar instead of java -jar.

The hadoop script sets a proper classpath for you.
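If you do have to launch it with plain java, one workaround is to hand the cluster config to the Configuration yourself. This is just a sketch, assuming a typical single-node install; the core-site.xml path below is a guess you would need to adjust:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ConfCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed location of the single-node core-site.xml; adjust for your setup.
        conf.addResource(new Path("/home/cyril/hadoop/conf/core-site.xml"));
        FileSystem fs = FileSystem.get(conf);
        // Should print an hdfs:// URI instead of file:/// once the config is picked up.
        System.out.println("Default FileSystem: " + fs.getUri());
    }
}

With hadoop jar you get this for free, because the conf directory is already on the classpath when Configuration loads its defaults.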
On Mar 29, 2013 11:55 PM, "Cyril Bogus" <cy...@gmail.com> wrote:

> Hi,
>
> I am running a small java program that basically write a small input data
> to the Hadoop FileSystem, run a Mahout Canopy and Kmeans Clustering and
> then output the content of the data.
>
> In my hadoop.properties I have included the core-site.xml definition for
> the Java program to connect to my single node setup so that I will not use
> the Java Project file system but hadoop instead (Basically all write and
> read are done on hadoop and not in the class file).
>
> When I run the program, as soon as the Canopy (even the KMeans),
> configuration tries to lookup for the file in the class path instead of the
> Hadoop FileSystem path where the proper files are located.
>
> Is there a problem with the way I have my conf defined?
>
> hadoop.properties:
> fs.default.name=hdfs//mylocation
>
> Program:
>
> public class DataFileWriter {
>
>     private static Properties props = new Properties();
>     private static Configuration conf = new Configuration();
>
>     /**
>      * @param args
>      * @throws ClassNotFoundException
>      * @throws InterruptedException
>      * @throws IOException
>      */
>     public static void main(String[] args) throws IOException,
>             InterruptedException, ClassNotFoundException {
>
>         props.load(new FileReader(new File(
>                 "/home/cyril/workspace/Newer/src/hadoop.properties")));
>
>         // TODO Auto-generated method stub
>         FileSystem fs = null;
>         SequenceFile.Writer writer;
>         SequenceFile.Reader reader;
>
>         conf.set("fs.default.name", props.getProperty("fs.default.name"));
>
>         List<NamedVector> vectors = new LinkedList<NamedVector>();
>         NamedVector v1 = new NamedVector(new DenseVector(new double[] {
> 0.1,
>                 0.2, 0.5 }), "Hello");
>         vectors.add(v1);
>         v1 = new NamedVector(new DenseVector(new double[] { 0.5, 0.1, 0.2
> }),
>                 "Bored");
>         vectors.add(v1);
>         v1 = new NamedVector(new DenseVector(new double[] { 0.2, 0.5, 0.1
> }),
>                 "Done");
>         vectors.add(v1);
>         // Write the data to SequenceFile
>         try {
>             fs = FileSystem.get(conf);
>
>             Path path = new Path("testdata_seq/data");
>             writer = new SequenceFile.Writer(fs, conf, path, Text.class,
>                     VectorWritable.class);
>
>             VectorWritable vec = new VectorWritable();
>             for (NamedVector vector : vectors) {
>                 vec.set(vector);
>                 writer.append(new Text(vector.getName()), vec);
>             }
>             writer.close();
>
>         } catch (Exception e) {
>             System.out.println("ERROR: " + e);
>         }
>
>         Path input = new Path("testdata_seq/data");
>         boolean runSequential = false;
>         Path clustersOut = new Path("testdata_seq/clusters");
>         Path clustersIn = new
> Path("testdata_seq/clusters/clusters-0-final");
>         double convergenceDelta = 0;
>         double clusterClassificationThreshold = 0;
>         boolean runClustering = true;
>         Path output = new Path("testdata_seq/output");
>         int maxIterations = 12;
>         CanopyDriver.run(conf, input, clustersOut, new
> EuclideanDistanceMeasure(), 1, 1, 1, 1, 0, runClustering,
> clusterClassificationThreshold, runSequential);
>         KMeansDriver.run(conf, input, clustersIn, output, new
> EuclideanDistanceMeasure(), convergenceDelta, maxIterations, runClustering,
> clusterClassificationThreshold, runSequential);
>
>         reader = new SequenceFile.Reader(fs,
>                 new Path("testdata_seq/clusteredPoints/part-m-00000"),
> conf);
>
>         IntWritable key = new IntWritable();
>         WeightedVectorWritable value = new WeightedVectorWritable();
>         while (reader.next(key, value)) {
>           System.out.println(value.toString() + " belongs to cluster "
>                              + key.toString());
>         }
>     }
>
> }
>
> Error Output:
>
> .......
> 13/03/29 11:47:15 ERROR security.UserGroupInformation:
> PriviledgedActionException as:cyril
> cause:org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input
> path does not exist: file:/home/cyril/workspace/Newer/testdata_seq/data
> Exception in thread "main"
> org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path
> does not exist: file:/home/cyril/workspace/Newer/testdata_seq/data
>     at
> org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:235)
>     at
> org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:55)
>     at
> org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:252)
>     at
> org.apache.hadoop.mapred.JobClient.writeNewSplits(JobClient.java:962)
>     at org.apache.hadoop.mapred.JobClient.writeSplits(JobClient.java:979)
>     at org.apache.hadoop.mapred.JobClient.access$600(JobClient.java:174)
>     at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:897)
>     at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:850)
>     at java.security.AccessController.doPrivileged(Native Method)
>     at javax.security.auth.Subject.doAs(Subject.java:416)
>     at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
>     at
> org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:850)
>     at org.apache.hadoop.mapreduce.Job.submit(Job.java:500)
>     at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:530)
>     at
> org.apache.mahout.clustering.classify.ClusterClassificationDriver.classifyClusterMR(ClusterClassificationDriver.java:275)
>     at
> org.apache.mahout.clustering.classify.ClusterClassificationDriver.run(ClusterClassificationDriver.java:135)
>     at
> org.apache.mahout.clustering.canopy.CanopyDriver.clusterData(CanopyDriver.java:372)
>     at
> org.apache.mahout.clustering.canopy.CanopyDriver.run(CanopyDriver.java:158)
>     at DataFileWriter.main(DataFileWriter.java:85)
>
>
>
>
> On another note. Is there a command that would allow the program to
> overwrite existing files in the filesystem (I would get errors if I don't
> delete the files before running the program again).
>
> Thank you for a reply and I hope I have given all the necessary output. In
> the meantime I will look into it.
>
> Cyril
>
