You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/01/07 17:48:47 UTC

svn commit: r896922 [3/3] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/common/ core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ core/src/main/java/org/apache/mahout/fpm/pfpgrowth/convertors/ core/src/main/java/org/apache/mahout/fp...

Added: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleGrouper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleGrouper.java?rev=896922&view=auto
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleGrouper.java (added)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleGrouper.java Thu Jan  7 16:45:37 2010
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.example.dataset;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringTuple;
+
+public class KeyBasedStringTupleGrouper {
+
+
+  public void startJob(Parameters params) throws IOException,
+      InterruptedException, ClassNotFoundException {
+    Configuration conf = new Configuration();
+    
+    conf.set("job.parameters", params.toString());
+    conf.set("mapred.compress.map.output", "true");
+    conf.set("mapred.output.compression.type", "BLOCK");
+    conf.set("mapred.map.output.compression.codec",
+        "org.apache.hadoop.io.compress.GzipCodec");
+    conf.set("io.serializations",
+        "org.apache.hadoop.io.serializer.JavaSerialization,"
+            + "org.apache.hadoop.io.serializer.WritableSerialization");
+   
+    String input = params.get("input");
+    Job job = new Job(conf, "Generating dataset based from input" + input);
+    job.setJarByClass(KeyBasedStringTupleGrouper.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(StringTuple.class);
+    
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Text.class);
+
+    FileInputFormat.addInputPath(job, new Path(input));
+    Path outPath = new Path(params.get("output"));
+    FileOutputFormat.setOutputPath(job, outPath);
+    
+    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
+    if (dfs.exists(outPath)) {
+      dfs.delete(outPath, true);
+    }
+
+    job.setInputFormatClass(TextInputFormat.class);
+    job.setMapperClass(KeyBasedStringTupleMapper.class);
+    job.setCombinerClass(KeyBasedStringTupleCombiner.class);
+    job.setReducerClass(KeyBasedStringTupleReducer.class);
+    job.setOutputFormatClass(TextOutputFormat.class);
+
+    job.waitForCompletion(true);
+  }
+}

Added: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleMapper.java?rev=896922&view=auto
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleMapper.java (added)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/example/dataset/KeyBasedStringTupleMapper.java Thu Jan  7 16:45:37 2010
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.example.dataset;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringTuple;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Mapper that splits each input line into fields and emits a key built from
+ * the configured grouping fields together with a {@link StringTuple} holding
+ * the configured selected fields.
+ *
+ * <p>Configuration is read in {@link #setup} from the {@code "job.parameters"}
+ * entry of the job configuration.
+ */
+public class KeyBasedStringTupleMapper extends Mapper<LongWritable, Text, Text, StringTuple> {
+
+  private static final Logger log = LoggerFactory.getLogger(KeyBasedStringTupleMapper.class);
+
+  /** Pattern used to split an input line into fields; set in {@link #setup}. */
+  private Pattern splitter;
+
+  /** Indices of the fields copied into the output value; set in {@link #setup}. */
+  private int[] selectedFields;
+
+  /** Indices of the fields that make up the output key; set in {@link #setup}. */
+  private int[] groupingFields;
+
+  /**
+   * Splits one input line and writes a (grouping key, selected fields) pair.
+   *
+   * <p>Lines that do not split into exactly 4 fields are logged, counted under
+   * the {@code Map/ERROR} counter and skipped.
+   *
+   * @param key input key supplied by the input format (not used)
+   * @param value the input line
+   * @param context task context used for counters, status and output
+   */
+  @Override
+  protected void map(LongWritable key, Text value, Context context) throws IOException,
+      InterruptedException {
+    String line = value.toString();
+    String[] fields = splitter.split(line);
+    if (fields.length != 4) {
+      log.info("{} {}", fields.length, line);
+      context.getCounter("Map", "ERROR").increment(1);
+      return;
+    }
+
+    List<String> oKey = new ArrayList<String>();
+    for (int groupingField : groupingFields) {
+      String keyPart = fields[groupingField];
+      oKey.add(keyPart);
+      context.setStatus(keyPart);
+    }
+
+    List<String> oValue = new ArrayList<String>();
+    for (int selectedField : selectedFields) {
+      oValue.add(fields[selectedField]);
+    }
+
+    // The key is the List#toString() rendering of the grouping field values.
+    context.write(new Text(oKey.toString()), new StringTuple(oValue));
+  }
+
+  /**
+   * Reads the split pattern and the field index lists from the job parameters.
+   *
+   * <p>Parameters used: {@code splitPattern}; {@code selectedFieldCount} with
+   * {@code field0..fieldN-1}; {@code groupingFieldCount} with
+   * {@code gfield0..gfieldN-1}. Missing counts and indices default to {@code 0}.
+   *
+   * @param context task context providing the job configuration
+   */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Parameters params = Parameters.fromString(context.getConfiguration().get("job.parameters", ""));
+    splitter = Pattern.compile(params.get("splitPattern", "[ \t]*\t[ \t]*"));
+
+    int selectedFieldCount = Integer.parseInt(params.get("selectedFieldCount", "0"));
+    selectedFields = new int[selectedFieldCount];
+    for (int i = 0; i < selectedFieldCount; i++) {
+      selectedFields[i] = Integer.parseInt(params.get("field" + i, "0"));
+    }
+
+    int groupingFieldCount = Integer.parseInt(params.get("groupingFieldCount", "0"));
+    groupingFields = new int[groupingFieldCount];
+    for (int i = 0; i < groupingFieldCount; i++) {
+      groupingFields[i] = Integer.parseInt(params.get("gfield" + i, "0"));
+    }
+  }
+}