You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/27 13:14:31 UTC
[05/24] mahout git commit: MAHOUT-2034 Split MR and New Examples into
seperate modules
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
deleted file mode 100644
index 43beb78..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.fuzzykmeans;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Job extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(Job.class);
-
- private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
-
- private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION;
-
- private Job() {
- }
-
- public static void main(String[] args) throws Exception {
- if (args.length > 0) {
- log.info("Running with only user-supplied arguments");
- ToolRunner.run(new Configuration(), new Job(), args);
- } else {
- log.info("Running with default arguments");
- Path output = new Path("output");
- Configuration conf = new Configuration();
- HadoopUtil.delete(conf, output);
- run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 2.0f, 0.5);
- }
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.distanceMeasureOption().create());
- addOption(DefaultOptionCreator.convergenceOption().create());
- addOption(DefaultOptionCreator.maxIterationsOption().create());
- addOption(DefaultOptionCreator.overwriteOption().create());
- addOption(DefaultOptionCreator.t1Option().create());
- addOption(DefaultOptionCreator.t2Option().create());
- addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true);
-
- Map<String,List<String>> argMap = parseArguments(args);
- if (argMap == null) {
- return -1;
- }
-
- Path input = getInputPath();
- Path output = getOutputPath();
- String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
- if (measureClass == null) {
- measureClass = SquaredEuclideanDistanceMeasure.class.getName();
- }
- double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
- int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
- float fuzziness = Float.parseFloat(getOption(M_OPTION));
-
- addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true)
- .withArgument(new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create())
- .withDescription("coefficient normalization factor, must be greater than 1").withShortName(M_OPTION).create());
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(getConf(), output);
- }
- DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
- double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
- double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
- run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, convergenceDelta);
- return 0;
- }
-
- /**
- * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
- * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
- * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
- * containing synthetic_control.data as obtained from
- * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
- * and writes output to a directory named "output".
- *
- * @param input
- * the String denoting the input directory path
- * @param output
- * the String denoting the output directory path
- * @param t1
- * the canopy T1 threshold
- * @param t2
- * the canopy T2 threshold
- * @param maxIterations
- * the int maximum number of iterations
- * @param fuzziness
- * the float "m" fuzziness coefficient
- * @param convergenceDelta
- * the double convergence criteria for iterations
- */
- public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
- int maxIterations, float fuzziness, double convergenceDelta) throws Exception {
- Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
- log.info("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
- log.info("Running Canopy to get initial clusters");
- Path canopyOutput = new Path(output, "canopies");
- CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false);
- log.info("Running FuzzyKMeans");
- FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(canopyOutput, "clusters-0-final"), output,
- convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false);
- // run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints"));
- clusterDumper.printClusters(null);
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
deleted file mode 100644
index 70c41fe..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
+++ /dev/null
@@ -1,187 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.kmeans;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Job extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(Job.class);
-
- private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
-
- private Job() {
- }
-
- public static void main(String[] args) throws Exception {
- if (args.length > 0) {
- log.info("Running with only user-supplied arguments");
- ToolRunner.run(new Configuration(), new Job(), args);
- } else {
- log.info("Running with default arguments");
- Path output = new Path("output");
- Configuration conf = new Configuration();
- HadoopUtil.delete(conf, output);
- run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
- }
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.distanceMeasureOption().create());
- addOption(DefaultOptionCreator.numClustersOption().create());
- addOption(DefaultOptionCreator.t1Option().create());
- addOption(DefaultOptionCreator.t2Option().create());
- addOption(DefaultOptionCreator.convergenceOption().create());
- addOption(DefaultOptionCreator.maxIterationsOption().create());
- addOption(DefaultOptionCreator.overwriteOption().create());
-
- Map<String,List<String>> argMap = parseArguments(args);
- if (argMap == null) {
- return -1;
- }
-
- Path input = getInputPath();
- Path output = getOutputPath();
- String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
- if (measureClass == null) {
- measureClass = SquaredEuclideanDistanceMeasure.class.getName();
- }
- double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
- int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(getConf(), output);
- }
- DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
- if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
- int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
- run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
- } else {
- double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
- double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
- run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
- }
- return 0;
- }
-
- /**
- * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration
- * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
- * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
- * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a
- * directory named "output".
- *
- * @param conf
- * the Configuration to use
- * @param input
- * the String denoting the input directory path
- * @param output
- * the String denoting the output directory path
- * @param measure
- * the DistanceMeasure to use
- * @param k
- * the number of clusters in Kmeans
- * @param convergenceDelta
- * the double convergence criteria for iterations
- * @param maxIterations
- * the int maximum number of iterations
- */
- public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
- double convergenceDelta, int maxIterations) throws Exception {
- Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
- log.info("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
- log.info("Running random seed to get initial clusters");
- Path clusters = new Path(output, "random-seeds");
- clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
- log.info("Running KMeans with k = {}", k);
- KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta,
- maxIterations, true, 0.0, false);
- // run ClusterDumper
- Path outGlob = new Path(output, "clusters-*-final");
- Path clusteredPoints = new Path(output,"clusteredPoints");
- log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
- ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
- clusterDumper.printClusters(null);
- }
-
- /**
- * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
- * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
- * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
- * containing synthetic_control.data as obtained from
- * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
- * and writes output to a directory named "output".
- *
- * @param conf
- * the Configuration to use
- * @param input
- * the String denoting the input directory path
- * @param output
- * the String denoting the output directory path
- * @param measure
- * the DistanceMeasure to use
- * @param t1
- * the canopy T1 threshold
- * @param t2
- * the canopy T2 threshold
- * @param convergenceDelta
- * the double convergence criteria for iterations
- * @param maxIterations
- * the int maximum number of iterations
- */
- public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
- double convergenceDelta, int maxIterations) throws Exception {
- Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
- log.info("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
- log.info("Running Canopy to get initial clusters");
- Path canopyOutput = new Path(output, "canopies");
- CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0,
- false);
- log.info("Running KMeans");
- KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR
- + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false);
- // run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output,
- "clusteredPoints"));
- clusterDumper.printClusters(null);
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
deleted file mode 100644
index 92363e5..0000000
--- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth;
-
-import java.io.IOException;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.Parameters;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper;
-
-public final class DeliciousTagsExample {
- private DeliciousTagsExample() { }
-
- public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
- Option inputDirOpt = DefaultOptionCreator.inputOption().create();
-
- Option outputOpt = DefaultOptionCreator.outputOption().create();
-
- Option helpOpt = DefaultOptionCreator.helpOption();
- Option recordSplitterOpt = obuilder.withLongName("splitterPattern").withArgument(
- abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()).withDescription(
- "Regular Expression pattern used to split given line into fields."
- + " Default value splits comma or tab separated fields."
- + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ").withShortName("regex").create();
- Option encodingOpt = obuilder.withLongName("encoding").withArgument(
- abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()).withDescription(
- "(Optional) The file encoding. Default value: UTF-8").withShortName("e").create();
- Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(outputOpt).withOption(
- helpOpt).withOption(recordSplitterOpt).withOption(encodingOpt).create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
- Parameters params = new Parameters();
- if (cmdLine.hasOption(recordSplitterOpt)) {
- params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt));
- }
-
- String encoding = "UTF-8";
- if (cmdLine.hasOption(encodingOpt)) {
- encoding = (String) cmdLine.getValue(encodingOpt);
- }
- params.set("encoding", encoding);
- String inputDir = (String) cmdLine.getValue(inputDirOpt);
- String outputDir = (String) cmdLine.getValue(outputOpt);
- params.set("input", inputDir);
- params.set("output", outputDir);
- params.set("groupingFieldCount", "2");
- params.set("gfield0", "1");
- params.set("gfield1", "2");
- params.set("selectedFieldCount", "1");
- params.set("field0", "3");
- params.set("maxTransactionLength", "100");
- KeyBasedStringTupleGrouper.startJob(params);
-
- } catch (OptionException ex) {
- CommandLineUtil.printHelp(group);
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
deleted file mode 100644
index 4c80a31..0000000
--- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth.dataset;
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.StringTuple;
-
-public class KeyBasedStringTupleCombiner extends Reducer<Text,StringTuple,Text,StringTuple> {
-
- @Override
- protected void reduce(Text key,
- Iterable<StringTuple> values,
- Context context) throws IOException, InterruptedException {
- Set<String> outputValues = new HashSet<>();
- for (StringTuple value : values) {
- outputValues.addAll(value.getEntries());
- }
- context.write(key, new StringTuple(outputValues));
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
deleted file mode 100644
index cd17770..0000000
--- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth.dataset;
-
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Parameters;
-import org.apache.mahout.common.StringTuple;
-
-public final class KeyBasedStringTupleGrouper {
-
- private KeyBasedStringTupleGrouper() { }
-
- public static void startJob(Parameters params) throws IOException,
- InterruptedException,
- ClassNotFoundException {
- Configuration conf = new Configuration();
-
- conf.set("job.parameters", params.toString());
- conf.set("mapred.compress.map.output", "true");
- conf.set("mapred.output.compression.type", "BLOCK");
- conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
- conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
- + "org.apache.hadoop.io.serializer.WritableSerialization");
-
- String input = params.get("input");
- Job job = new Job(conf, "Generating dataset based from input" + input);
- job.setJarByClass(KeyBasedStringTupleGrouper.class);
-
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(StringTuple.class);
-
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
-
- FileInputFormat.addInputPath(job, new Path(input));
- Path outPath = new Path(params.get("output"));
- FileOutputFormat.setOutputPath(job, outPath);
-
- HadoopUtil.delete(conf, outPath);
-
- job.setInputFormatClass(TextInputFormat.class);
- job.setMapperClass(KeyBasedStringTupleMapper.class);
- job.setCombinerClass(KeyBasedStringTupleCombiner.class);
- job.setReducerClass(KeyBasedStringTupleReducer.class);
- job.setOutputFormatClass(TextOutputFormat.class);
-
- boolean succeeded = job.waitForCompletion(true);
- if (!succeeded) {
- throw new IllegalStateException("Job failed!");
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
deleted file mode 100644
index 362d1ce..0000000
--- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth.dataset;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.common.Parameters;
-import org.apache.mahout.common.StringTuple;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Splits the line using a {@link Pattern} and outputs key as given by the groupingFields
- *
- */
-public class KeyBasedStringTupleMapper extends Mapper<LongWritable,Text,Text,StringTuple> {
-
- private static final Logger log = LoggerFactory.getLogger(KeyBasedStringTupleMapper.class);
-
- private Pattern splitter;
-
- private int[] selectedFields;
-
- private int[] groupingFields;
-
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- String[] fields = splitter.split(value.toString());
- if (fields.length != 4) {
- log.info("{} {}", fields.length, value.toString());
- context.getCounter("Map", "ERROR").increment(1);
- return;
- }
- Collection<String> oKey = new ArrayList<>();
- for (int groupingField : groupingFields) {
- oKey.add(fields[groupingField]);
- context.setStatus(fields[groupingField]);
- }
-
- List<String> oValue = new ArrayList<>();
- for (int selectedField : selectedFields) {
- oValue.add(fields[selectedField]);
- }
-
- context.write(new Text(oKey.toString()), new StringTuple(oValue));
-
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
- splitter = Pattern.compile(params.get("splitPattern", "[ \t]*\t[ \t]*"));
-
- int selectedFieldCount = Integer.valueOf(params.get("selectedFieldCount", "0"));
- selectedFields = new int[selectedFieldCount];
- for (int i = 0; i < selectedFieldCount; i++) {
- selectedFields[i] = Integer.valueOf(params.get("field" + i, "0"));
- }
-
- int groupingFieldCount = Integer.valueOf(params.get("groupingFieldCount", "0"));
- groupingFields = new int[groupingFieldCount];
- for (int i = 0; i < groupingFieldCount; i++) {
- groupingFields[i] = Integer.valueOf(params.get("gfield" + i, "0"));
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java b/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
deleted file mode 100644
index a7ef762..0000000
--- a/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth.dataset;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.HashSet;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.Parameters;
-import org.apache.mahout.common.StringTuple;
-
-public class KeyBasedStringTupleReducer extends Reducer<Text,StringTuple,Text,Text> {
-
- private int maxTransactionLength = 100;
-
- @Override
- protected void reduce(Text key, Iterable<StringTuple> values, Context context)
- throws IOException, InterruptedException {
- Collection<String> items = new HashSet<>();
-
- for (StringTuple value : values) {
- for (String field : value.getEntries()) {
- items.add(field);
- }
- }
- if (items.size() > 1) {
- int i = 0;
- StringBuilder sb = new StringBuilder();
- String sep = "";
- for (String field : items) {
- if (i % maxTransactionLength == 0) {
- if (i != 0) {
- context.write(null, new Text(sb.toString()));
- }
- sb.replace(0, sb.length(), "");
- sep = "";
- }
-
- sb.append(sep).append(field);
- sep = "\t";
-
- i++;
-
- }
- if (sb.length() > 0) {
- context.write(null, new Text(sb.toString()));
- }
- }
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
- maxTransactionLength = Integer.valueOf(params.get("maxTransactionLength", "100"));
- }
-}